## To install ipeds: devtools::install_github('jbryer/ipeds')E
## To install unzip: install.packages("utils")
library(ipeds)
library(tidyverse)
library(dplyr)
library(tidycensus)
library(knitr)
library(RODBC)
library(questionr)
library(tibble)
###Be sure to enter cencus API key ###census_api_key(enter key here)
###Get Access file path IPEDSDatabase <- odbcDriverConnect(“Driver={Microsoft Access Driver (.mdb, .accdb)};DBQ=FILE PATH HERE”)
institutioninformation <- sqlFetch(IPEDSDatabase, "HD2021")
instituioninformation<- subset(institutioninformation, CONTROL == 1)%>%
subset(C21SZSET %in% c(1, 2, 3, 4, 5))%>%
select(UNITID, INSTNM, COUNTYCD, C21SZSET)
graduationinformation <- sqlFetch(IPEDSDatabase, "GR200_21")
graduationinformation <- graduationinformation %>%
select(UNITID, L4GR100, L4GR150, L4GR200)
transferinformation<- sqlFetch(IPEDSDatabase, "DRVGR2021")
transferinformation<- transferinformation %>%
select(UNITID, TRRTTOT)
retentioninformation <- sqlFetch(IPEDSDatabase, "EF2021D")
retentioninformation <- retentioninformation %>%
select(UNITID, RET_PCF, RET_PCP)
ipeds<- left_join(instituioninformation, graduationinformation, by = "UNITID")
ipeds<- left_join(ipeds, transferinformation, by = "UNITID")
ipeds<- left_join(ipeds, retentioninformation)
ipeds<- na.omit(ipeds)
ipeds<- ipeds%>%
mutate(gradtransf = .01*(TRRTTOT + L4GR150))%>%
mutate(RET_PCF = .01*RET_PCF)%>%
mutate(RET_PCP = .01*RET_PCP)
ipeds$COUNTYCD<-paste0("0",as.character(ipeds$COUNTYCD))
write.csv(ipeds, "ipeds.csv")
#Capture the aligned cesus data and export to an excel file
key <- load_variables(2020, "acs5", cache = TRUE)
write.csv(key, "censuskey.csv")
years<- lst(2015, 2020)
my_vars<- c(total_pop = "B01003_001")
population<- map_dfr(years, ~ get_acs(geography = "county", variables = my_vars, year = .x, survey = "acs5", geometry = FALSE), .id = "year" # when combining results, add id var (name of list item)
) %>%
select(-moe) %>% # shhhh
arrange(variable, NAME)
population<- select(population, GEOID, year, estimate)
population<- population %>%
spread(year, estimate)
population<- population %>%
mutate("popchange" = population$'2020'-population$'2015')
population<- population %>%
mutate("percentpopchange" = popchange/population$'2015')
population<- select(population, GEOID, popchange, percentpopchange)
#Estimated median household income in past 12 months, inflation adjusted 2020
income <- get_acs( geography = "county", variables = c(MedIncome="B19013_001"))
income<- rename.variable(income, "estimate","med_income")
income<- select(income, GEOID, med_income)
#Total in Population
HealthPro<-get_acs(geography = "county", variables = c(White="C27001A_005", Black="C27001B_005", Native="C27001C_005", Asian = "C27001D_005", Pacific="C27001E_005", Other= "C27001F_005", TwoMore="C27001G_005", WhiteNot="C27001H_005", Hispanic="C27001I_005"))
HealthPro<- HealthPro %>%
select(GEOID, NAME, variable, estimate)
HealthPro<- HealthPro %>%
spread(variable, estimate)
HealthPro<- HealthPro %>%
replace_na(list(White= 0, Black=0, Native=0, Asian=0, Pacific=0, Other=0, TwoMore=0, WhiteNot=0, Hispanic=0))
HealthPro<- HealthPro %>%
mutate("HPro" = White + Black + Native + Asian + Pacific + Other + TwoMore + WhiteNot + Hispanic)
#Total with health
HealthTot<-get_acs(geography = "county", variables = c(Whitet="C27001A_007", Blackt="C27001B_007", Nativet="C27001C_007", Asiant = "C27001D_007", Pacifict="C27001E_007", Othert= "C27001F_007", TwoMoret="C27001G_007", WhiteNott="C27001H_007", Hispanict="C27001I_007"))
HealthTot<- HealthTot %>%
select(GEOID, NAME, variable, estimate)
HealthTot<- HealthTot %>%
spread(variable, estimate)
HealthTot<- HealthTot %>%
replace_na(list(Whitet= 0, Blackt=0, Nativet=0, Asiant=0, Pacifict=0, Othert=0, TwoMoret=0, WhiteNott=0, Hispanict=0))
HealthTot<- HealthTot %>%
mutate("HTot" = Whitet + Blackt + Nativet + Asiant + Pacifict + Othert + TwoMoret + WhiteNott + Hispanict)
#Join the health variables
Health<-left_join(HealthTot, HealthPro, by = "GEOID")
Health<-rename.variable(Health, "NAME.x","County")
Health<- select(Health, GEOID, County, HPro, HTot)
Health<- Health%>%
mutate("NoHealth"=HTot/HPro)
#Total in Population
MalePop<-get_acs(geography = "county", variables = c(WhMalePop="C23002A_004", BMalePop="C23002B_004", NMalePop="C23002C_004", AMalePop="C23002D_004", PIPop="C23002E_004", OMalePop="C23002F_004", TwoMalePop="C23002G_004", WAMalePop="C23002H_004", HMalePopl="C23002I_004"))
MalePop<- MalePop %>%
select(GEOID, NAME, variable, estimate)
MalePop<- MalePop %>%
spread(variable, estimate)
MalePop<- MalePop %>%
replace_na(list(WhMalePop = 0, BMalePop = 0, NMalePop = 0, AMalePop= 0, PIPop = 0, OMalePop = 0, TwoMalePop = 0, WAMalePop = 0, HMalePopl= 0))
MalePop<- MalePop %>%
mutate("MPop" = WhMalePop + BMalePop + NMalePop + AMalePop + PIPop + OMalePop + TwoMalePop + WAMalePop + HMalePopl)
MalePop<- MalePop %>%
select(GEOID, NAME, MPop)
#Total Unemployed
MaleEmp<-get_acs(geography = "county", variables = c(WhMaleEmp="C23002A_008", BMaleEmp="C23002B_008", NMaleEmp="C23002C_008", AMaleEmp="C23002D_008", PIMaleEmpl="C23002E_008", OMaleEmp="C23002F_008", TwoMaleEmp="C23002G_008", WAMaleEmp="C23002H_008", HMaleEmpl="C23002I_008"))
MaleEmp<- MaleEmp %>%
select(GEOID, NAME, variable, estimate)
MaleEmp<- MaleEmp %>%
spread(variable, estimate)
MaleEmp<- MaleEmp %>%
replace_na(list(WhMaleEmp = 0, BMaleEmp = 0, NMaleEmp = 0, AMaleEmp= 0, PMaleEmpl = 0, OMaleEmp = 0, TwoMaleEmp = 0, WAMaleEmp = 0, HMaleEmpl= 0))
MaleEmp<- MaleEmp %>%
mutate("MUnEmp" = WhMaleEmp + BMaleEmp + NMaleEmp + AMaleEmp + PIMaleEmpl + OMaleEmp + TwoMaleEmp + WAMaleEmp + HMaleEmpl)
MaleEmp<- MaleEmp %>%
select(GEOID,MUnEmp)
MaleEmp<- left_join(MalePop, MaleEmp, by = "GEOID")
MaleEmp<- MaleEmp%>%
mutate("MUnEmploy"= MUnEmp/MPop)
#Total in Population
FemalePop<-get_acs(geography = "county", variables = c(WhFemalePop="C23002A_016", BFemalePop="C23002B_016", NFemalePop="C23002C_016", AFemalePop="C23002D_016", PIPop="C23002E_016", OFemalePop="C23002F_016", TwoFemalePop="C23002G_016", WAFemalePop="C23002H_016", HFemalePopl="C23002I_016"))
FemalePop<- FemalePop %>%
select(GEOID, NAME, variable, estimate)
FemalePop<- FemalePop %>%
spread(variable, estimate)
FemalePop<- FemalePop %>%
replace_na(list(WhFemalePop = 0, BFemalePop = 0, NFemalePop = 0, AFemalePop= 0, PIPop = 0, OFemalePop = 0, TwoFemalePop = 0, WAFemalePop = 0, HFemalePopl= 0))
FemalePop<- FemalePop %>%
mutate("FPop" = WhFemalePop + BFemalePop + NFemalePop + AFemalePop + PIPop + OFemalePop + TwoFemalePop + WAFemalePop + HFemalePopl)
FemalePop<- FemalePop %>%
select(GEOID, NAME, FPop)
#Total Unemployed
FemaleEmp<-get_acs(geography = "county", variables = c(WhFemaleEmp="C23002A_021", BFemaleEmp="C23002B_021", NFemaleEmp="C23002C_021", AFemaleEmp="C23002D_021", PIFemaleEmpl="C23002E_021", OFemaleEmp="C23002F_021", TwoFemaleEmp="C23002G_021", WAFemaleEmp="C23002H_021", HFemaleEmpl="C23002I_021"))
FemaleEmp<- FemaleEmp %>%
select(GEOID, NAME, variable, estimate)
FemaleEmp<- FemaleEmp %>%
spread(variable, estimate)
FemaleEmp<- FemaleEmp %>%
replace_na(list(WhFemaleEmp = 0, BFemaleEmp = 0, NFemaleEmp = 0, AFemaleEmp= 0, PFemaleEmpl = 0, OFemaleEmp = 0, TwoFemaleEmp = 0, WAFemaleEmp = 0, HFemaleEmpl= 0))
FemaleEmp<- FemaleEmp %>%
mutate("FUnEmp" = WhFemaleEmp + BFemaleEmp + NFemaleEmp + AFemaleEmp + PIFemaleEmpl + OFemaleEmp + TwoFemaleEmp + WAFemaleEmp + HFemaleEmpl)
FemaleEmp<- FemaleEmp %>%
select(GEOID,FUnEmp)
FemaleEmp<- left_join(FemalePop, FemaleEmp, by = "GEOID")
FemaleEmp<- FemaleEmp%>%
mutate("FUnEmploy"= FUnEmp/FPop)
employ<-left_join(MaleEmp, FemaleEmp, by="GEOID")
employ<- select(employ, GEOID, NAME.x, MPop, MUnEmp, FPop, FUnEmp)
employ<-rename.variable(employ, "NAME.x","County")
employ<- employ %>%
mutate("total"= MPop+FPop)
employ<- employ %>%
mutate("popemployed"= MUnEmp+FUnEmp)
employ<- employ %>%
mutate ("Percent_Unemployed" = (popemployed/total))
#Total in Population
RacePop<-get_acs(geography = "county", variables = c(RacePop="B01001_001"))
RacePop<- rename.variable(RacePop, "estimate","RacePop")
#Total White
WhiteTot<-get_acs(geography = "county", variables = c(WhiteTot="B01001A_001"))
WhiteTot<-rename.variable(WhiteTot, "estimate","WhiteTot")
#Join the Race/Ethnicity
TotalWhite<-left_join(RacePop, WhiteTot, by = "GEOID")
TotalWhite<-rename.variable(TotalWhite, "NAME.x","County")
TotalWhite<- select(TotalWhite, GEOID, County, RacePop, WhiteTot)
TotalWhite<- TotalWhite%>%
mutate("PercentWhite"= WhiteTot/RacePop)
#Total Veterans
VetPop<-get_acs(geography = "county", variables = c(VetPop="B21001_002"))
VetPop<- rename.variable(VetPop, "estimate","VetPop")
#Total NonVeterans
VetNon<-get_acs(geography = "county", variables = c(VetNon="B21001_003"))
VetNon<-rename.variable(VetNon, "estimate","VetNon")
#Join the Veterans
Veteran<-left_join(VetPop, VetNon, by = "GEOID")
Veteran<-rename.variable(Veteran, "NAME.x","County")
Veteran<- select(Veteran, GEOID, County, VetNon, VetPop)
Veteran<- Veteran%>%
mutate("TotalPopV"= VetNon + VetPop)
Veteran<- Veteran%>%
mutate("PercentVet"= VetPop/TotalPopV)
#Total Pop
House<-get_acs(geography = "county", variables = c(House="B07401_001"))
House<- rename.variable(House, "estimate","House")
House<- select(House, GEOID, House)
#Total Same House
HouseSame<-get_acs(geography = "county", variables = c(HouseSame="B07401_017"))
HouseSame<-rename.variable(HouseSame, "estimate","HouseSame")
HouseSame<- select(HouseSame, GEOID, HouseSame)
HousePercent<- left_join(HouseSame, House, by = "GEOID")
HousePercent<-
HousePercent%>%
mutate("HousePercent" = HouseSame/House)
HousePercent<- select(HousePercent, GEOID, HousePercent)
#Get Married Statuses
Married<-get_acs(geography = "county", variables = c(MarriedPop="B07408_001",NeverMarried="B07408_002", Married="B07408_003", Divorced="B07408_004", Separated="B07408_005", Widowed="B07408_006"))
Married<- select(Married, GEOID, variable, estimate)
#Tranverse the data with spread
Married<- Married %>%
spread(variable, estimate)
#Calculate Proportions and clean dataset
Married<- Married %>%
mutate("percentnevermarried"=NeverMarried/MarriedPop)
Married<- Married %>%
mutate("percentmarried"=Married/MarriedPop)
Married<- Married %>%
mutate("percentdivorced"=Divorced/MarriedPop)
Married<- Married %>%
mutate("percentseparated"=Separated/MarriedPop)
Married<- Married %>%
mutate("percentwidowed"=Widowed/MarriedPop)
Married<- Married %>%
mutate("percentsingle"=(1- percentmarried)+percentwidowed)
#Clean the dataset
Married<- select(Married, GEOID, percentnevermarried, percentmarried, percentdivorced, percentseparated, percentwidowed, percentsingle)
Education<-get_acs(geography = "county", variables = c(EduPop="B07409_001",LessHS="B07409_002", HSorEquiv="B07409_003", SomeCollegeAss="B07409_004", Bach="B07409_005", GradorProf="B07409_006"))
Education<- select(Education, GEOID, variable, estimate)
#Tranverse the data with spread
Education<- Education %>%
spread(variable, estimate)
#Calculate Proportions and clean dataset
Education<- Education %>%
mutate("PercentLessthanHS"=LessHS/EduPop)
Education<- Education %>%
mutate("PercentHS"=(HSorEquiv+SomeCollegeAss+Bach+GradorProf)/EduPop)
Education<- Education %>%
mutate("PercentSomeorASS"=(SomeCollegeAss+Bach+GradorProf)/EduPop)
Education<- Education %>%
mutate("PercentBach"=(Bach+GradorProf)/EduPop)
Education<- Education %>%
mutate("PercentGradorPro"=GradorProf/EduPop)
Education<- select(Education, GEOID, PercentLessthanHS, PercentHS, PercentSomeorASS, PercentBach, PercentGradorPro)
Tribes<-get_acs(geography = "county", variables = c(pop = "B01003_001", tribes = "B02014_002"))
Tribes<- select(Tribes, GEOID, variable, estimate)
# Transpose with spread
Tribes<- Tribes %>%
spread(variable,estimate)
# Calculate the proportion
Tribes<- Tribes %>%
mutate(percenttribe = tribes/pop)
Tribes<- select(Tribes, GEOID, percenttribe)
Parenting<-get_acs(geography = "county", variables = c(totkids = "B05009_001", undersixoneparent="B05009_013", sixtoseventeeenoneparent="B05009_031"))
Parenting<- select(Parenting, GEOID, variable, estimate)
#Tranverse the data with spread
Parenting<- Parenting %>%
spread(variable, estimate)
#Calculate proportion
Parenting<- Parenting %>%
mutate(SingleParPercent = (sixtoseventeeenoneparent + undersixoneparent)/totkids)
Parenting<- select(Parenting, GEOID, SingleParPercent)
Citizen<-get_acs(geography = "county", variables = c(totalpop= "B01003_001", totalmig = "B05011_001", notcitizen="B05011_002", naturalized="B05011_003"))
Citizen<- select(Citizen, GEOID, variable, estimate)
#Tranverse the data with spread
Citizen<- Citizen %>%
spread(variable, estimate)
#Calculate proportion
Citizen<- Citizen %>%
mutate(PercentNotCitizen = notcitizen/totalpop)
Citizen<- Citizen %>%
mutate(PercentImmigrant = totalmig/totalpop)
Citizen<- select(Citizen, GEOID, PercentNotCitizen, PercentImmigrant)
Renters<- get_acs(geography = "county", variables = c(totalh="B07013_001", renters = "B07013_003"))
Renters<- select(Renters, GEOID, variable, estimate)
#Transverse the data with spread
Renters<- Renters %>%
spread(variable, estimate)
#Calculate proportions
Renters<- Renters %>%
mutate(PercentRent = renters/totalh)
Renters<- select(Renters, GEOID, PercentRent)
census<- left_join(employ, Health, by = "GEOID")
census<- rename.variable(census, "GEOID.x", "GEOID")
census<- left_join(census, income, by = "GEOID")
census<- left_join(census, TotalWhite, by = "GEOID")
census<- left_join(census, Veteran, by = "GEOID")
census<- left_join(census, population, by = "GEOID")
census<- left_join(census, HousePercent, by = "GEOID")
census<- left_join(census, Married, by = "GEOID")
census<- left_join(census, Education, by = "GEOID")
census<- left_join(census, Tribes, by = "GEOID")
census<- left_join(census, Parenting, by= "GEOID")
census<- left_join(census, Citizen, by = "GEOID")
census<- left_join(census, Renters, by = "GEOID")
census<-rename.variable(census, "County.x","County")
census<- census %>%
mutate("WithHealth" = 1-NoHealth)
census<- select(census, GEOID, County, Percent_Unemployed, HPro, HTot, NoHealth, WithHealth, med_income, RacePop, WhiteTot, PercentWhite, VetNon, VetPop, PercentVet, popchange, percentpopchange, HousePercent, percentnevermarried, percentmarried, percentdivorced, percentseparated, percentwidowed, percentsingle, PercentLessthanHS, PercentHS, PercentSomeorASS, PercentBach, PercentGradorPro, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)
write.csv(census, "census.csv")
census<- read.csv("census.csv")
ipeds<- read.csv("ipeds.csv")
ipeds$COUNTYCD<- as.numeric(ipeds$COUNTYCD)
ipedsgraddata<-left_join(ipeds, census, by = c("COUNTYCD" = "GEOID"))
write.csv(ipedsgraddata, "ipedsgradmassive2.csv")
##Results may change slightly with each iteration of the model.
library(randomForest)
library(tidyverse)
library(ROCR)
set.seed(71)
gf<- read.csv('ipedsgradmassive2.csv')
gf<- select(gf, gradtransf, Percent_Unemployed, WithHealth, med_income, PercentWhite, percentpopchange, HousePercent, percentmarried, percentdivorced, percentseparated, PercentSomeorASS, PercentBach, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)
gf<- na.omit(gf)
descriptives<- psych::describe(gf)
descriptives
## vars n mean sd median trimmed mad
## gradtransf 1 775 0.46 0.12 0.45 0.46 0.10
## Percent_Unemployed 2 775 0.05 0.01 0.04 0.04 0.01
## WithHealth 3 775 0.88 0.06 0.89 0.88 0.06
## med_income 4 775 65613.25 19449.81 61310.00 63379.26 16028.39
## PercentWhite 5 775 0.72 0.18 0.75 0.73 0.20
## percentpopchange 6 775 0.01 0.04 0.00 0.01 0.04
## HousePercent 7 775 0.87 0.03 0.88 0.87 0.03
## percentmarried 8 775 0.48 0.05 0.48 0.48 0.05
## percentdivorced 9 775 0.11 0.02 0.11 0.11 0.02
## percentseparated 10 775 0.02 0.01 0.02 0.02 0.01
## PercentSomeorASS 11 775 0.60 0.09 0.60 0.60 0.09
## PercentBach 12 775 0.29 0.11 0.26 0.28 0.11
## percenttribe 13 775 0.01 0.03 0.00 0.00 0.00
## SingleParPercent 14 775 0.36 0.09 0.35 0.35 0.08
## PercentNotCitizen 15 775 0.05 0.05 0.03 0.04 0.03
## PercentImmigrant 16 775 0.10 0.09 0.06 0.08 0.06
## PercentRent 17 775 0.31 0.09 0.31 0.31 0.09
## min max range skew kurtosis se
## gradtransf 0.17 0.95 0.78 0.55 0.98 0.00
## Percent_Unemployed 0.00 0.11 0.11 0.80 2.04 0.00
## WithHealth 0.59 0.97 0.39 -0.92 0.92 0.00
## med_income 24958.00 140258.00 115300.00 1.22 1.88 698.66
## PercentWhite 0.15 0.98 0.83 -0.70 -0.33 0.01
## percentpopchange -0.11 0.22 0.33 0.77 1.75 0.00
## HousePercent 0.74 0.94 0.21 -0.75 1.07 0.00
## percentmarried 0.26 0.62 0.36 -0.72 1.40 0.00
## percentdivorced 0.05 0.19 0.14 0.22 -0.32 0.00
## percentseparated 0.00 0.06 0.06 0.96 1.75 0.00
## PercentSomeorASS 0.33 0.83 0.49 -0.02 -0.43 0.00
## PercentBach 0.09 0.63 0.54 0.68 -0.15 0.00
## percenttribe 0.00 0.42 0.42 8.86 88.18 0.00
## SingleParPercent 0.10 0.75 0.66 0.90 1.99 0.00
## PercentNotCitizen 0.00 0.21 0.21 1.26 0.84 0.00
## PercentImmigrant 0.00 0.47 0.47 1.41 1.32 0.00
## PercentRent 0.13 0.79 0.66 0.89 1.97 0.00
write.csv(descriptives, "descriptives.csv")
mtry <- tuneRF(gf[-1],gf$gradtransf, ntreeTry=500, stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 5 OOB error = 0.01051121
## Searching left ...
## mtry = 4 OOB error = 0.01060431
## -0.008857718 0.01
## Searching right ...
## mtry = 7 OOB error = 0.01036733
## 0.01368802 0.01
## mtry = 10 OOB error = 0.01044787
## -0.007768348 0.01
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
## mtry OOBError
## 4 4 0.01060431
## 5 5 0.01051121
## 7 7 0.01036733
## 10 10 0.01044787
ind <- sample(2, nrow(gf), replace = TRUE, prob = c(0.7, 0.3))
train <- gf[ind==1,]
test <- gf[ind==2,]
Graduation_Transfer_Model <-randomForest(gradtransf~.,data=train, mtry=best.m, importance=TRUE, ntree=500, proximity = TRUE)
print(Graduation_Transfer_Model)
##
## Call:
## randomForest(formula = gradtransf ~ ., data = train, mtry = best.m, importance = TRUE, ntree = 500, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 0.009788583
## % Var explained: 26.75
## Prediction with test data
pred_test <- predict(Graduation_Transfer_Model, newdata = test)
## Calculate RMSE of the difference between training model and test model
RMSE<-sqrt(mean((pred_test-test$gradtransf)^2))
RMSE
## [1] 0.1111422
## Importance table and plot
randomForest::importance(Graduation_Transfer_Model)
## %IncMSE IncNodePurity
## Percent_Unemployed 20.859259 0.7349980
## WithHealth 9.313247 0.2803900
## med_income 12.104970 0.3274085
## PercentWhite 12.336679 0.6518768
## percentpopchange 11.214726 0.3943554
## HousePercent 7.487493 0.2686669
## percentmarried 12.133233 0.4272672
## percentdivorced 13.583127 0.4092022
## percentseparated 12.372320 0.3511591
## PercentSomeorASS 8.169681 0.2852537
## PercentBach 12.549303 0.2802166
## percenttribe 7.032034 0.4353853
## SingleParPercent 10.989753 0.3003422
## PercentNotCitizen 14.810841 0.5628101
## PercentImmigrant 13.283750 0.4941443
## PercentRent 9.066181 0.3429072
importance_gr<- rownames_to_column(as.data.frame(importance(Graduation_Transfer_Model)))%>%
mutate(Category = "Grad_Plus_Trans")
varImpPlot(Graduation_Transfer_Model)
library(randomForest)
library(tidyverse)
library(ROCR)
set.seed(71)
gf1<- read.csv('ipedsgradmassive2.csv')
gf1<- select(gf1, RET_PCF, Percent_Unemployed, WithHealth, med_income, PercentWhite, percentpopchange, HousePercent, percentmarried, percentdivorced, percentseparated, PercentSomeorASS, PercentBach, percenttribe, SingleParPercent, PercentNotCitizen, PercentImmigrant, PercentRent)
gf1<- na.omit(gf1)
descriptives2<- psych::describe(gf1)
descriptives2
## vars n mean sd median trimmed mad
## RET_PCF 1 775 0.60 0.08 0.60 0.60 0.07
## Percent_Unemployed 2 775 0.05 0.01 0.04 0.04 0.01
## WithHealth 3 775 0.88 0.06 0.89 0.88 0.06
## med_income 4 775 65613.25 19449.81 61310.00 63379.26 16028.39
## PercentWhite 5 775 0.72 0.18 0.75 0.73 0.20
## percentpopchange 6 775 0.01 0.04 0.00 0.01 0.04
## HousePercent 7 775 0.87 0.03 0.88 0.87 0.03
## percentmarried 8 775 0.48 0.05 0.48 0.48 0.05
## percentdivorced 9 775 0.11 0.02 0.11 0.11 0.02
## percentseparated 10 775 0.02 0.01 0.02 0.02 0.01
## PercentSomeorASS 11 775 0.60 0.09 0.60 0.60 0.09
## PercentBach 12 775 0.29 0.11 0.26 0.28 0.11
## percenttribe 13 775 0.01 0.03 0.00 0.00 0.00
## SingleParPercent 14 775 0.36 0.09 0.35 0.35 0.08
## PercentNotCitizen 15 775 0.05 0.05 0.03 0.04 0.03
## PercentImmigrant 16 775 0.10 0.09 0.06 0.08 0.06
## PercentRent 17 775 0.31 0.09 0.31 0.31 0.09
## min max range skew kurtosis se
## RET_PCF 0.22 0.89 0.67 -0.31 1.24 0.00
## Percent_Unemployed 0.00 0.11 0.11 0.80 2.04 0.00
## WithHealth 0.59 0.97 0.39 -0.92 0.92 0.00
## med_income 24958.00 140258.00 115300.00 1.22 1.88 698.66
## PercentWhite 0.15 0.98 0.83 -0.70 -0.33 0.01
## percentpopchange -0.11 0.22 0.33 0.77 1.75 0.00
## HousePercent 0.74 0.94 0.21 -0.75 1.07 0.00
## percentmarried 0.26 0.62 0.36 -0.72 1.40 0.00
## percentdivorced 0.05 0.19 0.14 0.22 -0.32 0.00
## percentseparated 0.00 0.06 0.06 0.96 1.75 0.00
## PercentSomeorASS 0.33 0.83 0.49 -0.02 -0.43 0.00
## PercentBach 0.09 0.63 0.54 0.68 -0.15 0.00
## percenttribe 0.00 0.42 0.42 8.86 88.18 0.00
## SingleParPercent 0.10 0.75 0.66 0.90 1.99 0.00
## PercentNotCitizen 0.00 0.21 0.21 1.26 0.84 0.00
## PercentImmigrant 0.00 0.47 0.47 1.41 1.32 0.00
## PercentRent 0.13 0.79 0.66 0.89 1.97 0.00
write.csv(descriptives2, "descriptives2.csv")
mtry <- tuneRF(gf1[-1],gf1$RET_PCF, ntreeTry=500, stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 5 OOB error = 0.006336744
## Searching left ...
## mtry = 4 OOB error = 0.006285523
## 0.008083153 0.01
## Searching right ...
## mtry = 7 OOB error = 0.006351955
## -0.002400447 0.01
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
## mtry OOBError
## 4 4 0.006285523
## 5 5 0.006336744
## 7 7 0.006351955
ind <- sample(2, nrow(gf1), replace = TRUE, prob = c(0.7, 0.3))
train <- gf1[ind==1,]
test <- gf1[ind==2,]
Full_Time_Retention_Model <-randomForest(RET_PCF~.,data=train, mtry=best.m, importance=TRUE, ntree=500, proximity = TRUE)
print(Full_Time_Retention_Model)
##
## Call:
## randomForest(formula = RET_PCF ~ ., data = train, mtry = best.m, importance = TRUE, ntree = 500, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.007033668
## % Var explained: 6.75
## Prediction with test data
pred_test <- predict(Full_Time_Retention_Model, newdata = test)
## Calculate RMSE of the difference between training model and test model
RMSE<-sqrt(mean((pred_test-test$RET_PCF)^2))
RMSE
## [1] 0.07212177
## Importance table and plot
importance(Full_Time_Retention_Model)
## %IncMSE IncNodePurity
## Percent_Unemployed 5.624713 0.2116929
## WithHealth 7.643891 0.2064366
## med_income 16.440263 0.2758204
## PercentWhite 9.187872 0.1630955
## percentpopchange 5.398700 0.2038181
## HousePercent 2.699104 0.2283913
## percentmarried 10.553436 0.2312817
## percentdivorced 11.058296 0.2920554
## percentseparated 10.802755 0.1830417
## PercentSomeorASS 7.550398 0.1637116
## PercentBach 9.992177 0.1845852
## percenttribe 5.657944 0.1801108
## SingleParPercent 8.994367 0.1904701
## PercentNotCitizen 14.185082 0.2684800
## PercentImmigrant 13.864259 0.2715811
## PercentRent 7.874369 0.1902822
importance_ret<- rownames_to_column(as.data.frame(importance(Full_Time_Retention_Model)))%>%
mutate(Category = "Full_Time_Retention")
varImpPlot(Full_Time_Retention_Model)
tabledata<- left_join(importance_gr, importance_ret, by = "rowname")
write.csv(tabledata, "tabledata.csv")
library(ggplot2)
im<- rbind(importance_gr, importance_ret)%>%
rename("Variable" = rowname)%>%
rename("Increase_MSE" = "%IncMSE")%>%
rename("Increase_Node_Purity" = IncNodePurity)%>%
select(Variable, Category, Increase_MSE, Increase_Node_Purity)%>%
pivot_longer(cols = c("Increase_MSE", "Increase_Node_Purity"),
names_to = "Measure",
values_to = "Rate")%>%
mutate(Rate = round(Rate, 2))
im1<- im%>%
subset(Measure == "Increase_MSE")
im2<- im%>%
subset(Measure == "Increase_Node_Purity")
ggplot(im1, aes(x = Variable, y = Rate, fill = Category))+
geom_bar(stat='identity', position = position_dodge())+
coord_flip()+
geom_text(aes(label=Rate), color="black",hjust = 1.6, position = position_dodge(1), size=3)+
scale_fill_brewer(palette="Set2")+
labs(title="Effects on MSE of Predictors by Dependent Variable", x = "Predictor", y ="Percent Increase MSE")
ggplot(im2, aes(x = Variable, y = Rate, fill = Category))+
geom_bar(stat='identity', position = position_dodge())+
coord_flip()+
geom_text(aes(label=Rate), color="black",hjust = 1.6, position = position_dodge(1), size=3)+
scale_fill_brewer(palette="Accent")+
labs(title="Effects on Node Purity of Predictors by Dependent Variable", x = "Predictor", y ="Increase Node Purity")
library(ggpubr)
formula <- ipedsgraddata$RET_PCF ~ poly(ipedsgraddata$gradtransf, 3, raw = TRUE)
ggplot(ipedsgraddata, aes(x=RET_PCF, y=gradtransf)) +
geom_point()+
geom_smooth(method = lm)+
labs(title="Full-Time Retention Predicting Graduation Plus Transfer",
x="Full-Time Retention", y = "Graduation Plus Transfer")+
stat_regline_equation(
aes(label = paste(..adj.rr.label..)),
formula = formula)