rm(list = ls()) # clear the environment
# NOTE(review): in the file as received the setwd() call sits behind the
# comment marker above, so it never runs. It is kept disabled here; re-enable
# it only if the script is run interactively from RStudio.
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# -------Import necessary packages here-------------------

# This is the only package you need for the coding assignment

# Including other packages for the autograder may cause some issues

library(tidyverse)
library(patchwork)

# ------ Uploading PERMID --------------------------------

PERMID <- "5008644" # Type your PERMID with the quotation marks
# Strip every non-digit character; the backslash must be doubled inside an R
# string literal ("\D" alone is an invalid escape and stops the parser).
PERMID <- as.numeric(gsub("\\D", "", PERMID)) # Don't touch
set.seed(PERMID) # Don't touch

# ------- Answers ----------------------------------------

# -------part1--------

# CPI table used below to convert nominal dollars into real dollars.
# NOTE(review): file names are kept exactly as written in this file; most other
# identifiers here had lost their underscores, so confirm the names on disk.
cpidata <- read_csv("CPIUminneapolisfed.csv")
# Quick sanity check of the CPI and year columns (printed, not stored).
cpidata %>% summarize(sum(CPI), sum(year))
# Keep only the first two columns of the CPI table.
cpidata <- cpidata[1:2]
# The stray `sum(cpi)` / `cpi` statements were dropped: nothing named `cpi`
# is defined at this point, so they stopped the script with an error.

# Rename and type-convert the raw education columns, then keep only the
# eleven derived columns (positions 12:22 assume eleven raw columns).
# NOTE(review): raw column names are written with underscores
# (LO_INC_DEBT_MDN, ...) as in College Scorecard; confirm against the CSV.
educationdata <- read_csv("educationdata.csv")
educationdata <- educationdata %>%
  mutate(
    schoolid = UNITID,
    schoolname = tolower(INSTNM),
    year = YEAR,
    stateid = STABBR,
    predominantdegree = PREDDEG,
    institutiontype = CONTROL,
    mediandebtlowincome = as.numeric(LO_INC_DEBT_MDN),
    mediandebtmedincome = as.numeric(MD_INC_DEBT_MDN),
    mediandebthighincome = as.numeric(HI_INC_DEBT_MDN),
    defaultrate = as.numeric(CDR3),
    avgfamilyincome = as.numeric(FAMINC)
  ) %>%
  select(12:22)

# Recode the numeric control code: 1 becomes "public", anything else "private".
educationdataclean <- educationdata %>%
  mutate(
    institutiontype = ifelse(institutiontype == 1, "public", "private")
  )

# Keep institutions whose predominant degree code is 3.
educationdataBA1 <- educationdataclean %>%
  filter(predominantdegree == 3)

# Attach the CPI (natural join on the shared columns) and deflate debt and
# family income. 251.1 is presumably the CPI of the base year; confirm.
educationdataBA <- educationdataBA1 %>%
  inner_join(cpidata)
educationdataBA <- educationdataBA %>%
  mutate(
    realdebtlowincome = mediandebtlowincome / (CPI / 251.1),
    realdebtmedincome = mediandebtmedincome / (CPI / 251.1),
    realdebthighincome = mediandebthighincome / (CPI / 251.1),
    realfamilyincome = avgfamilyincome / (CPI / 251.1)
  ) %>%
  # Keeps the six identifier columns, defaultrate, and the four real columns.
  select(1:6, 10, 13:16)

# -------- part2----------

# Net-price data: keep identifiers plus the six net-price columns.
# NOTE(review): the file name is kept exactly as written in this file; confirm
# it on disk. Column names use the underscore form that survived in the
# original for NPT45_PRIV.
costdata1 <- read_csv("costdata.csv")
costdata1 <- costdata1 %>%
  select(
    UNITID, INSTNM, YEAR,
    NPT41_PUB, NPT43_PUB, NPT45_PUB,
    NPT41_PRIV, NPT43_PRIV, NPT45_PRIV
  )

# Rename and convert to numeric, then keep only the nine derived columns.
costdata2 <- costdata1 %>%
  mutate(
    year = YEAR,
    schoolid = UNITID,
    schoolname = tolower(INSTNM),
    meancostlowincomepublic = as.numeric(NPT41_PUB),
    meancostmedincomepublic = as.numeric(NPT43_PUB),
    meancosthighincomepublic = as.numeric(NPT45_PUB),
    meancostlowincomeprivate = as.numeric(NPT41_PRIV),
    meancostmedincomeprivate = as.numeric(NPT43_PRIV),
    meancosthighincomeprivate = as.numeric(NPT45_PRIV)
  ) %>%
  select(10:18)

# One cost column per income group: take the public value and fall back to
# the private value when the public one is missing.
costdata3 <- costdata2 %>%
  mutate(
    meancostlowincome = coalesce(
      meancostlowincomepublic, meancostlowincomeprivate
    ),
    meancostmedincome = coalesce(
      meancostmedincomepublic, meancostmedincomeprivate
    ),
    meancosthighincome = coalesce(
      meancosthighincomepublic, meancosthighincomeprivate
    )
  ) %>%
  select(1:3, 10:12)

# Attach the CPI (natural join on the shared columns) and deflate the costs.
# 251.1 is presumably the CPI of the base year; confirm.
costdata4 <- costdata3 %>%
  inner_join(cpidata)
costdata4 <- costdata4 %>%
  mutate(
    realcostlowincome = meancostlowincome / (CPI / 251.1),
    realcostmedincome = meancostmedincome / (CPI / 251.1),
    realcosthighincome = meancosthighincome / (CPI / 251.1)
  )

# Final cost table: year, schoolid, schoolname and the three real costs.
costdata <- costdata4 %>%
  select(1:3, 8:10)

# -------part3--------

# Add the real cost columns to the education table. Both inputs carry a
# schoolname column, so the join suffixes them; the copy from costdata is
# dropped.
educationdataBAcost <- educationdataBA %>%
  left_join(costdata, by = c("schoolid", "year")) %>%
  select(-schoolname.y)

# Mean real debt and mean real cost per year and institution type.
debtcostsumstatyear <- educationdataBAcost %>%
  group_by(year, institutiontype) %>%
  summarize(
    meandebtforlowincome = mean(realdebtlowincome, na.rm = TRUE),
    meandebtformedianincome = mean(realdebtmedincome, na.rm = TRUE),
    meandebtforhighincome = mean(realdebthighincome, na.rm = TRUE),
    meancostforlowincome = mean(realcostlowincome, na.rm = TRUE),
    meancostformedianincome = mean(realcostmedincome, na.rm = TRUE),
    meancostforhighincome = mean(realcosthighincome, na.rm = TRUE)
  )

# Long-format debt table: one row per year, institution type, income group.
debt <- debtcostsumstatyear %>%
  select(1:5)
debt <- debt %>%
  pivot_longer(
    cols = c(
      "meandebtforlowincome",
      "meandebtformedianincome",
      "meandebtforhighincome"
    ),
    names_to = "incomecategory",
    values_to = "debt"
  ) %>%
  mutate(
    incomecategory = case_when(
      str_detect(incomecategory, "low") ~ "low income",
      str_detect(incomecategory, "median") ~ "median income",
      str_detect(incomecategory, "high") ~ "high income"
    )
  )

# Long-format cost table with the same income-category labels.
cost <- debtcostsumstatyear %>%
  select(1:2, 6:8)
cost <- cost %>%
  pivot_longer(
    cols = c(
      "meancostforlowincome",
      "meancostformedianincome",
      "meancostforhighincome"
    ),
    names_to = "incomecategory",
    values_to = "cost"
  ) %>%
  mutate(
    incomecategory = case_when(
      str_detect(incomecategory, "low") ~ "low income",
      str_detect(incomecategory, "median") ~ "median income",
      str_detect(incomecategory, "high") ~ "high income"
    )
  )

# Natural join on the columns the two long tables share.
debtcostdatabyyear <- cost %>%
  inner_join(debt)

# Mean real debt and family income by institution type.
debtsumstatschooltype <- educationdataBAcost %>%
  group_by(institutiontype) %>%
  summarise(
    meandebtforlowincome = mean(realdebtlowincome, na.rm = TRUE),
    meandebtformedianincome = mean(realdebtmedincome, na.rm = TRUE),
    meandebtforhighincome = mean(realdebthighincome, na.rm = TRUE),
    meanfamilyincome = mean(realfamilyincome, na.rm = TRUE)
  )

# Mean real debt and family income by year.
debtsumstatyear <- educationdataBAcost %>%
  group_by(year) %>%
  summarise(
    meandebtforlowincome = mean(realdebtlowincome, na.rm = TRUE),
    meandebtformedianincome = mean(realdebtmedincome, na.rm = TRUE),
    meandebtforhighincome = mean(realdebthighincome, na.rm = TRUE),
    meanfamilyincome = mean(realfamilyincome, na.rm = TRUE)
  )

# Mean real cost by institution type.
costsumstatschooltype <- educationdataBAcost %>%
  group_by(institutiontype) %>%
  summarise(
    meancostforlowincome = mean(realcostlowincome, na.rm = TRUE),
    meancostformedianincome = mean(realcostmedincome, na.rm = TRUE),
    meancostforhighincome = mean(realcosthighincome, na.rm = TRUE)
  )

# Mean real cost by year.
costsumstatyear <- educationdataBAcost %>%
  group_by(year) %>%
  summarise(
    meancostforlowincome = mean(realcostlowincome, na.rm = TRUE),
    meancostformedianincome = mean(realcostmedincome, na.rm = TRUE),
    meancostforhighincome = mean(realcosthighincome, na.rm = TRUE)
  ) %>%
  ungroup()

# Mean real debt per income group, by year and institution type.
filtered1 <- educationdataBAcost %>%
  group_by(year, institutiontype) %>%
  summarise(
    low = mean(realdebtlowincome, na.rm = TRUE),
    med = mean(realdebtmedincome, na.rm = TRUE),
    high = mean(realdebthighincome, na.rm = TRUE)
  )
private <- filtered1 %>% filter(institutiontype == "private")
public <- filtered1 %>% filter(institutiontype == "public")

# Same summary restricted to California institutions.
filteredca <- educationdataBAcost %>%
  filter(stateid == "CA") %>%
  group_by(year, institutiontype) %>%
  summarise(
    low = mean(realdebtlowincome, na.rm = TRUE),
    med = mean(realdebtmedincome, na.rm = TRUE),
    high = mean(realdebthighincome, na.rm = TRUE)
  )

# Same summary for every other state.
filteredother <- educationdataBAcost %>%
  filter(stateid != "CA") %>%
  group_by(year, institutiontype) %>%
  summarise(
    low = mean(realdebtlowincome, na.rm = TRUE),
    med = mean(realdebtmedincome, na.rm = TRUE),
    high = mean(realdebthighincome, na.rm = TRUE)
  )

# Stack both summaries, tagging each row with its region.
combineddata <- bind_rows(
  mutate(filteredca, region = "CA"),
  mutate(filteredother, region = "Other")
)

# Plot using facet_wrap: mean real debt of the three income groups over time,
# one panel per institution type and region.
figure3 <- ggplot(
  combineddata,
  aes(x = year, group = interaction(institutiontype, region))
) +
  geom_line(aes(y = low, color = "Low Income"), size = 1) +
  geom_point(aes(y = low, color = "Low Income"), size = 2) +
  geom_line(aes(y = med, color = "Median Income"), size = 1) +
  geom_point(aes(y = med, color = "Median Income"), size = 2) +
  geom_line(aes(y = high, color = "High Income"), size = 1) +
  geom_point(aes(y = high, color = "High Income"), size = 2) +
  labs(
    title = "Figure3:Mean Debt of Three Income Groups Over Ten Years in CA",
    x = "Year",
    y = "Mean Debt"
  ) +
  # theme_classic() is a complete theme, so the theme_minimal() that used to
  # precede it had no effect and was dropped.
  theme_classic() +
  scale_color_manual(
    values = c(
      "Low Income" = "blue",
      "Median Income" = "green",
      "High Income" = "red"
    ),
    # Explicit breaks fix the legend order. An unnamed `labels` vector is
    # matched by position to the alphabetically sorted breaks, which put
    # the wrong text next to each colour.
    breaks = c("Low Income", "Median Income", "High Income"),
    name = "Income Level"
  ) +
  facet_wrap(~ institutiontype + region, scales = "free_y") +
  scale_y_continuous(limits = c(10000, 25000))

print(figure3)
ggsave("figure3.png", figure3, width = 12, height = 8, units = "in")

# Mean default rate and mean real family income by year and institution type.
# Missing default rates are first replaced by the overall mean default rate.
filtereddefault <- educationdataBAcost %>%
  mutate(
    defaultrate = ifelse(
      is.na(defaultrate),
      mean(defaultrate, na.rm = TRUE),
      defaultrate
    )
  ) %>%
  group_by(year, institutiontype) %>%
  summarise(
    defaultrate = mean(defaultrate, na.rm = TRUE),
    realfamilyincome = mean(realfamilyincome, na.rm = TRUE)
  )
# Hard-coded overrides for column 4 (realfamilyincome) in rows 9-12.
# NOTE(review): presumably these fill years with no family income data;
# the values depend on the row order of the summary, so confirm them.
filtereddefault[9, 4] <- 69927.5
filtereddefault[11, 4] <- 72071.8
filtereddefault[10, 4] <- 61599.64
filtereddefault[12, 4] <- 63513.9

# First draft of figure4: default rate over time, one panel per type.
figure4 <- ggplot(filtereddefault, aes(x = year, color = institutiontype)) +
  geom_line(aes(y = defaultrate), size = 1) +
  geom_point(aes(y = defaultrate), size = 2) +
  labs(
    title = "Mean Debt of Three Income Groups Over Ten Years In CA for both public and private universities.",
    x = "Year",
    y = "Values"
  ) +
  theme_classic() +
  scale_color_manual(
    values = c("private" = "blue", "public" = "green"),
    name = "Institution Type"
  ) +
  scale_y_continuous(
    name = "Default Rate",
    limits = range(filtereddefault$defaultrate),
    sec.axis = sec_axis(~.)
  ) +
  facet_wrap(~institutiontype, scales = "free_y", ncol = 1)

ggsave("figure4.png", plot = figure4, width = 12, height = 8, units = "in")

# Placeholder figure5: axes and facets only, no geom layers.
figure5 <- ggplot(filtereddefault, aes(x = year, group = institutiontype)) +
  labs(
    title = "Figure4:Mean Debt of Three Income Groups Over Ten Years",
    x = "Year",
    y = "Default Rate"
  ) +
  theme_classic() +
  scale_color_manual(
    values = c("private" = "blue", "public" = "green"),
    name = "Institution Type"
  ) +
  scale_y_continuous(limits = c(50000, 80000)) +
  facet_wrap(~institutiontype, scales = "free_y")

print(figure5)

# Combine both plots with patchwork. The object was spelled two different
# ways before, so the second statement referenced an undefined name.
combinedplot <- figure4 + figure5

# Adjust the layout and appearance as needed
combinedplot <- combinedplot +
  plot_layout(nrow = 2) +
  plot_annotation(
    title = "Combined Plot of Default Rate and Real Family Income"
  )

# Print the combined plot
print(combinedplot)

# Final figure4 with the corrected title; this overwrites figure4.png.
figure4 <- ggplot(filtereddefault, aes(x = year, color = institutiontype)) +
  geom_line(aes(y = defaultrate), size = 1) +
  geom_point(aes(y = defaultrate), size = 2) +
  labs(
    title = "Figure4:Default Rate of public and private students over last ten years",
    x = "Year",
    y = "Default Rate"
  ) +
  theme_classic() +
  scale_color_manual(
    values = c("private" = "blue", "public" = "green"),
    name = "Institution Type"
  ) +
  scale_y_continuous(
    name = "Default Rate",
    limits = range(filtereddefault$defaultrate),
    sec.axis = sec_axis(~.)
  ) +
  facet_wrap(~institutiontype, scales = "free_y", ncol = 1)
print(figure4)
ggsave("figure4.png", plot = figure4, width = 12, height = 8, units = "in")

# Cost series per institution type scaled by the CPI, next to real family
# income. The multiplication operators had been lost
# ("(27754.42/229.6)CPI"), which is a syntax error.
# NOTE(review): 229.6 and the per-type dollar amounts are hard-coded;
# presumably a base-year CPI and base-year mean costs. Confirm.
realcost <- educationdataBAcost %>%
  inner_join(cpidata) %>%
  arrange(year) %>%
  mutate(
    low = ifelse(
      institutiontype == "private",
      (27754.42 / 229.6) * CPI,
      (8821 / 229.6) * CPI
    ),
    med = ifelse(
      institutiontype == "private",
      (35146.34 / 229.6) * CPI,
      (11986 / 229.6) * CPI
    ),
    high = ifelse(
      institutiontype == "private",
      (41109.971 / 229.6) * CPI,
      (16121.36 / 229.6) * CPI
    )
  ) %>%
  select(year, schoolid, institutiontype, realfamilyincome, low, med, high) %>%
  # One row per year and institution type (first occurrence is kept).
  distinct(year, institutiontype, .keep_all = TRUE)
# Hard-coded overrides for column 4 (realfamilyincome) in rows 9-12.
# NOTE(review): these depend on the row order produced above; confirm.
realcost[9, 4] <- 69927.5
realcost[11, 4] <- 72071.8
realcost[10, 4] <- 61599.64
realcost[12, 4] <- 63513.9

# Drop any row that still has a missing value before plotting.
realcost <- na.omit(realcost)
figure5 <- ggplot(realcost, aes(x = year, group = institutiontype)) +
  geom_line(aes(y = low, color = "Low Income"), size = 1) +
  geom_point(aes(y = low, color = "Low Income"), size = 2) +
  geom_line(
    aes(y = med, color = "Median Income"),
    size = 1, linetype = "dashed"
  ) +
  geom_point(aes(y = med, color = "Median Income"), size = 2) +
  geom_line(aes(y = high, color = "High Income"), size = 1) +
  geom_point(aes(y = high, color = "High Income"), size = 2) +
  geom_line(
    aes(y = realfamilyincome, color = "Family Income"),
    size = 1, linetype = "dashed"
  ) +
  geom_point(aes(y = realfamilyincome, color = "Family Income"), size = 2) +
  labs(
    title = "Figure5:Comparison of real family income with two types of university cost",
    x = "Year",
    y = "dollars"
  ) +
  # theme_classic() is a complete theme, so the theme_minimal() that used to
  # precede it had no effect and was dropped.
  theme_classic() +
  scale_color_manual(
    values = c(
      "Low Income" = "blue",
      "Median Income" = "green",
      "High Income" = "red",
      "Family Income" = "black"
    ),
    # Explicit breaks keep each legend entry next to its own colour. The
    # unnamed `labels` vector was matched by position to alphabetically
    # sorted breaks and mislabelled the legend.
    breaks = c("Low Income", "Median Income", "High Income", "Family Income"),
    name = "Income Level"
  ) +
  scale_y_continuous(limits = c(0, 90000)) +
  facet_wrap(~institutiontype, scales = "free_y")

print(figure5)

ggsave("figure5.png", figure5, width = 12, height = 8, units = "in")

# Graduate earnings: keep the three identifier columns created here plus raw
# columns 4:12 (the earnings columns reshaped below).
# NOTE(review): the file name is kept exactly as written in this file; confirm
# it on disk.
# NOTE(review): the pipeline used to end in a bare inner_join() with no second
# table, which stops with a missing-argument error. It was removed; add the
# intended table back if a join was meant here.
graduate <- read_csv("graduatesincome2018.csv")
graduate1 <- graduate %>%
  mutate(
    year = YEAR,
    schoolid = UNITID,
    schoolname = tolower(INSTNM)
  ) %>%
  select(13:15, 4:12)

# Reshape to one row per school, income category and years after graduation,
# then attach the group mean of the earnings to every school row.
# NOTE(review): the column prefix is written with underscores
# (MD_EARN_WNE_INC) as in College Scorecard; confirm against the CSV header.
longdata <- graduate1 %>%
  pivot_longer(
    cols = starts_with("MD_EARN_WNE_INC"),
    names_to = "incomecategory",
    values_to = "medianearnings"
  ) %>%
  mutate(
    yearaftergraduation = case_when(
      str_detect(incomecategory, "P6") ~ 6,
      str_detect(incomecategory, "P8") ~ 8,
      str_detect(incomecategory, "P10") ~ 10,
      TRUE ~ NA_real_
    ),
    incomecategory = case_when(
      str_detect(incomecategory, "INC1") ~ "low",
      str_detect(incomecategory, "INC2") ~ "median",
      str_detect(incomecategory, "INC3") ~ "high",
      TRUE ~ "other"
    )
  ) %>%
  mutate(
    medianearnings = as.numeric(medianearnings)
  ) %>%
  group_by(yearaftergraduation, incomecategory) %>%
  # schoolid is kept per row because a later join uses it, so this returns
  # several rows per group (recent dplyr versions warn and suggest reframe()).
  summarize(
    schoolid = schoolid,
    medianearnings = mean(medianearnings, na.rm = TRUE)
  )

# Display the result
figure6 <- ggplot(
  longdata,
  aes(
    x = factor(yearaftergraduation),
    y = medianearnings,
    fill = incomecategory
  )
) +
  geom_bar(
    stat = "identity", position = "dodge",
    width = 0.7, color = "black"
  ) +
  labs(
    title = "Figure6:Median Earnings by Year After Graduation and Income Category",
    x = "Year After Graduation",
    y = "Median Earnings"
  ) +
  scale_fill_manual(
    values = c("high" = "red", "low" = "blue", "median" = "green")
  ) +
  # theme_classic() is a complete theme, so the theme_minimal() that used to
  # precede it had no effect and was dropped.
  theme_classic()

ggsave("figure6.png", figure6, width = 12, height = 8, units = "in")

# Exploratory dot plot of earnings by income category, coloured by years
# after graduation. It is not assigned or saved.
ggplot(
  longdata,
  aes(
    x = factor(incomecategory),
    y = medianearnings,
    color = factor(yearaftergraduation)
  )
) +
  geom_point(position = position_dodge(width = 0.7), size = 3) +
  labs(
    title = "Median Earnings by Year After Graduation and Income Category",
    x = "Income Category",
    y = "Median Earnings",
    color = "Year After Graduation"
  ) +
  scale_color_manual(values = c("6" = "red", "8" = "blue", "10" = "green")) +
  theme_minimal()

# Earnings rows matched to California institutions in 2018. The year column
# comes from educationdataBAcost; the state column is stateid.
yearaftergraduation <- longdata %>%
  inner_join(educationdataBAcost, by = "schoolid") %>%
  filter(stateid == "CA") %>%
  filter(year == 2018)

# Box plot of earnings per income category for the California subset.
figure8 <- ggplot(
  data = yearaftergraduation,
  aes(x = incomecategory, y = medianearnings)
) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(
    title = "Figure6:Distribution of Median Earnings Across Income Categories",
    x = "Income Category",
    y = "Median Earnings"
  ) +
  # theme_classic() is a complete theme, so the theme_minimal() that used to
  # precede it had no effect and was dropped.
  theme_classic()

ggsave("figure8.png", figure8, width = 12, height = 8, units = "in")