setwd("C:/Users/qizhe/Desktop/STA 141A/HW1")
cs <- readRDS("college_scorecard_2013.rds")
# 1 Total of 3312 observations, 3312 different colleges.
dim(cs) # 3312 observations
## [1] 3312 51
# number of colleges
sum(cs$main_campus) #2431 main campuses
## [1] 2431
# take out those records with duplicated school names
#duplicated_records = cs[cs$name %in% cs$name[duplicated(cs$name)],]
#ordered_duplicated_records = duplicated_records[order(duplicated_records$name),]
#ordered_duplicated_records[,1:7]
#Problem of duplicate names, multiple branches, can't tell where to discern
#individual college existence
# 2
# sapply()
tab1 <- sapply(cs, class)
table(tab1)
## tab1
## character factor integer logical numeric
## 4 4 15 3 25
# examine each variable type using str()
str(cs)
## 'data.frame': 3312 obs. of 51 variables:
## $ unit_id : int 100654 100663 100690 100706 100724 100751 100812 100830 100858 100937 ...
## $ ope_id : chr "001002" "001052" "025034" "001055" ...
## $ main_campus : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ branches : int 1 1 1 1 1 1 1 1 1 1 ...
## $ open_admissions : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ name : chr "Alabama A & M University" "University of Alabama at Birmingham" "Amridge University" "University of Alabama in Huntsville" ...
## $ city : chr "Normal" "Birmingham" "Montgomery" "Huntsville" ...
## $ state : Factor w/ 56 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ zip : chr "35762" "35294-0110" "36117-3553" "35899" ...
## $ online_only : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ primary_degree : Factor w/ 5 levels "Other","Certificate",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ highest_degree : Factor w/ 5 levels "Other","Certificate",..: 5 5 5 5 5 5 4 5 5 4 ...
## $ ownership : Factor w/ 3 levels "Public","Nonprofit",..: 1 1 2 1 1 1 1 1 1 2 ...
## $ avg_sat : int 823 1146 NA 1180 830 1171 NA 970 1215 1177 ...
## $ undergrad_pop : int 4051 11200 322 5525 5354 28692 2999 4322 19761 1181 ...
## $ grad_pop : int 969 7066 309 1680 719 5312 NA 762 5065 NA ...
## $ cost : int 18888 19990 12300 20306 17400 26717 NA 16556 23788 44167 ...
## $ tuition : int 7182 7206 6870 9192 8720 9450 NA 8750 9852 30690 ...
## $ tuition_nonresident : int 12774 16398 6870 21506 15656 23950 NA 24950 26364 30690 ...
## $ revenue_per_student : int 9063 9033 12057 8322 7813 12198 5094 7625 13186 11928 ...
## $ spend_per_student : int 7459 17208 5123 9352 7393 9817 6176 6817 11324 9990 ...
## $ avg_faculty_salary : int 7079 10170 3849 9341 6557 9605 7672 7173 9429 7513 ...
## $ ft_faculty : num 0.886 0.911 0.672 0.655 0.664 ...
## $ admission : num 0.899 0.867 NA 0.806 0.512 ...
## $ retention : num 0.631 0.802 0.375 0.81 0.622 ...
## $ completion : num 0.291 0.538 0.667 0.483 0.252 ...
## $ fed_loan : num 0.82 0.54 0.763 0.473 0.874 ...
## $ pell_grant : num 0.712 0.35 0.684 0.328 0.827 ...
## $ avg_family_inc : num 29590 50153 22714 51013 28113 ...
## $ med_family_inc : num 21281 35163 16619 33787 20269 ...
## $ avg_10yr_salary : int 34300 46400 46100 50500 29500 49900 42200 37900 54100 51000 ...
## $ sd_10yr_salary : int 25300 36300 34900 32700 20500 42600 28000 27800 41800 36600 ...
## $ med_10yr_salary : int 29900 40200 40100 45600 26700 42700 38500 33500 47100 45600 ...
## $ med_debt : num 30968 21282 24897 23106 32000 ...
## $ med_debt_withdraw : num 9500 8925 6731 8068 9500 ...
## $ default_3yr_rate : num 0.163 0.08 0.089 0.077 0.191 0.081 0.094 0.131 0.061 0.099 ...
## $ repay_5yr_rate_withdraw: num 0.324 0.606 0.449 0.613 0.261 ...
## $ repay_5yr_rate : num 0.53 0.731 0.529 0.794 0.446 ...
## $ avg_entry_age : num 20.5 23.5 33.9 24.1 20.6 ...
## $ veteran : num NA 0.00354 NA 0.01013 0.00353 ...
## $ first_gen : num 0.388 0.336 0.54 0.326 0.374 ...
## $ male : num 0.486 0.413 0.391 0.558 0.408 ...
## $ female : num 0.514 0.587 0.609 0.442 0.592 ...
## $ race_white : num 0.0279 0.5987 0.2919 0.7012 0.0161 ...
## $ race_black : num 0.95 0.259 0.422 0.131 0.928 ...
## $ race_hispanic : num 0.0089 0.0258 0.0093 0.0338 0.0114 0.0313 0.0213 0.0079 0.0253 0.0305 ...
## $ race_asian : num 0.0022 0.0518 0.0031 0.0364 0.0015 0.0112 0.0047 0.0245 0.0213 0.039 ...
## $ race_native : num 0.0012 0.0026 0.0031 0.0145 0.0009 0.0044 0.019 0.0037 0.0077 0.0102 ...
## $ race_pacific : num 0.001 0.0007 0.0031 0.0002 0.0007 0.0011 0.0007 0.0002 0 0 ...
## $ net_cost : int 13415 14805 7455 17520 11936 21513 NA 11915 17541 21406 ...
## $ race_other : num 0.0086 0.0614 0.2671 0.0828 0.0409 ...
#51 features (variables) (50 if remove unit_id), 12 categorical, 39 numerical (ordinal?) double check
#numerical (discrete): branches, undergrad_pop, grad_pop
#numerical (continuous): avg_sat, cost, tuition, tuition_nonresident, revenue_per_student, spend_per_student
#avg_faculty_salary, ft_faculty, admission, retention, completion, fed_loan, pell_grant, avg_family_inc, med_family_inc
#avg_10yr_salary, sd_10yr_salary, med_10yr_salary, med_debt, med_debt_withdraw, default_3yr_rate, repay_5yr_rate_withdraw
#repay_5yr_rate, avg_entry_age, veteran, first_gen, male, female, race_white, race_black, race_hispanic, race_asian, race_native
#race_pacific, net_cost, race_other
#categorical : unit_id, ope_id, name, city, state, zip
#ordinal variable: primary_degree, highest_degree
#nominal variable: ownership
#nominal, ratio, interval?
#logical variable (double check): main_campus, open_admissions, online_only,
# duplicated ope_id records
#duplicated_record_ope_id = cs[cs$ope_id %in% cs$ope_id[duplicated(cs$ope_id)],]
#order_duplicated_record_ope_id = duplicated_record_ope_id[order(duplicated_record_ope_id$ope_id),]
# 3
# number of NA's
sum(is.na(cs)) # 23197
## [1] 23197
# find number of NA's per variable, find the largest
count_na_by_features = sapply(cs, function(x) sum(is.na(x)))
t(t(count_na_by_features)) # avg_sat is largest with 1,923
## [,1]
## unit_id 0
## ope_id 0
## main_campus 0
## branches 0
## open_admissions 0
## name 0
## city 0
## state 0
## zip 0
## online_only 199
## primary_degree 0
## highest_degree 0
## ownership 0
## avg_sat 1923
## undergrad_pop 490
## grad_pop 1149
## cost 717
## tuition 472
## tuition_nonresident 635
## revenue_per_student 201
## spend_per_student 201
## avg_faculty_salary 317
## ft_faculty 558
## admission 1416
## retention 966
## completion 864
## fed_loan 510
## pell_grant 510
## avg_family_inc 267
## med_family_inc 267
## avg_10yr_salary 480
## sd_10yr_salary 480
## med_10yr_salary 480
## med_debt 487
## med_debt_withdraw 489
## default_3yr_rate 183
## repay_5yr_rate_withdraw 538
## repay_5yr_rate 792
## avg_entry_age 267
## veteran 1735
## first_gen 505
## male 490
## female 490
## race_white 490
## race_black 490
## race_hispanic 490
## race_asian 490
## race_native 490
## race_pacific 490
## net_cost 689
## race_other 490
#different types of NA's, intentional, random
#turn into a heatmap, no builtin heatmap function
# Patterns
#total 23197 NA's, highest NA is avg_sat with 1923
#revenue_per_student and spend_per_student both have 201 NA's
#fed_loan and pell_grant both have 510 NA's
#avg_family_inc and med_family_inc both have 267 NA's
#avg_10yr_salary, sd_10yr_salary, med_10yr_salary each have 480 NA's When one of the values are missing, others can't be determined also.
#med_debt and med_debt_withdraw are very close at 487/489 NA's
#male, female, race_white, race_black, race_hispanic, race_asian, race_native,
#race_pacific, and race_other each have 490 NA's School's that don't have gender information also lack ethnic information.
#can plot the NA's to determine pattern
#look into each function, e.g., (image(), colSums())
#finish the rest of Swirl apply, basegraphics 9,10,11,15
#sort(NAcounts)
#?order
#?rank
#usually sort + order
NA_counts<-colSums(is.na(cs))
#NA_counts<-as.matrix(NA_counts)
#NA_counts
# graphically display the NA's per feature
plot(NA_counts, main="Number of Missing Data per Feature",
xlab = "Feature Index",
ylab = "Number NA's", cex = 1, pch=15)
abline(h=c(500, 0), col="Green") # many around this region

# Optional techniques, additional resources
#summary(data)
#which.max(NAcounts)
#hist(NA_counts)
#find the 500's
#table(NA_counts)
#which(NA_counts==490)
#which(NA_counts...|...)
#factor in datacamp
#Roger peng youtube, brian caffo
# 4
# number of private versus public colleges
# public
length(which(cs$ownership=="Public")) #716
## [1] 716
# private aka For Profit and Nonprofit
length(which(cs$ownership!="Public")) #2,596
## [1] 2596
length(cs$ownership) #3312
## [1] 3312
# separate public and private colleges into variables
cs_public = cs[cs$ownership=="Public",]
cs_private = cs[cs$ownership!="Public",]
#prop.table(, margin=1)
#mosaicplot() ?mosaic, titanic dataset
#ggplot2
#cowplot
# graphically display proportions of degrees for private and public colleges
public_highest_deg_table = table(cs_public$highest_degree)/sum(table(cs_public$highest_degree))
private_highest_deg_table = table(cs_private$highest_degree)/sum(table(cs_private$highest_degree))
par(mfrow=c(1,2))
barplot(public_highest_deg_table, main = "Degree Proportion in Public College", ylim=c(0,0.8))
barplot(private_highest_deg_table, main = "Degree Proportion in Private College", ylim=c(0,0.8))

#prop.table(table(cs$ownership,cs$highest_degree),margin=1)
#Public colleges tend to have a much higher ratio of having a graduate degree as the
#highest degree offerred in comparison to private colleges. Also, public schools don't
#offer Certificate programs as the highest degree available.
# 5
# mean, median, and decile for undergraduate populations
ugmean <- mean(cs$undergrad_pop,na.rm = TRUE) #3599.502
ugmed <- median(cs$undergrad_pop,na.rm = TRUE) #1295
decile <- quantile(cs$undergrad_pop, prob = seq(0,1,length=11),type = 5,na.rm = TRUE)
par(mfrow=c(1,1))
hist(cs$undergrad_pop, main = "Histogram of undergraduate population", xlab = "Undergraduate Population")
abline(v = ugmean, col='red', lwd=5)
abline(v = decile, col='pink', lwd=2)
abline(v = ugmed, col='blue', lwd=5)
legend(x=120000, y=2400, c("mean", "median", "decile"), col = c("red", "blue", "pink"), lty=c(1,1,1))

#There is an extreme outlier, the Universit of Phoenix, Arizona whose population of over
#150,000 students creates an extreme skew to the histogram.
# testing for without the previous outlier, little difference in overall skew
cs1 = cs[-which(cs$undergrad_pop==166816),] # removes outlier
ugmean <- mean(cs1$undergrad_pop,na.rm = TRUE) #3599.502
ugmed <- median(cs1$undergrad_pop,na.rm = TRUE) #1295
decile <- quantile(cs1$undergrad_pop, prob = seq(0,1,length=11),type = 5,na.rm = TRUE)
par(mfrow=c(1,1))
hist(cs1$undergrad_pop, main = "Histogram of undergraduate population")
abline(v = ugmean, col='red', lwd=5)
abline(v = decile, col='pink', lwd=2)
abline(v = ugmed, col='blue', lwd=5)
legend(x=120000, y=2400, c("mean", "median", "decile"), col = c("red", "blue", "pink"), lty=c(1,1,1))

# 6
# 5 most populous states: California, Texas, New York, Illinois, Florida
top = cs[cs$state %in% c("CA", "TX", "NY", "IL", "FL"),]
# alternate method to drop NA levels
#top$state = factor(as.character(top$state))
top$state <- droplevels(top$state)
boxplot(top$tuition~top$state, main = "Boxplot of 5 Most Populous States", ylab = "Tuition per State")

#All of the states tend to have a skew towards higher tuition. New York seems to have
#both the highest and lowest tuition. Florida and Texas tend to a greater number of outliers
#than the other states. Texas and Flordia also seem to have a lower tuition than the others.
# 7
# part a
# name of university with largest avg_sat
# inefficient code
#max(cs$avg_sat,na.rm = TRUE) #1534
#which(cs$avg_sat %in% 1534) #105
#cs$name[105] #California Institute of Technology
#which(cs$name=="California Institute of Technology")
# efficient code
cs[which.max(cs$avg_sat), 'name'] #California Institute of Technology
## [1] "California Institute of Technology"
# part b
# largest university & open admissions
# inefficient code
#max(cs$undergrad_pop,na.rm = TRUE) #166816
#which(cs$undergrad_pop %in% 166816) #2371
#cs$name[2371] #University of Phoenix-Online Campus
#cs$open_admissions[2371] #True
# efficient code
cs[which.max(cs$undergrad_pop), c('name', 'open_admissions')]
## name open_admissions
## 2371 University of Phoenix-Online Campus TRUE
# part c
# zip code of smallest avg_family_inc for public schools
publicUni = subset(cs,ownership=="Public")
which.min(publicUni$avg_family_inc)
## [1] 348
publicUni$zip[348] #11101
## [1] "11101"
publicUni$name[348] #CUNY School of Law
## [1] "CUNY School of Law"
# part d
# also largest grad_pop (referring to part b)
# inefficient code
#cs$grad_pop[2371] #41900
#max(cs$grad_pop,na.rm = TRUE) #42811
#which(cs$grad_pop %in% 42811) #248
#cs$name[248] #Walden University
#No, Walden University has the largest grad population.
# efficient code
cs[which.max(cs$undergrad_pop), 'grad_pop'] == max(cs$grad_pop, na.rm=TRUE)
## [1] FALSE
# 8
# subset For Profit & Bachelor primary degree schools
sub8 = subset(cs, ownership=="For Profit" & primary_degree=="Bachelor")
money_per_student <- sub8[,c('revenue_per_student', 'spend_per_student')]
# part a
plot(money_per_student, main = "Revenue versus Spending per Student", xlab = "Revenue per student"
, ylab = "Spending per student")

#There are a few outliers which may violate the assumptions of linearity.
# part b
# create new variable to use
sub8 <- subset(cs, ownership=="For Profit" & primary_degree=="Bachelor")
sub8[is.na(sub8)] <- 0 # set NA's to 0
net_income <- (sub8$revenue_per_student - sub8$spend_per_student) # net income for colleges
total_net_income <- (net_income*(sub8$undergrad_pop + sub8$grad_pop)) # total net income
# order sub8, and then choose top 5
sub8_top = sub8[order(total_net_income, decreasing=TRUE),]
sub8_top5 = sub8_top[1:5,] #University of Phoenix-Online Campus, Ashford University,
#Capella University, Grand Canyon University, Kaplan University-Davenport Campus
# plot net income and number of students
# create total student population, net income, and total net income categories for sub8_top5
sub8_top5$student_pop <- sub8_top5$grad_pop + sub8_top5$undergrad_pop
sub8_top5$net_income <- (sub8_top5$revenue_per_student - sub8_top5$spend_per_student)
sub8_top5$total_net_income <- (sub8_top5$net_income*(sub8_top5$undergrad_pop + sub8_top5$grad_pop))
plot(sub8_top5$student_pop, sub8_top5$total_net_income, xlab = "Student population",
ylab = "Total income per student", main = "Top 5 Total Net Income for Colleges",
pch=1:5)
legend("topleft", sub8_top5$name, pch=1:5)

# 9
plot(cs[,c("avg_sat", "admission")], main = "Average SAT versus Admissions", xlab = "Average SAT",
ylab = "Admissions")

#Around avg_sat = 1200, the admission begins to narrow.
# split according to avg_sat >= 1200 and admission <= 0.4
group <- ifelse(cs$avg_sat >= 1200 & cs$admission <= 0.4, "High_SAT", "Low_SAT")
group2 <- ifelse(is.na(group),"Low_SAT",group)
cs$group <- group2
boxplot(cs$med_10yr_salary~cs$group, ylab = "Median 10 year salary", main = "Median Salary for High/Low SAT")

# The 10 year median salary for students graduating from High SAT collegs tend to earn more than
# those with lower SAT's. There are still outliers in the low SAT colleges which make it so that
# their students can also make higher salaries. It is still more common however for high SAT college
# graduates to be earning more than their counterparts.
cs$race_aw <- cs$race_asian + cs$race_white
boxplot(cs$race_aw~group, ylab = "Asian and White Proportion", main = "Asian and White Combination for High/Low SAT")

# Colleges which have lower proportions of Asian and Whites can also have lower SAT scores. In High SAT average
# schools, this sort of pattern is not common and doesn't become apparent.
cs$grad_proportion <- (cs$grad_pop/(cs$grad_pop + cs$undergrad_pop))
boxplot(cs$grad_proportion~group, ylab = "Graduate Student Proportion",
main = "SAT Scores According to Graduate Student Proportion")

lsat=subset(cs, cs$group=="Low_SAT") #low SAT's
lsat$grad_proportion <- (lsat$grad_pop/(lsat$grad_pop + lsat$undergrad_pop))
lsat[order(lsat$grad_proportion, decreasing = TRUE),][1:5,]
## unit_id ope_id main_campus branches open_admissions
## 93 109086 020992 TRUE 1 FALSE
## 182 117672 001229 TRUE 1 FALSE
## 1000 173683 040443 TRUE 1 FALSE
## 2571 439394 034297 TRUE 1 FALSE
## 43 103778 001070 TRUE 2 FALSE
## name
## 93 American Conservatory Theater
## 182 Southern California University of Health Sciences
## 1000 Hazelden Betty Ford Graduate School of Addiction Studies
## 2571 East West College of Natural Medicine
## 43 Thunderbird School of Global Management
## city state zip online_only primary_degree
## 93 San Francisco CA 94108-5834 FALSE Certificate
## 182 Whittier CA 90604 FALSE Graduate
## 1000 Center City MN 55012-0011 FALSE Certificate
## 2571 Sarasota FL 34234-0000 FALSE Graduate
## 43 Glendale AZ 85306-6000 FALSE Certificate
## highest_degree ownership avg_sat undergrad_pop grad_pop cost tuition
## 93 Graduate Nonprofit NA 0 31 NA NA
## 182 Graduate Nonprofit NA 0 564 NA NA
## 1000 Graduate Nonprofit NA 0 90 NA NA
## 2571 Graduate For Profit NA 0 123 NA NA
## 43 Graduate Nonprofit NA 2 1020 NA NA
## tuition_nonresident revenue_per_student spend_per_student
## 93 NA 31619 21970
## 182 NA 20054 10158
## 1000 NA 20791 4983
## 2571 NA 8592 4901
## 43 NA 62758 53366
## avg_faculty_salary ft_faculty admission retention completion fed_loan
## 93 8434 1.0000 NA NA NA NA
## 182 5620 0.4188 NA NA NA NA
## 1000 6337 NA NA NA NA 0
## 2571 4287 NA NA NA NA NA
## 43 10953 0.6027 NA NA NA 0
## pell_grant avg_family_inc med_family_inc avg_10yr_salary
## 93 NA NA NA NA
## 182 NA 36583.54 31521.0 NA
## 1000 0 NA NA NA
## 2571 NA 30865.32 12000.0 NA
## 43 0 25728.15 22590.5 136600
## sd_10yr_salary med_10yr_salary med_debt med_debt_withdraw
## 93 NA NA NA NA
## 182 NA NA 8144 4541
## 1000 NA NA NA NA
## 2571 NA NA NA NA
## 43 122800 111600 8000 NA
## default_3yr_rate repay_5yr_rate_withdraw repay_5yr_rate avg_entry_age
## 93 0.023 NA NA NA
## 182 0.032 NA NA 33.77143
## 1000 0.018 NA NA NA
## 2571 0.023 NA NA 37.57895
## 43 0.012 NA NA 30.33871
## veteran first_gen male female race_white race_black race_hispanic
## 93 NA NA 0 0 0 0 0
## 182 NA NA 0 0 0 0 0
## 1000 NA NA 0 0 0 0 0
## 2571 NA NA 0 0 0 0 0
## 43 NA NA 1 0 1 0 0
## race_asian race_native race_pacific net_cost race_other group
## 93 0 0 0 NA 0 Low_SAT
## 182 0 0 0 NA 0 Low_SAT
## 1000 0 0 0 NA 0 Low_SAT
## 2571 0 0 0 NA 0 Low_SAT
## 43 0 0 0 NA 0 Low_SAT
## race_aw grad_proportion
## 93 0 1.0000000
## 182 0 1.0000000
## 1000 0 1.0000000
## 2571 0 1.0000000
## 43 1 0.9980431
# It follows that colleges with higher SAT's will have a greater proportion of Graduate students
# than colleges that have a lower SAT average. However, there are some outliers in the lower SAT
# range. The exceptional outliers in the low SAT group seem to come from colleges which are
# actually focused on giving graduate level degrees, which explains how their student population
# can have both lower SAT's on average while still consisting mainly of graduate students.
# part C
mosaicplot(~ group + open_admissions, data = cs, main = "Open Admissions vs. Group",
xlab = "Group", ylab = "Open Admissions")

table(as.character(cs$group), as.character(cs$open_admissions))
##
## FALSE TRUE
## High_SAT 83 0
## Low_SAT 2445 784
# examine table of 'group' and 'open_admissions'
subgroup = cs$group
subgroup[is.na(subgroup)] = "NA"
suboa = cs$open_admissions
suboa[is.na(suboa)] = "NA"
table(suboa,subgroup)
## subgroup
## suboa High_SAT Low_SAT
## FALSE 83 2445
## TRUE 0 784
subas = cs$avg_sat
subas[is.na(subas)] = "NA"
hist(cs$avg_sat)

# Apparently universities with open admissions also lack any average SAT scores. This makes sense
# since open admissions universities only require a high school diploma or GED in order to become
# accepted. That means there is no need for any of them to include average SAT scores. In the mosaic
# plot, the majority of open admissions which are False have low SAT averages, so it seems that most
# SAT scores are low in general. After looking at the histogram of the SAT scores, this also seems to
# be the case.
# Open admissions as a variable seems to imply that they are colleges which don't require any SAT scores.
# This means that they
# According to statistics on the meaning of independence, a random variable is independent if
# P(A|B) = P(A). From the mosaic plot, it shows that there are only False open_admissions for
# any of the groups. However, P(F|Low SAT) = 1, and P(F|High SAT) = 1 also. Since P(A|B) = P(A), that
# implies that P(A) and P(B) are independent. That means that open_admissions is independent of group.
# In general this may not seem to be a logical or sensible statement, that statement being that
# open admissions and high/low SAT are independent. Yet it is for the reason that in the dataset
# if open admissions have a value of True or False, this implies that SAT scores are not included. So
# it doesn't necessarily make sense to have this conclusion, as it may come from examining an imperfect
# dataset.
# part C, part B
mosaicplot(~ group + main_campus, data = cs, main = "Main Campus vs. Group",
xlab = "Group", ylab = "Main Campus")

table(as.character(cs$group), as.character(cs$main_campus)) # High_SAT FALSE = 0
##
## FALSE TRUE
## High_SAT 0 83
## Low_SAT 881 2348
# It seems that if the college is a main campus, then it has a lower proportion of higher SAT scores
# in comparison to lower SAT scores. However, it is also the same for when the college is not the
# main campus. The ratio of high/low SAT is about similar whether the college is a main campus or not.
# Therefore it would make sense to conclude that the main campus and high/low SAT are independent.
# Or in other words, the fact that a college is a main campus doesn't change whether or not the
# college has a high or low SAT.
# part C, part C
mosaicplot(~ group + ownership, data = cs, main = "Ownership vs. Group",
xlab = "Group", ylab = "Ownership")

table(as.character(cs$group), as.character(cs$ownership)) # High_SAT For Profit = 0
##
## For Profit Nonprofit Public
## High_SAT 0 72 11
## Low_SAT 886 1638 705
# Public seems to have somewhat lower SAT's in comparison to Nonprofit Private colleges. The number
# of For Profit colleges are rather low, so it doesn't seem to have a huge impact on whether or not
# they have high or low SAT. It would make sense to think that the ownership level, mainly being either
# public or nonprofit is somewhat dependent on there being a high or low SAT score.
# part C, part D
branches_counts = ifelse(cs$branches == 1, "one branches", "more than one branches")
mosaicplot(~ group + branches_counts, data=cs, main = "Branches Count vs. Group",
xlab = "Group", ylab = "Branches Counts")

# According to the mosaic plot, it doesn't seem to matter whether or not the college has
# one or many branches. Either way the ratio of high to low SAT remains rather constant. Therefore
# it makes sense to conclude that group is independent of the number of branches.
# 10
# part A
plot(cs$avg_family_inc, cs$avg_10yr_salary, main = "Avg. Family Income vs. Avg. 10 yr. Salary",
xlab = "Avg. Family Income", ylab = "Avg. 10 yr. Salary")
abline(lm(avg_10yr_salary~avg_family_inc, data=cs))

outliers = cs[cs$avg_10yr_salary > 150000 & !is.na(cs$avg_10yr_salary) & !is.na(cs$avg_family_inc),]
dim(outliers) # 9 observations
## [1] 9 54
outliers$name
## [1] "University of Massachusetts Medical School Worcester"
## [2] "Icahn School of Medicine at Mount Sinai"
## [3] "SUNY Downstate Medical Center"
## [4] "Northeast Ohio Medical University"
## [5] "Philadelphia College of Osteopathic Medicine"
## [6] "University of North Texas Health Science Center"
## [7] "West Virginia School of Osteopathic Medicine"
## [8] "Medical College of Wisconsin"
## [9] "Louisiana State University Health Sciences Center-Shreveport"
outliers
## unit_id ope_id main_campus branches open_admissions
## 891 166708 009756 TRUE 1 FALSE
## 1331 193405 007026 TRUE 1 FALSE
## 1403 196255 002839 TRUE 1 FALSE
## 1577 204477 024544 TRUE 1 FALSE
## 1789 215123 003352 TRUE 1 FALSE
## 2030 228909 009768 TRUE 1 FALSE
## 2180 237880 011245 TRUE 1 FALSE
## 2204 239169 024535 TRUE 1 FALSE
## 2536 435000 008067 TRUE 1 FALSE
## name
## 891 University of Massachusetts Medical School Worcester
## 1331 Icahn School of Medicine at Mount Sinai
## 1403 SUNY Downstate Medical Center
## 1577 Northeast Ohio Medical University
## 1789 Philadelphia College of Osteopathic Medicine
## 2030 University of North Texas Health Science Center
## 2180 West Virginia School of Osteopathic Medicine
## 2204 Medical College of Wisconsin
## 2536 Louisiana State University Health Sciences Center-Shreveport
## city state zip online_only primary_degree
## 891 Worcester MA 01655 FALSE Graduate
## 1331 New York NY 10029-6574 FALSE Graduate
## 1403 Brooklyn NY 11203-2098 FALSE Bachelor
## 1577 Rootstown OH 44272-0095 FALSE Graduate
## 1789 Philadelphia PA 19131 FALSE Graduate
## 2030 Fort Worth TX 76107-2699 FALSE Graduate
## 2180 Lewisburg WV 24901 FALSE Graduate
## 2204 Milwaukee WI 53226-0509 FALSE Graduate
## 2536 Shreveport LA 71103 FALSE Bachelor
## highest_degree ownership avg_sat undergrad_pop grad_pop cost tuition
## 891 Graduate Public NA NA 1161 NA NA
## 1331 Graduate Nonprofit NA NA 1090 NA NA
## 1403 Graduate Public NA 346 1507 NA NA
## 1577 Graduate Public NA NA 832 NA NA
## 1789 Graduate Nonprofit NA NA 2736 NA NA
## 2030 Graduate Public NA NA 2142 NA NA
## 2180 Graduate Public NA NA 817 NA NA
## 2204 Graduate Nonprofit NA NA 1212 NA NA
## 2536 Graduate Public NA 44 812 NA NA
## tuition_nonresident revenue_per_student spend_per_student
## 891 NA 12759 54982
## 1331 NA 27476 25776
## 1403 NA 18956 66244
## 1577 NA 32181 26496
## 1789 NA 32107 16280
## 2030 NA 9766 17864
## 2180 NA 37958 20006
## 2204 NA 28887 46284
## 2536 NA 12256 57341
## avg_faculty_salary ft_faculty admission retention completion fed_loan
## 891 8823 0.7273 NA NA NA NA
## 1331 NA NA NA NA NA NA
## 1403 11620 0.8395 NA NA NA 0.4943
## 1577 8678 1.0000 NA NA NA NA
## 1789 NA NA NA NA NA NA
## 2030 NA NA NA NA NA NA
## 2180 NA NA NA NA NA NA
## 2204 NA NA NA NA NA NA
## 2536 5584 0.8929 NA NA NA 0.4400
## pell_grant avg_family_inc med_family_inc avg_10yr_salary
## 891 NA 21000.767 15361.0 214300
## 1331 NA 12484.354 2875.0 212600
## 1403 0.2045 36315.887 20755.0 160900
## 1577 NA 7882.177 4024.5 218300
## 1789 NA 3720.608 393.5 193800
## 2030 NA 19518.312 19352.5 197800
## 2180 NA 15011.905 7003.0 235000
## 2204 NA 9717.391 3859.5 250000
## 2536 0.2000 45746.333 26061.5 167300
## sd_10yr_salary med_10yr_salary med_debt med_debt_withdraw
## 891 126400 183400 NA NA
## 1331 128700 191300 NA NA
## 1403 113800 125900 13938 8261
## 1577 108200 199200 3010 2375
## 1789 119500 177600 3000 2000
## 2030 149500 163800 NA NA
## 2180 129300 220700 2000 NA
## 2204 172700 250000 NA NA
## 2536 128600 122500 NA NA
## default_3yr_rate repay_5yr_rate_withdraw repay_5yr_rate avg_entry_age
## 891 0.006 NA NA 26.57534
## 1331 0.015 NA NA 25.50000
## 1403 0.002 0.9025974 NA 29.21127
## 1577 0.000 NA NA 24.34375
## 1789 0.026 NA NA 27.95556
## 2030 0.031 NA NA 30.75000
## 2180 0.004 NA NA 26.70538
## 2204 0.000 NA NA 26.15625
## 2536 0.004 0.8750000 NA 26.77778
## veteran first_gen male female race_white race_black race_hispanic
## 891 NA NA NA NA NA NA NA
## 1331 NA NA NA NA NA NA NA
## 1403 NA 0.3723849 0.1965 0.8035 0.3988 0.2168 0.1156
## 1577 NA NA NA NA NA NA NA
## 1789 NA 0.1461988 NA NA NA NA NA
## 2030 NA NA NA NA NA NA NA
## 2180 NA NA NA NA NA NA NA
## 2204 NA NA NA NA NA NA NA
## 2536 NA NA 0.2727 0.7273 0.4773 0.2500 0.0227
## race_asian race_native race_pacific net_cost race_other group
## 891 NA NA NA NA NA Low_SAT
## 1331 NA NA NA NA NA Low_SAT
## 1403 0.1676 0.0058 0.0029 NA 0.0926 Low_SAT
## 1577 NA NA NA NA NA Low_SAT
## 1789 NA NA NA NA NA Low_SAT
## 2030 NA NA NA NA NA Low_SAT
## 2180 NA NA NA NA NA Low_SAT
## 2204 NA NA NA NA NA Low_SAT
## 2536 0.0682 0.0000 0.0000 NA 0.1818 Low_SAT
## race_aw grad_proportion
## 891 NA NA
## 1331 NA NA
## 1403 0.5664 0.8132758
## 1577 NA NA
## 1789 NA NA
## 2030 NA NA
## 2180 NA NA
## 2204 NA NA
## 2536 0.5455 0.9485981
# It seems that there are certain outliers to the top left, roughly above 150,000 avg 10 yr salary
# that are removed from the linear direction of the rest of the data. After looking into them,
# it seems that the majority of them have a Graduate degree as their primary degree, and all of
# them at least offer a Graduate degree as their highest degree available. Another interesting
# note is that after looking at their names, they are all some type of medical college.
# part B
# check the number of factor levels
levels(cs$highest_degree)
## [1] "Other" "Certificate" "Associate" "Bachelor" "Graduate"
# optional R/Stats practice, test the new categorical variable's factor levels to see
# if there's an improvement of fit
plot(cs$avg_family_inc, cs$avg_10yr_salary, main = "Avg. Family Income vs. Avg. 10 yr. Salary",
xlab = "Avg. Family Income", ylab = "Avg. 10 yr. Salary")
fit1 = lm(avg_10yr_salary~avg_family_inc + highest_degree, data=cs)
fit1$coef
## (Intercept) avg_family_inc
## 3.874917e+04 2.177299e-01
## highest_degreeCertificate highest_degreeAssociate
## -1.486046e+04 -9.377607e+03
## highest_degreeBachelor highest_degreeGraduate
## -8.754538e+03 -1.313234e+03
abline(a = fit1$coef[1], b = fit1$coef[2], col="orange")
abline(a = fit1$coef[1]+fit1$coef[3], b = fit1$coef[2], lty=2, col="pink")
abline(a = fit1$coef[1]+fit1$coef[4], b = fit1$coef[2], lty=3, col="green")
abline(a = fit1$coef[1]+fit1$coef[5], b = fit1$coef[2], lty=4, col="blue")
abline(a = fit1$coef[1]+fit1$coef[6], b = fit1$coef[2], lty=5, col="red")
abline(lm(avg_10yr_salary~avg_family_inc, data=cs))
legend("topright", c("other", "Certificate", "Associate", "Bachelor", "Graduate",
"Original regression line"),
col = c("orange", "pink", "green", "blue", "red", "black"), lty = c(1,2,3,4,5,1))

# Orange and red seem to be a group which are better at estimating the line due to them taking into
# account the data which are higher up. The lines also seem somewhat far apart from each other,
# therefore, adding categorical variables are useful. Some of the categorical variables however
# are quite similar and close to each other, therefore they aren't as useful in predicting the data.
# Since orange and red are these two, it makes sense that 'Other' and 'Graduate' are useful levels
# for predicting the average 10 year salary.