College Data Analytics

setwd("C:/Users/qizhe/Desktop/STA 141A/HW1")

cs <- readRDS("college_scorecard_2013.rds")

# 1 Total of 3312 observations, 3312 different colleges.
dim(cs) # 3312 observations

## [1] 3312   51

# number of colleges
sum(cs$main_campus) #2431 main campuses

## [1] 2431

# take out those records with duplicated school names
#duplicated_records = cs[cs$name %in% cs$name[duplicated(cs$name)],]
#ordered_duplicated_records = duplicated_records[order(duplicated_records$name),]
#ordered_duplicated_records[,1:7]
#Problem of duplicate names, multiple branches, can't tell where to discern
#individual college existence

# 2
# sapply()
tab1 <- sapply(cs, class)
table(tab1)

## tab1
## character    factor   integer   logical   numeric 
##         4         4        15         3        25

# examine each variable type using str()
str(cs)

## 'data.frame':    3312 obs. of  51 variables:
##  $ unit_id                : int  100654 100663 100690 100706 100724 100751 100812 100830 100858 100937 ...
##  $ ope_id                 : chr  "001002" "001052" "025034" "001055" ...
##  $ main_campus            : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ branches               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ open_admissions        : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ name                   : chr  "Alabama A & M University" "University of Alabama at Birmingham" "Amridge University" "University of Alabama in Huntsville" ...
##  $ city                   : chr  "Normal" "Birmingham" "Montgomery" "Huntsville" ...
##  $ state                  : Factor w/ 56 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ zip                    : chr  "35762" "35294-0110" "36117-3553" "35899" ...
##  $ online_only            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ primary_degree         : Factor w/ 5 levels "Other","Certificate",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ highest_degree         : Factor w/ 5 levels "Other","Certificate",..: 5 5 5 5 5 5 4 5 5 4 ...
##  $ ownership              : Factor w/ 3 levels "Public","Nonprofit",..: 1 1 2 1 1 1 1 1 1 2 ...
##  $ avg_sat                : int  823 1146 NA 1180 830 1171 NA 970 1215 1177 ...
##  $ undergrad_pop          : int  4051 11200 322 5525 5354 28692 2999 4322 19761 1181 ...
##  $ grad_pop               : int  969 7066 309 1680 719 5312 NA 762 5065 NA ...
##  $ cost                   : int  18888 19990 12300 20306 17400 26717 NA 16556 23788 44167 ...
##  $ tuition                : int  7182 7206 6870 9192 8720 9450 NA 8750 9852 30690 ...
##  $ tuition_nonresident    : int  12774 16398 6870 21506 15656 23950 NA 24950 26364 30690 ...
##  $ revenue_per_student    : int  9063 9033 12057 8322 7813 12198 5094 7625 13186 11928 ...
##  $ spend_per_student      : int  7459 17208 5123 9352 7393 9817 6176 6817 11324 9990 ...
##  $ avg_faculty_salary     : int  7079 10170 3849 9341 6557 9605 7672 7173 9429 7513 ...
##  $ ft_faculty             : num  0.886 0.911 0.672 0.655 0.664 ...
##  $ admission              : num  0.899 0.867 NA 0.806 0.512 ...
##  $ retention              : num  0.631 0.802 0.375 0.81 0.622 ...
##  $ completion             : num  0.291 0.538 0.667 0.483 0.252 ...
##  $ fed_loan               : num  0.82 0.54 0.763 0.473 0.874 ...
##  $ pell_grant             : num  0.712 0.35 0.684 0.328 0.827 ...
##  $ avg_family_inc         : num  29590 50153 22714 51013 28113 ...
##  $ med_family_inc         : num  21281 35163 16619 33787 20269 ...
##  $ avg_10yr_salary        : int  34300 46400 46100 50500 29500 49900 42200 37900 54100 51000 ...
##  $ sd_10yr_salary         : int  25300 36300 34900 32700 20500 42600 28000 27800 41800 36600 ...
##  $ med_10yr_salary        : int  29900 40200 40100 45600 26700 42700 38500 33500 47100 45600 ...
##  $ med_debt               : num  30968 21282 24897 23106 32000 ...
##  $ med_debt_withdraw      : num  9500 8925 6731 8068 9500 ...
##  $ default_3yr_rate       : num  0.163 0.08 0.089 0.077 0.191 0.081 0.094 0.131 0.061 0.099 ...
##  $ repay_5yr_rate_withdraw: num  0.324 0.606 0.449 0.613 0.261 ...
##  $ repay_5yr_rate         : num  0.53 0.731 0.529 0.794 0.446 ...
##  $ avg_entry_age          : num  20.5 23.5 33.9 24.1 20.6 ...
##  $ veteran                : num  NA 0.00354 NA 0.01013 0.00353 ...
##  $ first_gen              : num  0.388 0.336 0.54 0.326 0.374 ...
##  $ male                   : num  0.486 0.413 0.391 0.558 0.408 ...
##  $ female                 : num  0.514 0.587 0.609 0.442 0.592 ...
##  $ race_white             : num  0.0279 0.5987 0.2919 0.7012 0.0161 ...
##  $ race_black             : num  0.95 0.259 0.422 0.131 0.928 ...
##  $ race_hispanic          : num  0.0089 0.0258 0.0093 0.0338 0.0114 0.0313 0.0213 0.0079 0.0253 0.0305 ...
##  $ race_asian             : num  0.0022 0.0518 0.0031 0.0364 0.0015 0.0112 0.0047 0.0245 0.0213 0.039 ...
##  $ race_native            : num  0.0012 0.0026 0.0031 0.0145 0.0009 0.0044 0.019 0.0037 0.0077 0.0102 ...
##  $ race_pacific           : num  0.001 0.0007 0.0031 0.0002 0.0007 0.0011 0.0007 0.0002 0 0 ...
##  $ net_cost               : int  13415 14805 7455 17520 11936 21513 NA 11915 17541 21406 ...
##  $ race_other             : num  0.0086 0.0614 0.2671 0.0828 0.0409 ...

#51 features (variables) (50 if remove unit_id), 12 categorical, 39 numerical (ordinal?) double check
#numerical (discrete): branches, undergrad_pop, grad_pop
#numerical (continuous): avg_sat, cost, tuition, tuition_nonresident, revenue_per_student, spend_per_student
#avg_faculty_salary, ft_faculty, admission, retention, completion, fed_loan, pell_grant, avg_family_inc, med_family_inc
#avg_10yr_salary, sd_10yr_salary, med_10yr_salary, med_debt, med_debt_withdraw, default_3yr_rate, repay_5yr_rate_withdraw
#repay_5yr_rate, avg_entry_age, veteran, first_gen, male, female, race_white, race_black, race_hispanic, race_asian, race_native
#race_pacific, net_cost, race_other
#categorical : unit_id, ope_id, name, city, state, zip
#ordinal variable: primary_degree, highest_degree
#nominal variable: ownership
#nominal, ratio, interval?
#logical variable (double check): main_campus, open_admissions, online_only, 

# duplicated ope_id records
#duplicated_record_ope_id = cs[cs$ope_id %in% cs$ope_id[duplicated(cs$ope_id)],]
#order_duplicated_record_ope_id = duplicated_record_ope_id[order(duplicated_record_ope_id$ope_id),]

# 3
# number of NA's
sum(is.na(cs)) # 23197

## [1] 23197

# find number of NA's per variable, find the largest
count_na_by_features = sapply(cs, function(x) sum(is.na(x)))
t(t(count_na_by_features)) # avg_sat is largest with 1,923

##                         [,1]
## unit_id                    0
## ope_id                     0
## main_campus                0
## branches                   0
## open_admissions            0
## name                       0
## city                       0
## state                      0
## zip                        0
## online_only              199
## primary_degree             0
## highest_degree             0
## ownership                  0
## avg_sat                 1923
## undergrad_pop            490
## grad_pop                1149
## cost                     717
## tuition                  472
## tuition_nonresident      635
## revenue_per_student      201
## spend_per_student        201
## avg_faculty_salary       317
## ft_faculty               558
## admission               1416
## retention                966
## completion               864
## fed_loan                 510
## pell_grant               510
## avg_family_inc           267
## med_family_inc           267
## avg_10yr_salary          480
## sd_10yr_salary           480
## med_10yr_salary          480
## med_debt                 487
## med_debt_withdraw        489
## default_3yr_rate         183
## repay_5yr_rate_withdraw  538
## repay_5yr_rate           792
## avg_entry_age            267
## veteran                 1735
## first_gen                505
## male                     490
## female                   490
## race_white               490
## race_black               490
## race_hispanic            490
## race_asian               490
## race_native              490
## race_pacific             490
## net_cost                 689
## race_other               490

#different types of NA's, intentional, random
#turn into a heatmap, no builtin heatmap function

# Patterns
#total 23197 NA's, highest NA is avg_sat with 1923
#revenue_per_student and spend_per_student both have 201 NA's
#fed_loan and pell_grant both have 510 NA's
#avg_family_inc and med_family_inc both have 267 NA's
#avg_10yr_salary, sd_10yr_salary, med_10yr_salary each have 480 NA's When one of the values are missing, others can't be determined also.
#med_debt and med_debt_withdraw are very close at 487/489 NA's
#male, female, race_white, race_black, race_hispanic, race_asian, race_native,
#race_pacific, and race_other each have 490 NA's School's that don't have gender information also lack ethnic information.

#can plot the NA's to determine pattern

#look into each function, e.g., (image(), colSums())

#finish the rest of Swirl apply, basegraphics 9,10,11,15

#sort(NAcounts)
#?order
#?rank
#usually sort + order

NA_counts<-colSums(is.na(cs))
#NA_counts<-as.matrix(NA_counts)
#NA_counts

# graphically display the NA's per feature
plot(NA_counts, main="Number of Missing Data per Feature",
     xlab = "Feature Index",
     ylab = "Number NA's", cex = 1, pch=15)
abline(h=c(500, 0), col="Green") # many around this region

# Optional techniques, additional resources
#summary(data)
#which.max(NAcounts)
#hist(NA_counts)
#find the 500's
#table(NA_counts)
#which(NA_counts==490)
#which(NA_counts...|...)
#factor in datacamp
#Roger peng youtube, brian caffo

# 4
# number of private versus public colleges
# public
length(which(cs$ownership=="Public")) #716

## [1] 716

# private aka For Profit and Nonprofit
length(which(cs$ownership!="Public")) #2,596

## [1] 2596

length(cs$ownership) #3312

## [1] 3312

# separate public and private colleges into variables
cs_public = cs[cs$ownership=="Public",]
cs_private = cs[cs$ownership!="Public",]

#prop.table(, margin=1)
#mosaicplot() ?mosaic, titanic dataset
#ggplot2
#cowplot

# graphically display proportions of degrees for private and public colleges
public_highest_deg_table = table(cs_public$highest_degree)/sum(table(cs_public$highest_degree))
private_highest_deg_table = table(cs_private$highest_degree)/sum(table(cs_private$highest_degree))
par(mfrow=c(1,2))
barplot(public_highest_deg_table, main = "Degree Proportion in Public College", ylim=c(0,0.8))
barplot(private_highest_deg_table, main = "Degree Proportion in Private College", ylim=c(0,0.8))

#prop.table(table(cs$ownership,cs$highest_degree),margin=1)

#Public colleges tend to have a much higher ratio of having a graduate degree as the
#highest degree offerred in comparison to private colleges. Also, public schools don't
#offer Certificate programs as the highest degree available.

# 5
# mean, median, and decile for undergraduate populations
ugmean <- mean(cs$undergrad_pop,na.rm = TRUE) #3599.502
ugmed <- median(cs$undergrad_pop,na.rm = TRUE) #1295
decile <- quantile(cs$undergrad_pop, prob = seq(0,1,length=11),type = 5,na.rm = TRUE)
par(mfrow=c(1,1))
hist(cs$undergrad_pop, main = "Histogram of undergraduate population", xlab = "Undergraduate Population")
abline(v = ugmean, col='red', lwd=5)
abline(v = decile, col='pink', lwd=2)
abline(v = ugmed, col='blue', lwd=5)
legend(x=120000, y=2400, c("mean", "median", "decile"), col = c("red", "blue", "pink"), lty=c(1,1,1))

#There is an extreme outlier, the Universit of Phoenix, Arizona whose population of over
#150,000 students creates an extreme skew to the histogram.

# testing for without the previous outlier, little difference in overall skew
cs1 = cs[-which(cs$undergrad_pop==166816),] # removes outlier
ugmean <- mean(cs1$undergrad_pop,na.rm = TRUE) #3599.502
ugmed <- median(cs1$undergrad_pop,na.rm = TRUE) #1295
decile <- quantile(cs1$undergrad_pop, prob = seq(0,1,length=11),type = 5,na.rm = TRUE)
par(mfrow=c(1,1))
hist(cs1$undergrad_pop, main = "Histogram of undergraduate population")
abline(v = ugmean, col='red', lwd=5)
abline(v = decile, col='pink', lwd=2)
abline(v = ugmed, col='blue', lwd=5)
legend(x=120000, y=2400, c("mean", "median", "decile"), col = c("red", "blue", "pink"), lty=c(1,1,1))

# 6
# 5 most populous states: California, Texas, New York, Illinois, Florida
top = cs[cs$state %in% c("CA", "TX", "NY", "IL", "FL"),]
# alternate method to drop NA levels
#top$state = factor(as.character(top$state))
top$state <- droplevels(top$state)
boxplot(top$tuition~top$state, main = "Boxplot of 5 Most Populous States", ylab = "Tuition per State")

#All of the states tend to have a skew towards higher tuition. New York seems to have
#both the highest and lowest tuition. Florida and Texas tend to a greater number of outliers
#than the other states. Texas and Flordia also seem to have a lower tuition than the others.

# 7
# part a
# name of university with largest avg_sat
# inefficient code
#max(cs$avg_sat,na.rm = TRUE) #1534
#which(cs$avg_sat %in% 1534) #105
#cs$name[105] #California Institute of Technology
#which(cs$name=="California Institute of Technology")
# efficient code
cs[which.max(cs$avg_sat), 'name'] #California Institute of Technology

## [1] "California Institute of Technology"

# part b
# largest university & open admissions
# inefficient code
#max(cs$undergrad_pop,na.rm = TRUE) #166816
#which(cs$undergrad_pop %in% 166816) #2371
#cs$name[2371] #University of Phoenix-Online Campus
#cs$open_admissions[2371] #True
# efficient code
cs[which.max(cs$undergrad_pop), c('name', 'open_admissions')]

##                                     name open_admissions
## 2371 University of Phoenix-Online Campus            TRUE

# part c
# zip code of smallest avg_family_inc for public schools
publicUni = subset(cs,ownership=="Public")
which.min(publicUni$avg_family_inc)

## [1] 348

publicUni$zip[348] #11101

## [1] "11101"

publicUni$name[348] #CUNY School of Law

## [1] "CUNY School of Law"

# part d
# also largest grad_pop (referring to part b)
# inefficient code
#cs$grad_pop[2371] #41900
#max(cs$grad_pop,na.rm = TRUE) #42811
#which(cs$grad_pop %in% 42811) #248
#cs$name[248] #Walden University
#No, Walden University has the largest grad population.
# efficient code
cs[which.max(cs$undergrad_pop), 'grad_pop'] == max(cs$grad_pop, na.rm=TRUE)

## [1] FALSE

# 8
# subset For Profit & Bachelor primary degree schools
sub8 = subset(cs, ownership=="For Profit" & primary_degree=="Bachelor")
money_per_student <- sub8[,c('revenue_per_student', 'spend_per_student')]

# part a
plot(money_per_student, main = "Revenue versus Spending per Student", xlab = "Revenue per student"
     , ylab = "Spending per student")

#There are a few outliers which may violate the assumptions of linearity.

# part b
# create new variable to use
sub8 <- subset(cs, ownership=="For Profit" & primary_degree=="Bachelor")
sub8[is.na(sub8)] <- 0 # set NA's to 0
net_income <- (sub8$revenue_per_student - sub8$spend_per_student) # net income for colleges
total_net_income <- (net_income*(sub8$undergrad_pop + sub8$grad_pop)) # total net income
# order sub8, and then choose top 5
sub8_top = sub8[order(total_net_income, decreasing=TRUE),]
sub8_top5 = sub8_top[1:5,] #University of Phoenix-Online Campus, Ashford University,
#Capella University, Grand Canyon University, Kaplan University-Davenport Campus

# plot net income and number of students
# create total student population, net income, and total net income categories for sub8_top5
sub8_top5$student_pop <- sub8_top5$grad_pop + sub8_top5$undergrad_pop
sub8_top5$net_income <- (sub8_top5$revenue_per_student - sub8_top5$spend_per_student)
sub8_top5$total_net_income <- (sub8_top5$net_income*(sub8_top5$undergrad_pop + sub8_top5$grad_pop))
plot(sub8_top5$student_pop, sub8_top5$total_net_income, xlab = "Student population", 
     ylab = "Total income per student", main = "Top 5 Total Net Income for Colleges", 
     pch=1:5)
legend("topleft", sub8_top5$name, pch=1:5)

# 9
plot(cs[,c("avg_sat", "admission")], main = "Average SAT versus Admissions", xlab = "Average SAT",
     ylab = "Admissions")

#Around avg_sat = 1200, the admission begins to narrow.

# split according to avg_sat >= 1200 and admission <= 0.4
group <- ifelse(cs$avg_sat >= 1200 & cs$admission <= 0.4, "High_SAT", "Low_SAT")
group2 <- ifelse(is.na(group),"Low_SAT",group)
cs$group <- group2
boxplot(cs$med_10yr_salary~cs$group, ylab = "Median 10 year salary", main = "Median Salary for High/Low SAT")

# The 10 year median salary for students graduating from High SAT collegs tend to earn more than
# those with lower SAT's. There are still outliers in the low SAT colleges which make it so that
# their students can also make higher salaries. It is still more common however for high SAT college
# graduates to be earning more than their counterparts.

cs$race_aw <- cs$race_asian + cs$race_white
boxplot(cs$race_aw~group, ylab = "Asian and White Proportion", main = "Asian and White Combination for High/Low SAT")

# Colleges which have lower proportions of Asian and Whites can also have lower SAT scores. In High SAT average
# schools, this sort of pattern is not common and doesn't become apparent.

cs$grad_proportion <- (cs$grad_pop/(cs$grad_pop + cs$undergrad_pop))
boxplot(cs$grad_proportion~group, ylab = "Graduate Student Proportion", 
        main = "SAT Scores According to Graduate Student Proportion")

lsat=subset(cs, cs$group=="Low_SAT") #low SAT's
lsat$grad_proportion <- (lsat$grad_pop/(lsat$grad_pop + lsat$undergrad_pop))
lsat[order(lsat$grad_proportion, decreasing = TRUE),][1:5,]

##      unit_id ope_id main_campus branches open_admissions
## 93    109086 020992        TRUE        1           FALSE
## 182   117672 001229        TRUE        1           FALSE
## 1000  173683 040443        TRUE        1           FALSE
## 2571  439394 034297        TRUE        1           FALSE
## 43    103778 001070        TRUE        2           FALSE
##                                                          name
## 93                              American Conservatory Theater
## 182         Southern California University of Health Sciences
## 1000 Hazelden Betty Ford Graduate School of Addiction Studies
## 2571                    East West College of Natural Medicine
## 43                    Thunderbird School of Global Management
##               city state        zip online_only primary_degree
## 93   San Francisco    CA 94108-5834       FALSE    Certificate
## 182       Whittier    CA      90604       FALSE       Graduate
## 1000   Center City    MN 55012-0011       FALSE    Certificate
## 2571      Sarasota    FL 34234-0000       FALSE       Graduate
## 43        Glendale    AZ 85306-6000       FALSE    Certificate
##      highest_degree  ownership avg_sat undergrad_pop grad_pop cost tuition
## 93         Graduate  Nonprofit      NA             0       31   NA      NA
## 182        Graduate  Nonprofit      NA             0      564   NA      NA
## 1000       Graduate  Nonprofit      NA             0       90   NA      NA
## 2571       Graduate For Profit      NA             0      123   NA      NA
## 43         Graduate  Nonprofit      NA             2     1020   NA      NA
##      tuition_nonresident revenue_per_student spend_per_student
## 93                    NA               31619             21970
## 182                   NA               20054             10158
## 1000                  NA               20791              4983
## 2571                  NA                8592              4901
## 43                    NA               62758             53366
##      avg_faculty_salary ft_faculty admission retention completion fed_loan
## 93                 8434     1.0000        NA        NA         NA       NA
## 182                5620     0.4188        NA        NA         NA       NA
## 1000               6337         NA        NA        NA         NA        0
## 2571               4287         NA        NA        NA         NA       NA
## 43                10953     0.6027        NA        NA         NA        0
##      pell_grant avg_family_inc med_family_inc avg_10yr_salary
## 93           NA             NA             NA              NA
## 182          NA       36583.54        31521.0              NA
## 1000          0             NA             NA              NA
## 2571         NA       30865.32        12000.0              NA
## 43            0       25728.15        22590.5          136600
##      sd_10yr_salary med_10yr_salary med_debt med_debt_withdraw
## 93               NA              NA       NA                NA
## 182              NA              NA     8144              4541
## 1000             NA              NA       NA                NA
## 2571             NA              NA       NA                NA
## 43           122800          111600     8000                NA
##      default_3yr_rate repay_5yr_rate_withdraw repay_5yr_rate avg_entry_age
## 93              0.023                      NA             NA            NA
## 182             0.032                      NA             NA      33.77143
## 1000            0.018                      NA             NA            NA
## 2571            0.023                      NA             NA      37.57895
## 43              0.012                      NA             NA      30.33871
##      veteran first_gen male female race_white race_black race_hispanic
## 93        NA        NA    0      0          0          0             0
## 182       NA        NA    0      0          0          0             0
## 1000      NA        NA    0      0          0          0             0
## 2571      NA        NA    0      0          0          0             0
## 43        NA        NA    1      0          1          0             0
##      race_asian race_native race_pacific net_cost race_other   group
## 93            0           0            0       NA          0 Low_SAT
## 182           0           0            0       NA          0 Low_SAT
## 1000          0           0            0       NA          0 Low_SAT
## 2571          0           0            0       NA          0 Low_SAT
## 43            0           0            0       NA          0 Low_SAT
##      race_aw grad_proportion
## 93         0       1.0000000
## 182        0       1.0000000
## 1000       0       1.0000000
## 2571       0       1.0000000
## 43         1       0.9980431

# It follows that colleges with higher SAT's will have a greater proportion of Graduate students
# than colleges that have a lower SAT average. However, there are some outliers in the lower SAT
# range. The exceptional outliers in the low SAT group seem to come from colleges which are
# actually focused on giving graduate level degrees, which explains how their student population
# can have both lower SAT's on average while still consisting mainly of graduate students.

# part C
mosaicplot(~  group + open_admissions, data = cs, main = "Open Admissions vs. Group",
           xlab = "Group", ylab = "Open Admissions")

table(as.character(cs$group), as.character(cs$open_admissions))

##           
##            FALSE TRUE
##   High_SAT    83    0
##   Low_SAT   2445  784

# examine table of 'group' and 'open_admissions'
subgroup = cs$group
subgroup[is.na(subgroup)] = "NA"
suboa = cs$open_admissions
suboa[is.na(suboa)] = "NA"
table(suboa,subgroup)

##        subgroup
## suboa   High_SAT Low_SAT
##   FALSE       83    2445
##   TRUE         0     784

subas = cs$avg_sat
subas[is.na(subas)] = "NA"
hist(cs$avg_sat)

# Apparently universities with open admissions also lack any average SAT scores. This makes sense
# since open admissions universities only require a high school diploma or GED in order to become
# accepted. That means there is no need for any of them to include average SAT scores. In the mosaic
# plot, the majority of open admissions which are False have low SAT averages, so it seems that most
# SAT scores are low in general. After looking at the histogram of the SAT scores, this also seems to
# be the case.
# Open admissions as a variable seems to imply that they are colleges which don't require any SAT scores.
# This means that they 

# According to statistics on the meaning of independence, a random variable is independent if 
# P(A|B) = P(A). From the mosaic plot, it shows that there are only False open_admissions for
# any of the groups. However, P(F|Low SAT) = 1, and P(F|High SAT) = 1 also. Since P(A|B) = P(A), that
# implies that P(A) and P(B) are independent. That means that open_admissions is independent of group.
# In general this may not seem to be a logical or sensible statement, that statement being that
# open admissions and high/low SAT are independent. Yet it is for the reason that in the dataset
# if open admissions have a value of True or False, this implies that SAT scores are not included. So
# it doesn't necessarily make sense to have this conclusion, as it may come from examining an imperfect
# dataset.

# part C, part B
mosaicplot(~  group + main_campus, data = cs, main = "Main Campus vs. Group",
           xlab = "Group", ylab = "Main Campus")

table(as.character(cs$group), as.character(cs$main_campus)) # High_SAT FALSE = 0

##           
##            FALSE TRUE
##   High_SAT     0   83
##   Low_SAT    881 2348

# It seems that if the college is a main campus, then it has a lower proportion of higher SAT scores
# in comparison to lower SAT scores. However, it is also the same for when the college is not the
# main campus. The ratio of high/low SAT is about similar whether the college is a main campus or not.
# Therefore it would make sense to conclude that the main campus and high/low SAT are independent.
# Or in other words, the fact that a college is a main campus doesn't change whether or not the
# college has a high or low SAT.

# part C, part C
mosaicplot(~  group + ownership, data = cs, main = "Ownership vs. Group",
           xlab = "Group", ylab = "Ownership")

table(as.character(cs$group), as.character(cs$ownership)) # High_SAT For Profit = 0

##           
##            For Profit Nonprofit Public
##   High_SAT          0        72     11
##   Low_SAT         886      1638    705

# Public seems to have somewhat lower SAT's in comparison to Nonprofit Private colleges. The number
# of For Profit colleges are rather low, so it doesn't seem to have a huge impact on whether or not
# they have high or low SAT. It would make sense to think that the ownership level, mainly being either
# public or nonprofit is somewhat dependent on there being a high or low SAT score.

# part C, part D
branches_counts = ifelse(cs$branches == 1, "one branches", "more than one branches")
mosaicplot(~ group + branches_counts, data=cs, main = "Branches Count vs. Group",
           xlab = "Group", ylab = "Branches Counts")

# According to the mosaic plot, it doesn't seem to matter whether or not the college has
# one or many branches. Either way the ratio of high to low SAT remains rather constant. Therefore
# it makes sense to conclude that group is independent of the number of branches.

# 10
# part A
plot(cs$avg_family_inc, cs$avg_10yr_salary, main = "Avg. Family Income vs. Avg. 10 yr. Salary",
     xlab = "Avg. Family Income", ylab = "Avg. 10 yr. Salary")
abline(lm(avg_10yr_salary~avg_family_inc, data=cs))

outliers = cs[cs$avg_10yr_salary > 150000 & !is.na(cs$avg_10yr_salary) & !is.na(cs$avg_family_inc),]
dim(outliers) # 9 observations

## [1]  9 54

outliers$name

## [1] "University of Massachusetts Medical School Worcester"        
## [2] "Icahn School of Medicine at Mount Sinai"                     
## [3] "SUNY Downstate Medical Center"                               
## [4] "Northeast Ohio Medical University"                           
## [5] "Philadelphia College of Osteopathic Medicine"                
## [6] "University of North Texas Health Science Center"             
## [7] "West Virginia School of Osteopathic Medicine"                
## [8] "Medical College of Wisconsin"                                
## [9] "Louisiana State University Health Sciences Center-Shreveport"

outliers

##      unit_id ope_id main_campus branches open_admissions
## 891   166708 009756        TRUE        1           FALSE
## 1331  193405 007026        TRUE        1           FALSE
## 1403  196255 002839        TRUE        1           FALSE
## 1577  204477 024544        TRUE        1           FALSE
## 1789  215123 003352        TRUE        1           FALSE
## 2030  228909 009768        TRUE        1           FALSE
## 2180  237880 011245        TRUE        1           FALSE
## 2204  239169 024535        TRUE        1           FALSE
## 2536  435000 008067        TRUE        1           FALSE
##                                                              name
## 891          University of Massachusetts Medical School Worcester
## 1331                      Icahn School of Medicine at Mount Sinai
## 1403                                SUNY Downstate Medical Center
## 1577                            Northeast Ohio Medical University
## 1789                 Philadelphia College of Osteopathic Medicine
## 2030              University of North Texas Health Science Center
## 2180                 West Virginia School of Osteopathic Medicine
## 2204                                 Medical College of Wisconsin
## 2536 Louisiana State University Health Sciences Center-Shreveport
##              city state        zip online_only primary_degree
## 891     Worcester    MA      01655       FALSE       Graduate
## 1331     New York    NY 10029-6574       FALSE       Graduate
## 1403     Brooklyn    NY 11203-2098       FALSE       Bachelor
## 1577    Rootstown    OH 44272-0095       FALSE       Graduate
## 1789 Philadelphia    PA      19131       FALSE       Graduate
## 2030   Fort Worth    TX 76107-2699       FALSE       Graduate
## 2180    Lewisburg    WV      24901       FALSE       Graduate
## 2204    Milwaukee    WI 53226-0509       FALSE       Graduate
## 2536   Shreveport    LA      71103       FALSE       Bachelor
##      highest_degree ownership avg_sat undergrad_pop grad_pop cost tuition
## 891        Graduate    Public      NA            NA     1161   NA      NA
## 1331       Graduate Nonprofit      NA            NA     1090   NA      NA
## 1403       Graduate    Public      NA           346     1507   NA      NA
## 1577       Graduate    Public      NA            NA      832   NA      NA
## 1789       Graduate Nonprofit      NA            NA     2736   NA      NA
## 2030       Graduate    Public      NA            NA     2142   NA      NA
## 2180       Graduate    Public      NA            NA      817   NA      NA
## 2204       Graduate Nonprofit      NA            NA     1212   NA      NA
## 2536       Graduate    Public      NA            44      812   NA      NA
##      tuition_nonresident revenue_per_student spend_per_student
## 891                   NA               12759             54982
## 1331                  NA               27476             25776
## 1403                  NA               18956             66244
## 1577                  NA               32181             26496
## 1789                  NA               32107             16280
## 2030                  NA                9766             17864
## 2180                  NA               37958             20006
## 2204                  NA               28887             46284
## 2536                  NA               12256             57341
##      avg_faculty_salary ft_faculty admission retention completion fed_loan
## 891                8823     0.7273        NA        NA         NA       NA
## 1331                 NA         NA        NA        NA         NA       NA
## 1403              11620     0.8395        NA        NA         NA   0.4943
## 1577               8678     1.0000        NA        NA         NA       NA
## 1789                 NA         NA        NA        NA         NA       NA
## 2030                 NA         NA        NA        NA         NA       NA
## 2180                 NA         NA        NA        NA         NA       NA
## 2204                 NA         NA        NA        NA         NA       NA
## 2536               5584     0.8929        NA        NA         NA   0.4400
##      pell_grant avg_family_inc med_family_inc avg_10yr_salary
## 891          NA      21000.767        15361.0          214300
## 1331         NA      12484.354         2875.0          212600
## 1403     0.2045      36315.887        20755.0          160900
## 1577         NA       7882.177         4024.5          218300
## 1789         NA       3720.608          393.5          193800
## 2030         NA      19518.312        19352.5          197800
## 2180         NA      15011.905         7003.0          235000
## 2204         NA       9717.391         3859.5          250000
## 2536     0.2000      45746.333        26061.5          167300
##      sd_10yr_salary med_10yr_salary med_debt med_debt_withdraw
## 891          126400          183400       NA                NA
## 1331         128700          191300       NA                NA
## 1403         113800          125900    13938              8261
## 1577         108200          199200     3010              2375
## 1789         119500          177600     3000              2000
## 2030         149500          163800       NA                NA
## 2180         129300          220700     2000                NA
## 2204         172700          250000       NA                NA
## 2536         128600          122500       NA                NA
##      default_3yr_rate repay_5yr_rate_withdraw repay_5yr_rate avg_entry_age
## 891             0.006                      NA             NA      26.57534
## 1331            0.015                      NA             NA      25.50000
## 1403            0.002               0.9025974             NA      29.21127
## 1577            0.000                      NA             NA      24.34375
## 1789            0.026                      NA             NA      27.95556
## 2030            0.031                      NA             NA      30.75000
## 2180            0.004                      NA             NA      26.70538
## 2204            0.000                      NA             NA      26.15625
## 2536            0.004               0.8750000             NA      26.77778
##      veteran first_gen   male female race_white race_black race_hispanic
## 891       NA        NA     NA     NA         NA         NA            NA
## 1331      NA        NA     NA     NA         NA         NA            NA
## 1403      NA 0.3723849 0.1965 0.8035     0.3988     0.2168        0.1156
## 1577      NA        NA     NA     NA         NA         NA            NA
## 1789      NA 0.1461988     NA     NA         NA         NA            NA
## 2030      NA        NA     NA     NA         NA         NA            NA
## 2180      NA        NA     NA     NA         NA         NA            NA
## 2204      NA        NA     NA     NA         NA         NA            NA
## 2536      NA        NA 0.2727 0.7273     0.4773     0.2500        0.0227
##      race_asian race_native race_pacific net_cost race_other   group
## 891          NA          NA           NA       NA         NA Low_SAT
## 1331         NA          NA           NA       NA         NA Low_SAT
## 1403     0.1676      0.0058       0.0029       NA     0.0926 Low_SAT
## 1577         NA          NA           NA       NA         NA Low_SAT
## 1789         NA          NA           NA       NA         NA Low_SAT
## 2030         NA          NA           NA       NA         NA Low_SAT
## 2180         NA          NA           NA       NA         NA Low_SAT
## 2204         NA          NA           NA       NA         NA Low_SAT
## 2536     0.0682      0.0000       0.0000       NA     0.1818 Low_SAT
##      race_aw grad_proportion
## 891       NA              NA
## 1331      NA              NA
## 1403  0.5664       0.8132758
## 1577      NA              NA
## 1789      NA              NA
## 2030      NA              NA
## 2180      NA              NA
## 2204      NA              NA
## 2536  0.5455       0.9485981

# It seems that there are certain outliers to the top left, roughly above 150,000 avg 10 yr salary
# that are removed from the linear direction of the rest of the data. After looking into them,
# it seems that the majority of them have a Graduate degree as their primary degree, and all of
# them at least offer a Graduate degree as their highest degree available. Another interesting
# note is that after looking at their names, they are all some type of medical college.

# part B
# check the number of factor levels
levels(cs$highest_degree)

## [1] "Other"       "Certificate" "Associate"   "Bachelor"    "Graduate"

# optional R/Stats practice, test the new categorical variable's factor levels to see
# if there's an improvement of fit
plot(cs$avg_family_inc, cs$avg_10yr_salary, main = "Avg. Family Income vs. Avg. 10 yr. Salary",
     xlab = "Avg. Family Income", ylab = "Avg. 10 yr. Salary")
fit1 = lm(avg_10yr_salary~avg_family_inc + highest_degree, data=cs)
fit1$coef

##               (Intercept)            avg_family_inc 
##              3.874917e+04              2.177299e-01 
## highest_degreeCertificate   highest_degreeAssociate 
##             -1.486046e+04             -9.377607e+03 
##    highest_degreeBachelor    highest_degreeGraduate 
##             -8.754538e+03             -1.313234e+03

abline(a = fit1$coef[1], b = fit1$coef[2], col="orange")
abline(a = fit1$coef[1]+fit1$coef[3], b = fit1$coef[2], lty=2, col="pink")
abline(a = fit1$coef[1]+fit1$coef[4], b = fit1$coef[2], lty=3, col="green")
abline(a = fit1$coef[1]+fit1$coef[5], b = fit1$coef[2], lty=4, col="blue")
abline(a = fit1$coef[1]+fit1$coef[6], b = fit1$coef[2], lty=5, col="red")
abline(lm(avg_10yr_salary~avg_family_inc, data=cs))
legend("topright", c("other", "Certificate", "Associate", "Bachelor", "Graduate",
                     "Original regression line"),
       col = c("orange", "pink", "green", "blue", "red", "black"), lty = c(1,2,3,4,5,1))

# Orange and red seem to be a group which are better at estimating the line due to them taking into
# account the data which are higher up. The lines also seem somewhat far apart from each other,
# therefore, adding categorical variables are useful. Some of the categorical variables however
# are quite similar and close to each other, therefore they aren't as useful in predicting the data.
# Since orange and red are these two, it makes sense that 'Other' and 'Graduate' are useful levels
# for predicting the average 10 year salary.

College Data Analytics

Jared Yu

April 16, 2018