Introduction

Packages Required

# install.packages("tidyverse")
library(tidyverse)

# install.packages("skimr")
library(skimr)

The tidyverse suite of packages is used for general data cleaning (tidyr), manipulation (dplyr), and visualization (ggplot2).

The skimr package allows us to obtain an initial high level view of the varaibles in a data set.

Data Preparation

# Import data
scorecard <- read.csv("scorecard.csv",
                      na.strings = c("NULL","NA"),
                      stringsAsFactors = FALSE)
skim(scorecard)
Data summary
Name scorecard
Number of rows 7115
Number of columns 29
_______________________
Column type frequency:
character 4
numeric 25
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
INSTNM 0 1 3 93 0 6981 0
CITY 0 1 3 24 0 2475 0
STABBR 0 1 2 2 0 59 0
CONTROL 0 1 1 18 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ID 0 1.00 1866527.32 6957330.02 100654.00 174096.50 229027.00 450609.50 49005401.00 ▇▁▁▁▁
ZIP 0 1.00 48903.82 29606.47 602.00 23826.00 46208.00 75441.50 99801.00 ▇▇▆▇▇
LOCALE 444 0.94 19.79 9.54 -3.00 12.00 21.00 22.00 43.00 ▁▇▅▂▂
LATITUDE 445 0.94 37.36 5.84 -14.32 33.96 38.79 41.33 71.32 ▁▁▆▇▁
LONGITUDE 445 0.94 -90.32 17.84 -170.74 -97.34 -86.34 -78.89 171.38 ▂▇▁▁▁
HBCU 528 0.93 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
MENONLY 444 0.94 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
WOMENONLY 444 0.94 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
ADM_RATE 5078 0.29 0.68 0.21 0.00 0.55 0.71 0.84 1.00 ▁▂▅▇▇
ACTCM25 5823 0.18 20.55 3.88 1.00 18.00 20.00 22.00 34.00 ▁▁▇▅▁
ACTCM75 5823 0.18 25.82 3.65 9.00 23.00 26.00 28.00 35.00 ▁▁▆▇▂
ACTCMMID 5823 0.18 23.44 3.71 6.00 21.00 23.00 25.00 35.00 ▁▁▇▅▁
SAT_AVG 5795 0.19 1131.77 130.10 564.00 1044.75 1117.00 1195.00 1558.00 ▁▁▇▃▁
UGDS 748 0.89 2426.06 5481.33 0.00 106.00 401.00 2018.00 77269.00 ▇▁▁▁▁
COSTT4_A 3531 0.50 26337.07 15015.59 0.00 14000.25 22646.50 33941.75 93704.00 ▇▇▃▁▁
AVGFACSAL 2868 0.60 6617.13 2458.97 0.00 4965.00 6364.00 7945.50 22924.00 ▂▇▂▁▁
PCTPELL 770 0.89 0.48 0.22 0.00 0.31 0.46 0.65 1.00 ▃▇▇▆▂
PCTFLOAN 770 0.89 0.48 0.28 0.00 0.26 0.54 0.70 1.00 ▆▃▆▇▃
AGE_ENTRY 500 0.93 26.01 3.99 17.43 23.18 25.78 28.51 58.90 ▇▇▁▁▁
FEMALE 1429 0.80 0.64 0.19 0.02 0.55 0.63 0.77 0.98 ▁▁▆▇▅
MARRIED 1392 0.80 0.16 0.10 0.00 0.10 0.15 0.22 0.82 ▇▆▁▁▁
DEPENDENT 921 0.87 0.49 0.25 0.03 0.29 0.46 0.68 0.99 ▃▇▆▅▅
VETERAN 4538 0.36 0.02 0.02 0.00 0.01 0.01 0.02 0.35 ▇▁▁▁▁
FIRST_GEN 1247 0.82 0.46 0.13 0.09 0.38 0.48 0.54 0.96 ▁▅▇▁▁
FAMINC 500 0.93 38482.72 23169.95 321.39 22668.03 31447.49 48098.65 174263.25 ▇▅▁▁▁

Missing Data

For the analyses here, I excluded missing values (“na.rm = TRUE”) in various codes.

Data Errors

The first error is with the miscoded control types. We can address that using multiple ifelse statements.

scorecard <- scorecard %>%
  mutate(CONTROL = ifelse(CONTROL == "Public", 1,
                          ifelse(CONTROL == "Private nonprofit", 2,
                                 ifelse(CONTROL %in% c("Private for-profit","For profit"), 3, CONTROL)))) %>%
  mutate(CONTROL = factor(CONTROL, 
                          levels = c(1,2,3),
                          labels = c("Public",
                                     "Private Non-Profit",
                                     "Private For-Profit")))

Data Conformity

See above

Dummy Variables

scorecard <- scorecard %>%
  mutate(high_income = ifelse(FAMINC > 54021, 1, 0),
         name = tolower(INSTNM))
scorecard_universities <- scorecard %>% 
  filter(str_detect(name,"university")) %>%
  pull(name)
scorecard_colleges <- scorecard %>% 
  filter(str_detect(name,"college")) %>%
  pull(name)
scorecard <- scorecard %>%
  mutate(is_college = ifelse(name %in% scorecard_colleges, 1, 0),
         is_university = ifelse(name %in% scorecard_universities, 1, 0))

For the above, I realized there were some institutions that were neither colleges nor universities (e.g., military intitutes), hence the need for two indicator variables.

# variable for states that border OH
border_state <- c("KY","IN","PA","WV","MI")
scorecard <- scorecard %>%
  mutate(borders_oh = ifelse(STABBR %in% border_state, 1, 0))

Other Variables

scorecard <- scorecard %>%
  mutate(majority_female = ifelse(FEMALE > 0.5, 1, 0),
         high_sat = ifelse(SAT_AVG > 1195, 1, 0),
         selective = ifelse(ADM_RATE > 0.84, 1, 0))

Directed Analyses

Cost: private vs. public

ggplot(data = scorecard, 
       aes(x = CONTROL, y = COSTT4_A)) + 
  geom_boxplot()
## Warning: Removed 3531 rows containing non-finite values (stat_boxplot).

scorecard %>%
  group_by(CONTROL) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A,na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            min_cost = min(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE),
            max_cost = max(COSTT4_A, na.rm = TRUE))
## # A tibble: 3 x 7
##   CONTROL                n mean_cost sd_cost min_cost med_cost max_cost
##   <fct>              <int>     <dbl>   <dbl>    <int>    <dbl>    <int>
## 1 Public              2076    15695.   5558.        0    14091    36172
## 2 Private Non-Profit  2041    38760.  15340.     6950    38406    72717
## 3 Private For-Profit  2998    26329.   7905.     7908    26084    93704

#Private Schools do cost more than private schools

Cost of Xavier

scorecard %>%
  filter(str_detect(name,"xavier")) %>%
  select(name, FAMINC)
##                                                                              name
## 1                                                         saint xavier university
## 2                                                  xavier university of louisiana
## 3                                                               xavier university
## 4                                                xavier college school of nursing
## 5 the chicago school of professional psychology at xavier university of louisiana
##      FAMINC
## 1  49967.74
## 2  49213.89
## 3 114329.60
## 4  34897.15
## 5  31396.26
# 114329.60
ggplot(scorecard, aes(x = FAMINC)) + 
  geom_histogram(bins = 45) + 
  ggtitle("Distribution of Family Income Nationally")
## Warning: Removed 500 rows containing non-finite values (stat_bin).

scorecard %>%
  filter(STABBR == "OH") %>%
  ggplot(., aes(x = FAMINC)) + 
  geom_histogram(bins = 20) + 
  ggtitle("Distribution of Family Income in OH")
## Warning: Removed 19 rows containing non-finite values (stat_bin).

#Average Family Incomes of Xavier Families are higher than the national average and in Ohio.

Cost of OH schools vs. bordering states

scorecard %>%
  filter(STABBR == "OH") %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))
##    n mean_cost  sd_cost med_cost
## 1 82  28735.86 13684.94    26641
scorecard %>%
  filter(STABBR %in% border_state) %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))
##     n mean_cost  sd_cost med_cost
## 1 245  32779.09 13672.09    27797
scorecard %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))
##      n mean_cost  sd_cost med_cost
## 1 1771  30350.82 14051.51    26226

Cost of Ohio Schools are about the average nationally.

```

Self Directed Analysis

ggplot(scorecard,
       aes(x = log(UGDS + 1), y = COSTT4_A)) +
  geom_point() +
  ggtitle("Cost vs. no. undergrads")
## Warning: Removed 3531 rows containing missing values (geom_point).

ggplot(scorecard, 
       aes(x = COSTT4_A, y = AVGFACSAL)) + 
  geom_point()
## Warning: Removed 3695 rows containing missing values (geom_point).

ggplot(scorecard, 
       aes(y = FAMINC, x = COSTT4_A)) + 
  geom_point()
## Warning: Removed 3554 rows containing missing values (geom_point).

Self directed questions

Proportion married vs control type

scorecard %>%
  group_by(CONTROL) %>%
  summarize(mean_married = mean(MARRIED, na.rm = TRUE),
            sd_married = sd(MARRIED, na.rm = TRUE),
            min_married = min(MARRIED, na.rm = TRUE),
            med_married = median(MARRIED, na.rm = TRUE),
            max_married = max(MARRIED, na.rm = TRUE))
## # A tibble: 3 x 6
##   CONTROL            mean_married sd_married min_married med_married max_married
##   <fct>                     <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
## 1 Public                    0.128     0.0791        0           0.12        0.48
## 2 Private Non-Profit        0.153     0.124         0.01        0.13        0.82
## 3 Private For-Profit        0.197     0.0893        0.03        0.18        0.69
scorecard %>%
  filter(STABBR == "OH") %>%
  group_by(CONTROL) %>%
  summarize(mean_married = mean(MARRIED, na.rm = TRUE),
            sd_married = sd(MARRIED, na.rm = TRUE),
            min_married = min(MARRIED, na.rm = TRUE),
            med_married = median(MARRIED, na.rm = TRUE),
            max_married = max(MARRIED, na.rm = TRUE))
## # A tibble: 3 x 6
##   CONTROL            mean_married sd_married min_married med_married max_married
##   <fct>                     <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
## 1 Public                    0.158     0.0929        0.03       0.14         0.36
## 2 Private Non-Profit        0.127     0.103         0.01       0.095        0.51
## 3 Private For-Profit        0.168     0.0671        0.05       0.155        0.39

#Married families have direct relationships to wanting a public or private school.

Proportion veteran vs control type

scorecard %>%
  group_by(CONTROL) %>%
  summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
            sd_vet = sd(VETERAN, na.rm = TRUE),
            min_vet = min(VETERAN, na.rm = TRUE),
            med_vet = median(VETERAN, na.rm = TRUE),
            max_vet = max(VETERAN, na.rm = TRUE))
## # A tibble: 3 x 6
##   CONTROL            mean_vet sd_vet min_vet med_vet max_vet
##   <fct>                 <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 Public              0.00904 0.0107       0    0.01    0.16
## 2 Private Non-Profit  0.0180  0.0243       0    0.01    0.27
## 3 Private For-Profit  0.0209  0.0296       0    0.01    0.35
scorecard %>%
  filter(STABBR == "OH") %>%
  group_by(CONTROL) %>%
  summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
            sd_vet = sd(VETERAN, na.rm = TRUE),
            min_vet = min(VETERAN, na.rm = TRUE),
            med_vet = median(VETERAN, na.rm = TRUE),
            max_vet = max(VETERAN, na.rm = TRUE))
## # A tibble: 3 x 6
##   CONTROL            mean_vet sd_vet min_vet med_vet max_vet
##   <fct>                 <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 Public               0.0158 0.0197    0       0.01    0.08
## 2 Private Non-Profit   0.0227 0.0224    0.01    0.01    0.08
## 3 Private For-Profit   0.0260 0.0196    0.01    0.02    0.1

##Veteran and Control type have a relationship when considering veteran rate to control.