Introduction

Packages Required

# install.packages("tidyverse")
library(tidyverse)

# install.packages("skimr")
library(skimr)

The tidyverse suite of packages is used for general data cleaning (tidyr), manipulation (dplyr), and visualization (ggplot2).

The skimr package allows us to obtain an initial high level view of the varaibles in a data set.

Data Preparation

# Import data
scorecard <- read.csv("scorecard.csv",
                      na.strings = c("NULL","NA"),
                      stringsAsFactors = FALSE)

skim(scorecard)

Data summary
Name	scorecard
Number of rows	7115
Number of columns	29
_______________________
Column type frequency:
character	4
numeric	25
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
INSTNM	1	3	93	6981
CITY	1	3	24	2475
STABBR	1	2	2	59
CONTROL	1	1	18	7

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ID	0	1.00	1866527.32	6957330.02	100654.00	174096.50	229027.00	450609.50	49005401.00	▇▁▁▁▁
ZIP	0	1.00	48903.82	29606.47	602.00	23826.00	46208.00	75441.50	99801.00	▇▇▆▇▇
LOCALE	444	0.94	19.79	9.54	-3.00	12.00	21.00	22.00	43.00	▁▇▅▂▂
LATITUDE	445	0.94	37.36	5.84	-14.32	33.96	38.79	41.33	71.32	▁▁▆▇▁
LONGITUDE	445	0.94	-90.32	17.84	-170.74	-97.34	-86.34	-78.89	171.38	▂▇▁▁▁
HBCU	528	0.93	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
MENONLY	444	0.94	0.01	0.10	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
WOMENONLY	444	0.94	0.01	0.08	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
ADM_RATE	5078	0.29	0.68	0.21	0.00	0.55	0.71	0.84	1.00	▁▂▅▇▇
ACTCM25	5823	0.18	20.55	3.88	1.00	18.00	20.00	22.00	34.00	▁▁▇▅▁
ACTCM75	5823	0.18	25.82	3.65	9.00	23.00	26.00	28.00	35.00	▁▁▆▇▂
ACTCMMID	5823	0.18	23.44	3.71	6.00	21.00	23.00	25.00	35.00	▁▁▇▅▁
SAT_AVG	5795	0.19	1131.77	130.10	564.00	1044.75	1117.00	1195.00	1558.00	▁▁▇▃▁
UGDS	748	0.89	2426.06	5481.33	0.00	106.00	401.00	2018.00	77269.00	▇▁▁▁▁
COSTT4_A	3531	0.50	26337.07	15015.59	0.00	14000.25	22646.50	33941.75	93704.00	▇▇▃▁▁
AVGFACSAL	2868	0.60	6617.13	2458.97	0.00	4965.00	6364.00	7945.50	22924.00	▂▇▂▁▁
PCTPELL	770	0.89	0.48	0.22	0.00	0.31	0.46	0.65	1.00	▃▇▇▆▂
PCTFLOAN	770	0.89	0.48	0.28	0.00	0.26	0.54	0.70	1.00	▆▃▆▇▃
AGE_ENTRY	500	0.93	26.01	3.99	17.43	23.18	25.78	28.51	58.90	▇▇▁▁▁
FEMALE	1429	0.80	0.64	0.19	0.02	0.55	0.63	0.77	0.98	▁▁▆▇▅
MARRIED	1392	0.80	0.16	0.10	0.00	0.10	0.15	0.22	0.82	▇▆▁▁▁
DEPENDENT	921	0.87	0.49	0.25	0.03	0.29	0.46	0.68	0.99	▃▇▆▅▅
VETERAN	4538	0.36	0.02	0.02	0.00	0.01	0.01	0.02	0.35	▇▁▁▁▁
FIRST_GEN	1247	0.82	0.46	0.13	0.09	0.38	0.48	0.54	0.96	▁▅▇▁▁
FAMINC	500	0.93	38482.72	23169.95	321.39	22668.03	31447.49	48098.65	174263.25	▇▅▁▁▁

Missing Data

For the analyses here, I excluded missing values (“na.rm = TRUE”) in various codes.

Data Errors

The first error is with the miscoded control types. We can address that using multiple ifelse statements.

scorecard <- scorecard %>%
  mutate(CONTROL = ifelse(CONTROL == "Public", 1,
                          ifelse(CONTROL == "Private nonprofit", 2,
                                 ifelse(CONTROL %in% c("Private for-profit","For profit"), 3, CONTROL)))) %>%
  mutate(CONTROL = factor(CONTROL, 
                          levels = c(1,2,3),
                          labels = c("Public",
                                     "Private Non-Profit",
                                     "Private For-Profit")))

Data Conformity

See above

Dummy Variables

scorecard <- scorecard %>%
  mutate(high_income = ifelse(FAMINC > 54021, 1, 0),
         name = tolower(INSTNM))

scorecard_universities <- scorecard %>% 
  filter(str_detect(name,"university")) %>%
  pull(name)
scorecard_colleges <- scorecard %>% 
  filter(str_detect(name,"college")) %>%
  pull(name)

scorecard <- scorecard %>%
  mutate(is_college = ifelse(name %in% scorecard_colleges, 1, 0),
         is_university = ifelse(name %in% scorecard_universities, 1, 0))

For the above, I realized there were some institutions that were neither colleges nor universities (e.g., military intitutes), hence the need for two indicator variables.

# variable for states that border OH
border_state <- c("KY","IN","PA","WV","MI")
scorecard <- scorecard %>%
  mutate(borders_oh = ifelse(STABBR %in% border_state, 1, 0))

Other Variables

scorecard <- scorecard %>%
  mutate(majority_female = ifelse(FEMALE > 0.5, 1, 0),
         high_sat = ifelse(SAT_AVG > 1195, 1, 0),
         selective = ifelse(ADM_RATE > 0.84, 1, 0))

Simple Analysis and Trends

Number of schools in Ohio/borders Ohio

scorecard <- scorecard %>%
  mutate(in_oh = ifelse(STABBR == "OH", 1,0))
oh_border_df <- scorecard %>% 
  summarize(n_oh = sum(in_oh, na.rm = TRUE),
            n_border_oh = sum(borders_oh, na.rm = TRUE)) %>% t() %>%
  as.data.frame()

category <- c("In OH", "Borders OH")
num <- c(318, 903)

borders_df <- as.data.frame(cbind(category,num))
ggplot(borders_df,
       aes(x = category, y = num, fill = category)) + 
  geom_col() + 
  ggtitle("Number of universities in OH vs. Bordering OH")

Figure for task 4.1

Cost vs. Family Income

ggplot(scorecard, aes(x = FAMINC, y = COSTT4_A)) + 
  geom_point() + 
  xlab("Average family income") + 
  ylab("Cost of attendence")

## Warning: Removed 3554 rows containing missing values (geom_point).

Number of undergrads at each control type

scorecard_control1 <- scorecard %>%
  filter(CONTROL == "Public")
scorecard_control2 <- scorecard %>%
  filter(CONTROL == "Private Non-Profit")
scorecard_control3 <- scorecard %>%
  filter(CONTROL == "Private For-Profit")

ggplot(data = scorecard_control1, aes(x = UGDS)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 110 rows containing non-finite values (stat_bin).

ggplot(data = scorecard_control2, aes(x = UGDS)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 386 rows containing non-finite values (stat_bin).

ggplot(data = scorecard_control3, aes(x = UGDS)) + 
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 252 rows containing non-finite values (stat_bin).

SAT/ACT vs. Family Income in OH Border states

scorecard %>%
  filter(STABBR %in% border_state) %>%
  ggplot(., aes(y = SAT_AVG, x = FAMINC)) + 
  geom_point()

## Warning: Removed 685 rows containing missing values (geom_point).

Directed Analyses

Cost: private vs. public

ggplot(data = scorecard, 
       aes(x = CONTROL, y = COSTT4_A)) + 
  geom_boxplot()

## Warning: Removed 3531 rows containing non-finite values (stat_boxplot).

scorecard %>%
  group_by(CONTROL) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A,na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            min_cost = min(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE),
            max_cost = max(COSTT4_A, na.rm = TRUE))

## # A tibble: 3 x 7
##   CONTROL                n mean_cost sd_cost min_cost med_cost max_cost
##   <fct>              <int>     <dbl>   <dbl>    <int>    <dbl>    <int>
## 1 Public              2076    15695.   5558.        0    14091    36172
## 2 Private Non-Profit  2041    38760.  15340.     6950    38406    72717
## 3 Private For-Profit  2998    26329.   7905.     7908    26084    93704

#Private Schools do cost more than private schools

Cost of Xavier

scorecard %>%
  filter(str_detect(name,"xavier")) %>%
  select(name, FAMINC)

##                                                                              name
## 1                                                         saint xavier university
## 2                                                  xavier university of louisiana
## 3                                                               xavier university
## 4                                                xavier college school of nursing
## 5 the chicago school of professional psychology at xavier university of louisiana
##      FAMINC
## 1  49967.74
## 2  49213.89
## 3 114329.60
## 4  34897.15
## 5  31396.26

# 114329.60

ggplot(scorecard, aes(x = FAMINC)) + 
  geom_histogram(bins = 45) + 
  ggtitle("Distribution of Family Income Nationally")

## Warning: Removed 500 rows containing non-finite values (stat_bin).

scorecard %>%
  filter(STABBR == "OH") %>%
  ggplot(., aes(x = FAMINC)) + 
  geom_histogram(bins = 20) + 
  ggtitle("Distribution of Family Income in OH")

## Warning: Removed 19 rows containing non-finite values (stat_bin).

#Average Family Incomes of Xavier Families are higher than the national average and in Ohio.

Cost of OH schools vs. bordering states

scorecard %>%
  filter(STABBR == "OH") %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))

##    n mean_cost  sd_cost med_cost
## 1 82  28735.86 13684.94    26641

scorecard %>%
  filter(STABBR %in% border_state) %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))

##     n mean_cost  sd_cost med_cost
## 1 245  32779.09 13672.09    27797

scorecard %>%
  filter(is_university == 1) %>%
  summarize(n = n(),
            mean_cost = mean(COSTT4_A, na.rm = TRUE),
            sd_cost = sd(COSTT4_A, na.rm = TRUE),
            med_cost = median(COSTT4_A, na.rm = TRUE))

##      n mean_cost  sd_cost med_cost
## 1 1771  30350.82 14051.51    26226

Cost of Ohio Schools are about the average nationally.

```

Self Directed Analysis

ggplot(scorecard,
       aes(x = log(UGDS + 1), y = COSTT4_A)) +
  geom_point() +
  ggtitle("Cost vs. no. undergrads")

## Warning: Removed 3531 rows containing missing values (geom_point).

ggplot(scorecard, 
       aes(x = COSTT4_A, y = AVGFACSAL)) + 
  geom_point()

## Warning: Removed 3695 rows containing missing values (geom_point).

ggplot(scorecard, 
       aes(y = FAMINC, x = COSTT4_A)) + 
  geom_point()

## Warning: Removed 3554 rows containing missing values (geom_point).

Self directed questions

Proportion married vs control type

scorecard %>%
  group_by(CONTROL) %>%
  summarize(mean_married = mean(MARRIED, na.rm = TRUE),
            sd_married = sd(MARRIED, na.rm = TRUE),
            min_married = min(MARRIED, na.rm = TRUE),
            med_married = median(MARRIED, na.rm = TRUE),
            max_married = max(MARRIED, na.rm = TRUE))

## # A tibble: 3 x 6
##   CONTROL            mean_married sd_married min_married med_married max_married
##   <fct>                     <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
## 1 Public                    0.128     0.0791        0           0.12        0.48
## 2 Private Non-Profit        0.153     0.124         0.01        0.13        0.82
## 3 Private For-Profit        0.197     0.0893        0.03        0.18        0.69

scorecard %>%
  filter(STABBR == "OH") %>%
  group_by(CONTROL) %>%
  summarize(mean_married = mean(MARRIED, na.rm = TRUE),
            sd_married = sd(MARRIED, na.rm = TRUE),
            min_married = min(MARRIED, na.rm = TRUE),
            med_married = median(MARRIED, na.rm = TRUE),
            max_married = max(MARRIED, na.rm = TRUE))

## # A tibble: 3 x 6
##   CONTROL            mean_married sd_married min_married med_married max_married
##   <fct>                     <dbl>      <dbl>       <dbl>       <dbl>       <dbl>
## 1 Public                    0.158     0.0929        0.03       0.14         0.36
## 2 Private Non-Profit        0.127     0.103         0.01       0.095        0.51
## 3 Private For-Profit        0.168     0.0671        0.05       0.155        0.39

#Married families have direct relationships to wanting a public or private school.

Proportion veteran vs control type

scorecard %>%
  group_by(CONTROL) %>%
  summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
            sd_vet = sd(VETERAN, na.rm = TRUE),
            min_vet = min(VETERAN, na.rm = TRUE),
            med_vet = median(VETERAN, na.rm = TRUE),
            max_vet = max(VETERAN, na.rm = TRUE))

## # A tibble: 3 x 6
##   CONTROL            mean_vet sd_vet min_vet med_vet max_vet
##   <fct>                 <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 Public              0.00904 0.0107       0    0.01    0.16
## 2 Private Non-Profit  0.0180  0.0243       0    0.01    0.27
## 3 Private For-Profit  0.0209  0.0296       0    0.01    0.35

scorecard %>%
  filter(STABBR == "OH") %>%
  group_by(CONTROL) %>%
  summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
            sd_vet = sd(VETERAN, na.rm = TRUE),
            min_vet = min(VETERAN, na.rm = TRUE),
            med_vet = median(VETERAN, na.rm = TRUE),
            max_vet = max(VETERAN, na.rm = TRUE))

## # A tibble: 3 x 6
##   CONTROL            mean_vet sd_vet min_vet med_vet max_vet
##   <fct>                 <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 Public               0.0158 0.0197    0       0.01    0.08
## 2 Private Non-Profit   0.0227 0.0224    0.01    0.01    0.08
## 3 Private For-Profit   0.0260 0.0196    0.01    0.02    0.1

##Veteran and Control type have a relationship when considering veteran rate to control.

BAIS 496 R-Exam

Conor Peterson

3/26/2020