# install.packages("tidyverse")
library(tidyverse)
# install.packages("skimr")
library(skimr)
The tidyverse suite of packages is used for general data cleaning (tidyr), manipulation (dplyr), and visualization (ggplot2).
The skimr package allows us to obtain an initial high level view of the varaibles in a data set.
# Import data
scorecard <- read.csv("scorecard.csv",
na.strings = c("NULL","NA"),
stringsAsFactors = FALSE)
skim(scorecard)
| Name | scorecard |
| Number of rows | 7115 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 25 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| INSTNM | 0 | 1 | 3 | 93 | 0 | 6981 | 0 |
| CITY | 0 | 1 | 3 | 24 | 0 | 2475 | 0 |
| STABBR | 0 | 1 | 2 | 2 | 0 | 59 | 0 |
| CONTROL | 0 | 1 | 1 | 18 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 1866527.32 | 6957330.02 | 100654.00 | 174096.50 | 229027.00 | 450609.50 | 49005401.00 | ▇▁▁▁▁ |
| ZIP | 0 | 1.00 | 48903.82 | 29606.47 | 602.00 | 23826.00 | 46208.00 | 75441.50 | 99801.00 | ▇▇▆▇▇ |
| LOCALE | 444 | 0.94 | 19.79 | 9.54 | -3.00 | 12.00 | 21.00 | 22.00 | 43.00 | ▁▇▅▂▂ |
| LATITUDE | 445 | 0.94 | 37.36 | 5.84 | -14.32 | 33.96 | 38.79 | 41.33 | 71.32 | ▁▁▆▇▁ |
| LONGITUDE | 445 | 0.94 | -90.32 | 17.84 | -170.74 | -97.34 | -86.34 | -78.89 | 171.38 | ▂▇▁▁▁ |
| HBCU | 528 | 0.93 | 0.02 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| MENONLY | 444 | 0.94 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| WOMENONLY | 444 | 0.94 | 0.01 | 0.08 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| ADM_RATE | 5078 | 0.29 | 0.68 | 0.21 | 0.00 | 0.55 | 0.71 | 0.84 | 1.00 | ▁▂▅▇▇ |
| ACTCM25 | 5823 | 0.18 | 20.55 | 3.88 | 1.00 | 18.00 | 20.00 | 22.00 | 34.00 | ▁▁▇▅▁ |
| ACTCM75 | 5823 | 0.18 | 25.82 | 3.65 | 9.00 | 23.00 | 26.00 | 28.00 | 35.00 | ▁▁▆▇▂ |
| ACTCMMID | 5823 | 0.18 | 23.44 | 3.71 | 6.00 | 21.00 | 23.00 | 25.00 | 35.00 | ▁▁▇▅▁ |
| SAT_AVG | 5795 | 0.19 | 1131.77 | 130.10 | 564.00 | 1044.75 | 1117.00 | 1195.00 | 1558.00 | ▁▁▇▃▁ |
| UGDS | 748 | 0.89 | 2426.06 | 5481.33 | 0.00 | 106.00 | 401.00 | 2018.00 | 77269.00 | ▇▁▁▁▁ |
| COSTT4_A | 3531 | 0.50 | 26337.07 | 15015.59 | 0.00 | 14000.25 | 22646.50 | 33941.75 | 93704.00 | ▇▇▃▁▁ |
| AVGFACSAL | 2868 | 0.60 | 6617.13 | 2458.97 | 0.00 | 4965.00 | 6364.00 | 7945.50 | 22924.00 | ▂▇▂▁▁ |
| PCTPELL | 770 | 0.89 | 0.48 | 0.22 | 0.00 | 0.31 | 0.46 | 0.65 | 1.00 | ▃▇▇▆▂ |
| PCTFLOAN | 770 | 0.89 | 0.48 | 0.28 | 0.00 | 0.26 | 0.54 | 0.70 | 1.00 | ▆▃▆▇▃ |
| AGE_ENTRY | 500 | 0.93 | 26.01 | 3.99 | 17.43 | 23.18 | 25.78 | 28.51 | 58.90 | ▇▇▁▁▁ |
| FEMALE | 1429 | 0.80 | 0.64 | 0.19 | 0.02 | 0.55 | 0.63 | 0.77 | 0.98 | ▁▁▆▇▅ |
| MARRIED | 1392 | 0.80 | 0.16 | 0.10 | 0.00 | 0.10 | 0.15 | 0.22 | 0.82 | ▇▆▁▁▁ |
| DEPENDENT | 921 | 0.87 | 0.49 | 0.25 | 0.03 | 0.29 | 0.46 | 0.68 | 0.99 | ▃▇▆▅▅ |
| VETERAN | 4538 | 0.36 | 0.02 | 0.02 | 0.00 | 0.01 | 0.01 | 0.02 | 0.35 | ▇▁▁▁▁ |
| FIRST_GEN | 1247 | 0.82 | 0.46 | 0.13 | 0.09 | 0.38 | 0.48 | 0.54 | 0.96 | ▁▅▇▁▁ |
| FAMINC | 500 | 0.93 | 38482.72 | 23169.95 | 321.39 | 22668.03 | 31447.49 | 48098.65 | 174263.25 | ▇▅▁▁▁ |
For the analyses here, I excluded missing values (“na.rm = TRUE”) in various codes.
The first error is with the miscoded control types. We can address that using multiple ifelse statements.
scorecard <- scorecard %>%
mutate(CONTROL = ifelse(CONTROL == "Public", 1,
ifelse(CONTROL == "Private nonprofit", 2,
ifelse(CONTROL %in% c("Private for-profit","For profit"), 3, CONTROL)))) %>%
mutate(CONTROL = factor(CONTROL,
levels = c(1,2,3),
labels = c("Public",
"Private Non-Profit",
"Private For-Profit")))
See above
scorecard <- scorecard %>%
mutate(high_income = ifelse(FAMINC > 54021, 1, 0),
name = tolower(INSTNM))
scorecard_universities <- scorecard %>%
filter(str_detect(name,"university")) %>%
pull(name)
scorecard_colleges <- scorecard %>%
filter(str_detect(name,"college")) %>%
pull(name)
scorecard <- scorecard %>%
mutate(is_college = ifelse(name %in% scorecard_colleges, 1, 0),
is_university = ifelse(name %in% scorecard_universities, 1, 0))
For the above, I realized there were some institutions that were neither colleges nor universities (e.g., military intitutes), hence the need for two indicator variables.
# variable for states that border OH
border_state <- c("KY","IN","PA","WV","MI")
scorecard <- scorecard %>%
mutate(borders_oh = ifelse(STABBR %in% border_state, 1, 0))
scorecard <- scorecard %>%
mutate(majority_female = ifelse(FEMALE > 0.5, 1, 0),
high_sat = ifelse(SAT_AVG > 1195, 1, 0),
selective = ifelse(ADM_RATE > 0.84, 1, 0))
scorecard <- scorecard %>%
mutate(in_oh = ifelse(STABBR == "OH", 1,0))
oh_border_df <- scorecard %>%
summarize(n_oh = sum(in_oh, na.rm = TRUE),
n_border_oh = sum(borders_oh, na.rm = TRUE)) %>% t() %>%
as.data.frame()
category <- c("In OH", "Borders OH")
num <- c(318, 903)
borders_df <- as.data.frame(cbind(category,num))
ggplot(borders_df,
aes(x = category, y = num, fill = category)) +
geom_col() +
ggtitle("Number of universities in OH vs. Bordering OH")
Figure for task 4.1
ggplot(scorecard, aes(x = FAMINC, y = COSTT4_A)) +
geom_point() +
xlab("Average family income") +
ylab("Cost of attendence")
## Warning: Removed 3554 rows containing missing values (geom_point).
scorecard_control1 <- scorecard %>%
filter(CONTROL == "Public")
scorecard_control2 <- scorecard %>%
filter(CONTROL == "Private Non-Profit")
scorecard_control3 <- scorecard %>%
filter(CONTROL == "Private For-Profit")
ggplot(data = scorecard_control1, aes(x = UGDS)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 110 rows containing non-finite values (stat_bin).
ggplot(data = scorecard_control2, aes(x = UGDS)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 386 rows containing non-finite values (stat_bin).
ggplot(data = scorecard_control3, aes(x = UGDS)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 252 rows containing non-finite values (stat_bin).
scorecard %>%
filter(STABBR %in% border_state) %>%
ggplot(., aes(y = SAT_AVG, x = FAMINC)) +
geom_point()
## Warning: Removed 685 rows containing missing values (geom_point).
ggplot(data = scorecard,
aes(x = CONTROL, y = COSTT4_A)) +
geom_boxplot()
## Warning: Removed 3531 rows containing non-finite values (stat_boxplot).
scorecard %>%
group_by(CONTROL) %>%
summarize(n = n(),
mean_cost = mean(COSTT4_A,na.rm = TRUE),
sd_cost = sd(COSTT4_A, na.rm = TRUE),
min_cost = min(COSTT4_A, na.rm = TRUE),
med_cost = median(COSTT4_A, na.rm = TRUE),
max_cost = max(COSTT4_A, na.rm = TRUE))
## # A tibble: 3 x 7
## CONTROL n mean_cost sd_cost min_cost med_cost max_cost
## <fct> <int> <dbl> <dbl> <int> <dbl> <int>
## 1 Public 2076 15695. 5558. 0 14091 36172
## 2 Private Non-Profit 2041 38760. 15340. 6950 38406 72717
## 3 Private For-Profit 2998 26329. 7905. 7908 26084 93704
#Private Schools do cost more than private schools
scorecard %>%
filter(str_detect(name,"xavier")) %>%
select(name, FAMINC)
## name
## 1 saint xavier university
## 2 xavier university of louisiana
## 3 xavier university
## 4 xavier college school of nursing
## 5 the chicago school of professional psychology at xavier university of louisiana
## FAMINC
## 1 49967.74
## 2 49213.89
## 3 114329.60
## 4 34897.15
## 5 31396.26
# 114329.60
ggplot(scorecard, aes(x = FAMINC)) +
geom_histogram(bins = 45) +
ggtitle("Distribution of Family Income Nationally")
## Warning: Removed 500 rows containing non-finite values (stat_bin).
scorecard %>%
filter(STABBR == "OH") %>%
ggplot(., aes(x = FAMINC)) +
geom_histogram(bins = 20) +
ggtitle("Distribution of Family Income in OH")
## Warning: Removed 19 rows containing non-finite values (stat_bin).
#Average Family Incomes of Xavier Families are higher than the national average and in Ohio.
scorecard %>%
filter(STABBR == "OH") %>%
filter(is_university == 1) %>%
summarize(n = n(),
mean_cost = mean(COSTT4_A, na.rm = TRUE),
sd_cost = sd(COSTT4_A, na.rm = TRUE),
med_cost = median(COSTT4_A, na.rm = TRUE))
## n mean_cost sd_cost med_cost
## 1 82 28735.86 13684.94 26641
scorecard %>%
filter(STABBR %in% border_state) %>%
filter(is_university == 1) %>%
summarize(n = n(),
mean_cost = mean(COSTT4_A, na.rm = TRUE),
sd_cost = sd(COSTT4_A, na.rm = TRUE),
med_cost = median(COSTT4_A, na.rm = TRUE))
## n mean_cost sd_cost med_cost
## 1 245 32779.09 13672.09 27797
scorecard %>%
filter(is_university == 1) %>%
summarize(n = n(),
mean_cost = mean(COSTT4_A, na.rm = TRUE),
sd_cost = sd(COSTT4_A, na.rm = TRUE),
med_cost = median(COSTT4_A, na.rm = TRUE))
## n mean_cost sd_cost med_cost
## 1 1771 30350.82 14051.51 26226
Cost of Ohio Schools are about the average nationally.
```
ggplot(scorecard,
aes(x = log(UGDS + 1), y = COSTT4_A)) +
geom_point() +
ggtitle("Cost vs. no. undergrads")
## Warning: Removed 3531 rows containing missing values (geom_point).
ggplot(scorecard,
aes(x = COSTT4_A, y = AVGFACSAL)) +
geom_point()
## Warning: Removed 3695 rows containing missing values (geom_point).
ggplot(scorecard,
aes(y = FAMINC, x = COSTT4_A)) +
geom_point()
## Warning: Removed 3554 rows containing missing values (geom_point).
scorecard %>%
group_by(CONTROL) %>%
summarize(mean_married = mean(MARRIED, na.rm = TRUE),
sd_married = sd(MARRIED, na.rm = TRUE),
min_married = min(MARRIED, na.rm = TRUE),
med_married = median(MARRIED, na.rm = TRUE),
max_married = max(MARRIED, na.rm = TRUE))
## # A tibble: 3 x 6
## CONTROL mean_married sd_married min_married med_married max_married
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Public 0.128 0.0791 0 0.12 0.48
## 2 Private Non-Profit 0.153 0.124 0.01 0.13 0.82
## 3 Private For-Profit 0.197 0.0893 0.03 0.18 0.69
scorecard %>%
filter(STABBR == "OH") %>%
group_by(CONTROL) %>%
summarize(mean_married = mean(MARRIED, na.rm = TRUE),
sd_married = sd(MARRIED, na.rm = TRUE),
min_married = min(MARRIED, na.rm = TRUE),
med_married = median(MARRIED, na.rm = TRUE),
max_married = max(MARRIED, na.rm = TRUE))
## # A tibble: 3 x 6
## CONTROL mean_married sd_married min_married med_married max_married
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Public 0.158 0.0929 0.03 0.14 0.36
## 2 Private Non-Profit 0.127 0.103 0.01 0.095 0.51
## 3 Private For-Profit 0.168 0.0671 0.05 0.155 0.39
#Married families have direct relationships to wanting a public or private school.
scorecard %>%
group_by(CONTROL) %>%
summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
sd_vet = sd(VETERAN, na.rm = TRUE),
min_vet = min(VETERAN, na.rm = TRUE),
med_vet = median(VETERAN, na.rm = TRUE),
max_vet = max(VETERAN, na.rm = TRUE))
## # A tibble: 3 x 6
## CONTROL mean_vet sd_vet min_vet med_vet max_vet
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Public 0.00904 0.0107 0 0.01 0.16
## 2 Private Non-Profit 0.0180 0.0243 0 0.01 0.27
## 3 Private For-Profit 0.0209 0.0296 0 0.01 0.35
scorecard %>%
filter(STABBR == "OH") %>%
group_by(CONTROL) %>%
summarize(mean_vet = mean(VETERAN, na.rm = TRUE),
sd_vet = sd(VETERAN, na.rm = TRUE),
min_vet = min(VETERAN, na.rm = TRUE),
med_vet = median(VETERAN, na.rm = TRUE),
max_vet = max(VETERAN, na.rm = TRUE))
## # A tibble: 3 x 6
## CONTROL mean_vet sd_vet min_vet med_vet max_vet
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Public 0.0158 0.0197 0 0.01 0.08
## 2 Private Non-Profit 0.0227 0.0224 0.01 0.01 0.08
## 3 Private For-Profit 0.0260 0.0196 0.01 0.02 0.1
##Veteran and Control type have a relationship when considering veteran rate to control.