# load data
library(tidyverse)
# URL of FiveThirtyEight's graduate-student table (raw CSV hosted on GitHub)
fp <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"
# Download and parse the CSV; every later step starts from this object
grad_input <- read_csv(file = fp)
# Keep only the columns the exploratory analysis below works with
grad_data <- select(
  grad_input,
  Major,
  Major_category,
  Grad_share,
  Nongrad_total,
  Grad_unemployment_rate,
  Nongrad_unemployment_rate,
  Grad_median,
  Nongrad_median,
  Grad_P25,
  Grad_P75,
  Nongrad_P25,
  Nongrad_P75,
  Grad_premium
)
# One row per Major_category holding the unweighted mean, across the majors
# in that category, of:
#   - the graduate unemployment rate     unemployed / (unemployed + employed)
#   - the non-graduate unemployment rate (same formula on Nongrad_* columns)
#   - the relative earnings premium      (Grad_median - Nongrad_median) / Nongrad_median
grad_cat_stats <- grad_input |>
  group_by(Major_category) |>
  summarise(
    avg_cat_g_unemp_rate = mean(
      Grad_unemployed / (Grad_unemployed + Grad_employed)
    ),
    avg_cat_ng_unemp_rate = mean(
      Nongrad_unemployed / (Nongrad_unemployed + Nongrad_employed)
    ),
    avg_cat_grad_premium = mean(
      (Grad_median - Nongrad_median) / Nongrad_median
    )
  )
# Attach the category-level averages to every major via its Major_category
grad_comb_data <- left_join(
  grad_data,
  grad_cat_stats,
  by = "Major_category"
)
# head(grad_comb_data)
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
What premium in salary can be expected by completing a graduate degree? Is there a statistically significant difference between graduate and non-graduate salaries?
What are the cases, and how many are there?
The cases are majors offered as graduate degrees and there are 173 observations.
Describe the method of data collection.
The data used in the FiveThirtyEight article is based on the American Community Survey 2010-2012 public dataset.
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
The data was downloaded from GitHub, but the author from FiveThirtyEight pulled the data from the American Community Survey. Source: https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv
What is the response variable? Is it quantitative or qualitative?
The response variable is quantitative: it is the salary premium from completing a graduate degree, recorded in the data as Grad_premium.
The explanatory variables are as follows:
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Per-column summary of the selected analysis columns
grad_data |> summary()
## Major Major_category Grad_share Nongrad_total
## Length:173 Length:173 Min. :0.09632 Min. : 2232
## Class :character Class :character 1st Qu.:0.26757 1st Qu.: 20564
## Mode :character Mode :character Median :0.39875 Median : 68993
## Mean :0.40059 Mean : 214720
## 3rd Qu.:0.49912 3rd Qu.: 184971
## Max. :0.93117 Max. :2996892
## Grad_unemployment_rate Nongrad_unemployment_rate Grad_median
## Min. :0.00000 Min. :0.00000 Min. : 47000
## 1st Qu.:0.02607 1st Qu.:0.04198 1st Qu.: 65000
## Median :0.03665 Median :0.05103 Median : 75000
## Mean :0.03934 Mean :0.05395 Mean : 76756
## 3rd Qu.:0.04805 3rd Qu.:0.06439 3rd Qu.: 90000
## Max. :0.13851 Max. :0.16091 Max. :135000
## Nongrad_median Grad_P25 Grad_P75 Nongrad_P25
## Min. : 37000 Min. :24500 Min. : 65000 Min. :25000
## 1st Qu.: 48700 1st Qu.:45000 1st Qu.: 93000 1st Qu.:34000
## Median : 55000 Median :50000 Median :108000 Median :38000
## Mean : 58584 Mean :52597 Mean :112087 Mean :40078
## 3rd Qu.: 65000 3rd Qu.:60000 3rd Qu.:130000 3rd Qu.:44000
## Max. :126000 Max. :85000 Max. :294000 Max. :80000
## Nongrad_P75 Grad_premium
## Min. : 48000 Min. :-0.0250
## 1st Qu.: 72000 1st Qu.: 0.2308
## Median : 80000 Median : 0.3208
## Mean : 84333 Mean : 0.3285
## 3rd Qu.: 97000 3rd Qu.: 0.4000
## Max. :215000 Max. : 1.6471
# Distribution of the non-graduate unemployment rate across majors
# (default binning is used)
grad_data |>
  ggplot(mapping = aes(x = Nongrad_unemployment_rate)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of the graduate unemployment rate across majors
# (default binning is used)
grad_data |>
  ggplot(mapping = aes(x = Grad_unemployment_rate)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of non-graduate median earnings, in bins 5000 wide
grad_data |>
  ggplot(mapping = aes(x = Nongrad_median)) +
  geom_histogram(binwidth = 5000)
# Distribution of graduate median earnings, in bins 5000 wide
grad_data |>
  ggplot(mapping = aes(x = Grad_median)) +
  geom_histogram(binwidth = 5000)
# Graduate vs. non-graduate unemployment rate, one labelled point per major
grad_data |>
  ggplot(
    mapping = aes(
      x = Grad_unemployment_rate,
      y = Nongrad_unemployment_rate
    )
  ) +
  geom_point() +
  # Small labels, shifted vertically so they do not sit on the points
  geom_text(mapping = aes(label = Major), vjust = -1, size = 1.5)
# Graduate vs. non-graduate median earnings, one labelled point per major
grad_data |>
  ggplot(
    mapping = aes(
      x = Grad_median,
      y = Nongrad_median
    )
  ) +
  geom_point() +
  # Small labels, shifted vertically so they do not sit on the points
  geom_text(mapping = aes(label = Major), vjust = -1, size = 1.5)
# Category-level scatterplot: average graduate unemployment rate (x) against
# average non-graduate unemployment rate (y), one labelled point per
# Major_category.
#
# The avg_cat_* columns were joined onto every major from the per-category
# table, so averaging them within a category simply collapses the data back
# to one row per category.
grad_comb_data |>
  group_by(Major_category) |>
  summarise(
    avg_g_unemployrate = mean(avg_cat_g_unemp_rate),
    avg_ng_unemployrate = mean(avg_cat_ng_unemp_rate)
  ) |>
  ggplot(aes(x = avg_g_unemployrate, y = avg_ng_unemployrate)) +
  geom_point() +
  geom_text(aes(label = Major_category), vjust = 1, size = 2) +
  # Identical limits on both axes so the two rates share one scale
  xlim(0, 0.09) +
  ylim(0, 0.09)
# Distribution of the graduate earnings premium across majors
# (default binning is used)
grad_comb_data |>
  ggplot(mapping = aes(x = Grad_premium)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# List the columns available after the join
names(grad_comb_data)
## [1] "Major" "Major_category"
## [3] "Grad_share" "Nongrad_total"
## [5] "Grad_unemployment_rate" "Nongrad_unemployment_rate"
## [7] "Grad_median" "Nongrad_median"
## [9] "Grad_P25" "Grad_P75"
## [11] "Nongrad_P25" "Nongrad_P75"
## [13] "Grad_premium" "avg_cat_g_unemp_rate"
## [15] "avg_cat_ng_unemp_rate" "avg_cat_grad_premium"
# Spread of the graduate earnings premium within each major category
grad_comb_data |>
  ggplot(mapping = aes(x = Major_category, y = Grad_premium)) +
  geom_boxplot() +
  # Rotate the category labels so they remain legible
  theme(axis.text.x = element_text(angle = 45, hjust = 0.9))