DATA 606 Data Project Proposal

Data Preparation

# load data
library(tidyverse)

# Location of the raw CSV in FiveThirtyEight's public GitHub data repository.
fp <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv"

# Read the CSV; the data-preparation steps below start from grad_input.
grad_input <- read_csv(file = fp)

# Keep only the columns used in the analysis: identifiers, graduate share,
# unemployment rates, salary quartiles and the graduate salary premium.
grad_data <- select(
  grad_input,
  Major,
  Major_category,
  Grad_share,
  Nongrad_total,
  Grad_unemployment_rate,
  Nongrad_unemployment_rate,
  Grad_median,
  Nongrad_median,
  Grad_P25,
  Grad_P75,
  Nongrad_P25,
  Nongrad_P75,
  Grad_premium
)

# Category-level averages, one row per Major_category.
# For every major the unemployment rate is unemployed / (unemployed + employed)
# and the premium is the relative gap between the graduate and non-graduate
# median salary; each category value is the unweighted mean over its majors.
grad_cat_stats <- grad_input |>
  group_by(Major_category) |>
  summarise(
    avg_cat_g_unemp_rate = mean(
      Grad_unemployed / (Grad_unemployed + Grad_employed)
    ),
    avg_cat_ng_unemp_rate = mean(
      Nongrad_unemployed / (Nongrad_unemployed + Nongrad_employed)
    ),
    avg_cat_grad_premium = mean(
      (Grad_median - Nongrad_median) / Nongrad_median
    )
  )

# Attach the category-level averages to every major of that category.
grad_comb_data <- left_join(grad_data, grad_cat_stats, by = "Major_category")

#head(grad_comb_data)

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

What premium in salary can be expected by completing a graduate degree? Is there a statistically significant difference between graduate and non-graduate salaries?

Cases

What are the cases, and how many are there?

The cases are majors offered as graduate degrees and there are 173 observations.

Data collection

Describe the method of data collection.

The data used in the FiveThirtyEight article is based on the American Community Survey 2010-2012 public dataset.

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

The data was downloaded from FiveThirtyEight's GitHub data repository; the FiveThirtyEight author derived it from the American Community Survey. Source: https://github.com/fivethirtyeight/data/tree/master/college-majors

Dependent Variable

What is the response variable? Is it quantitative or qualitative?

The response variable is quantitative: it is the salary premium associated with completing a graduate degree, recorded in the data as Grad_premium.

Independent Variable(s)

The explanatory variables are as follows:

the median salary of graduate degree holders with that major (Grad_median): numeric
the median salary of non-graduate degree holders with the same major (Nongrad_median): numeric
the graduate unemployment rate: numeric
the non-graduate unemployment rate: numeric

Relevant summary statistics

Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

# Per-column summary of the selected analysis variables.
grad_data |>
  summary()

##     Major           Major_category       Grad_share      Nongrad_total    
##  Length:173         Length:173         Min.   :0.09632   Min.   :   2232  
##  Class :character   Class :character   1st Qu.:0.26757   1st Qu.:  20564  
##  Mode  :character   Mode  :character   Median :0.39875   Median :  68993  
##                                        Mean   :0.40059   Mean   : 214720  
##                                        3rd Qu.:0.49912   3rd Qu.: 184971  
##                                        Max.   :0.93117   Max.   :2996892  
##  Grad_unemployment_rate Nongrad_unemployment_rate  Grad_median    
##  Min.   :0.00000        Min.   :0.00000           Min.   : 47000  
##  1st Qu.:0.02607        1st Qu.:0.04198           1st Qu.: 65000  
##  Median :0.03665        Median :0.05103           Median : 75000  
##  Mean   :0.03934        Mean   :0.05395           Mean   : 76756  
##  3rd Qu.:0.04805        3rd Qu.:0.06439           3rd Qu.: 90000  
##  Max.   :0.13851        Max.   :0.16091           Max.   :135000  
##  Nongrad_median      Grad_P25        Grad_P75       Nongrad_P25   
##  Min.   : 37000   Min.   :24500   Min.   : 65000   Min.   :25000  
##  1st Qu.: 48700   1st Qu.:45000   1st Qu.: 93000   1st Qu.:34000  
##  Median : 55000   Median :50000   Median :108000   Median :38000  
##  Mean   : 58584   Mean   :52597   Mean   :112087   Mean   :40078  
##  3rd Qu.: 65000   3rd Qu.:60000   3rd Qu.:130000   3rd Qu.:44000  
##  Max.   :126000   Max.   :85000   Max.   :294000   Max.   :80000  
##   Nongrad_P75      Grad_premium    
##  Min.   : 48000   Min.   :-0.0250  
##  1st Qu.: 72000   1st Qu.: 0.2308  
##  Median : 80000   Median : 0.3208  
##  Mean   : 84333   Mean   : 0.3285  
##  3rd Qu.: 97000   3rd Qu.: 0.4000  
##  Max.   :215000   Max.   : 1.6471

# Histogram of the non-graduate unemployment rate, one observation per major.
grad_data |>
  ggplot(mapping = aes(x = Nongrad_unemployment_rate)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the graduate unemployment rate, one observation per major.
grad_data |>
  ggplot(mapping = aes(x = Grad_unemployment_rate)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of non-graduate median salaries, using bins 5000 wide.
grad_data |>
  ggplot(mapping = aes(x = Nongrad_median)) +
  geom_histogram(binwidth = 5000)

# Histogram of graduate median salaries, using the same 5000-wide bins.
grad_data |>
  ggplot(mapping = aes(x = Grad_median)) +
  geom_histogram(binwidth = 5000)

# Graduate vs. non-graduate unemployment rate; each point is a major and is
# labelled with the major's name.
grad_data |>
  ggplot(
    mapping = aes(x = Grad_unemployment_rate, y = Nongrad_unemployment_rate)
  ) +
  geom_point() +
  geom_text(mapping = aes(label = Major), vjust = -1, size = 1.5)

# Graduate vs. non-graduate median salary, labelled the same way.
grad_data |>
  ggplot(mapping = aes(x = Grad_median, y = Nongrad_median)) +
  geom_point() +
  geom_text(mapping = aes(label = Major), vjust = -1, size = 1.5)

# Category-level graduate vs. non-graduate unemployment rate, one labelled
# point per major category.
# The avg_cat_* columns were joined in per category, so they are constant
# within each group and the mean only collapses the rows to one per category.
# The native pipe is used throughout, matching the other pipelines in this
# file; the previously computed row count was never used and is dropped.
grad_comb_data |>
  group_by(Major_category) |>
  summarise(
    avg_g_unemployrate = mean(avg_cat_g_unemp_rate),
    avg_ng_unemployrate = mean(avg_cat_ng_unemp_rate)
  ) |>
  ggplot(aes(x = avg_g_unemployrate, y = avg_ng_unemployrate)) +
  geom_point() +
  geom_text(aes(label = Major_category), vjust = 1, size = 2) +
  # Same limits on both axes so the two rates are drawn on a common scale.
  xlim(0, 0.09) +
  ylim(0, 0.09)

# Histogram of the graduate salary premium across majors.
grad_comb_data |>
  ggplot(mapping = aes(x = Grad_premium)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# List the columns available after joining in the category-level averages.
names(grad_comb_data)

##  [1] "Major"                     "Major_category"           
##  [3] "Grad_share"                "Nongrad_total"            
##  [5] "Grad_unemployment_rate"    "Nongrad_unemployment_rate"
##  [7] "Grad_median"               "Nongrad_median"           
##  [9] "Grad_P25"                  "Grad_P75"                 
## [11] "Nongrad_P25"               "Nongrad_P75"              
## [13] "Grad_premium"              "avg_cat_g_unemp_rate"     
## [15] "avg_cat_ng_unemp_rate"     "avg_cat_grad_premium"

# Box plot of the graduate salary premium for each major category.
grad_comb_data |>
  ggplot(mapping = aes(x = Major_category, y = Grad_premium)) +
  geom_boxplot() +
  # Rotate the category labels so they do not overlap along the x axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 0.9))