library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(infer)
library(statsr)
## Loading required package: BayesFactor
## Loading required package: coda
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## ************
## Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
## 
## Attaching package: 'statsr'
## 
## The following object is masked from 'package:infer':
## 
##     rep_sample_n
## 
## The following objects are masked from 'package:openintro':
## 
##     calc_streak, evals, nycflights, present

Proposal

With all the focus on maintaining and developing STEM talent, I’ve always wanted to see if STEM degrees command an earnings premium.

For the definition of a STEM degree, I’ll be using DHS’s STEM CIP Code List (found here: https://www.ice.gov/doclib/sevis/pdf/stemList2022.pdf).

In addition, I will map each Major Code to its CIP code using the crosswalk table provided at: https://forum.ipums.org/t/crosswalk-between-degfieldd-and-cip/4209

This table is reportedly sourced from the U.S. Census Bureau.

Data Read In

# Download the FiveThirtyEight "all-ages" college-majors CSV and print a
# per-column summary as a first sanity check of what was loaded.
url.data <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv"
# The connection is handed over unopened; read.csv() opens it for the read
# and closes it again afterwards.
raw <- read.csv(url(url.data), header = TRUE)
summary(raw)
##    Major_code      Major           Major_category         Total        
##  Min.   :1100   Length:173         Length:173         Min.   :   2396  
##  1st Qu.:2403   Class :character   Class :character   1st Qu.:  24280  
##  Median :3608   Mode  :character   Mode  :character   Median :  75791  
##  Mean   :3880                                         Mean   : 230257  
##  3rd Qu.:5503                                         3rd Qu.: 205763  
##  Max.   :6403                                         Max.   :3123510  
##     Employed       Employed_full_time_year_round   Unemployed    
##  Min.   :   1492   Min.   :   1093               Min.   :     0  
##  1st Qu.:  17281   1st Qu.:  12722               1st Qu.:  1101  
##  Median :  56564   Median :  39613               Median :  3619  
##  Mean   : 166162   Mean   : 126308               Mean   :  9725  
##  3rd Qu.: 142879   3rd Qu.: 111025               3rd Qu.:  8862  
##  Max.   :2354398   Max.   :1939384               Max.   :147261  
##  Unemployment_rate     Median           P25th           P75th       
##  Min.   :0.00000   Min.   : 35000   Min.   :24900   Min.   : 45800  
##  1st Qu.:0.04626   1st Qu.: 46000   1st Qu.:32000   1st Qu.: 70000  
##  Median :0.05472   Median : 53000   Median :36000   Median : 80000  
##  Mean   :0.05736   Mean   : 56816   Mean   :38697   Mean   : 82506  
##  3rd Qu.:0.06904   3rd Qu.: 65000   3rd Qu.:42000   3rd Qu.: 95000  
##  Max.   :0.15615   Max.   :125000   Max.   :78000   Max.   :210000
# Compact column-by-column view of `raw`: name, type, and leading values.
dplyr::glimpse(raw)
## Rows: 173
## Columns: 11
## $ Major_code                    <int> 1100, 1101, 1102, 1103, 1104, 1105, 1106…
## $ Major                         <chr> "GENERAL AGRICULTURE", "AGRICULTURE PROD…
## $ Major_category                <chr> "Agriculture & Natural Resources", "Agri…
## $ Total                         <int> 128148, 95326, 33955, 103549, 24280, 794…
## $ Employed                      <int> 90245, 76865, 26321, 81177, 17281, 63043…
## $ Employed_full_time_year_round <int> 74078, 64240, 22810, 64937, 12722, 51077…
## $ Unemployed                    <int> 2423, 2266, 821, 3619, 894, 2070, 264, 2…
## $ Unemployment_rate             <dbl> 0.02614711, 0.02863606, 0.03024832, 0.04…
## $ Median                        <int> 50000, 54000, 63000, 46000, 62000, 50000…
## $ P25th                         <int> 34000, 36000, 40000, 30000, 38500, 35000…
## $ P75th                         <dbl> 80000, 80000, 98000, 72000, 90000, 75000…

Cases

There are 173 cases in this data set. The data consists mostly of summary statistics, effectively grouped by major code.

Type of Study

This is an observational study.

Data Sourcing

There are 3 primary resources:

- Data (538): https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv
- Crosswalk table (U.S. Census): https://forum.ipums.org/t/crosswalk-between-degfieldd-and-cip/4209
- STEM CIP code list (DHS): https://www.ice.gov/doclib/sevis/pdf/stemList2022.pdf

Response Variable

The response variable is the classification of the CIP code (i.e., STEM or not STEM).

Explanatory Variable

The explanatory variable is income, and it is numerical.

Summary Stats

# Per-column summary statistics for the `raw` data frame.
summary(object = raw)
##    Major_code      Major           Major_category         Total        
##  Min.   :1100   Length:173         Length:173         Min.   :   2396  
##  1st Qu.:2403   Class :character   Class :character   1st Qu.:  24280  
##  Median :3608   Mode  :character   Mode  :character   Median :  75791  
##  Mean   :3880                                         Mean   : 230257  
##  3rd Qu.:5503                                         3rd Qu.: 205763  
##  Max.   :6403                                         Max.   :3123510  
##     Employed       Employed_full_time_year_round   Unemployed    
##  Min.   :   1492   Min.   :   1093               Min.   :     0  
##  1st Qu.:  17281   1st Qu.:  12722               1st Qu.:  1101  
##  Median :  56564   Median :  39613               Median :  3619  
##  Mean   : 166162   Mean   : 126308               Mean   :  9725  
##  3rd Qu.: 142879   3rd Qu.: 111025               3rd Qu.:  8862  
##  Max.   :2354398   Max.   :1939384               Max.   :147261  
##  Unemployment_rate     Median           P25th           P75th       
##  Min.   :0.00000   Min.   : 35000   Min.   :24900   Min.   : 45800  
##  1st Qu.:0.04626   1st Qu.: 46000   1st Qu.:32000   1st Qu.: 70000  
##  Median :0.05472   Median : 53000   Median :36000   Median : 80000  
##  Mean   :0.05736   Mean   : 56816   Mean   :38697   Mean   : 82506  
##  3rd Qu.:0.06904   3rd Qu.: 65000   3rd Qu.:42000   3rd Qu.: 95000  
##  Max.   :0.15615   Max.   :125000   Max.   :78000   Max.   :210000

Pretty Charts

# Density histogram of the `Median` column with a normal curve overlaid, to
# eyeball how closely the distribution follows a normal shape.
# The curve's parameters get their own names so that base::mean() and
# stats::sd() are not shadowed by numeric variables.
median_mean <- mean(raw$Median)
median_sd <- sd(raw$Median)
ggplot(data = raw, aes(x = Median)) +
  # after_stat(density) is the supported spelling of the older `..density..`;
  # density scaling puts the bars on the same scale as dnorm().
  # Binning is left at the stat_bin() default (30 bins); pass `binwidth` to
  # make a deliberate choice.
  geom_histogram(aes(y = after_stat(density))) +
  # `args` is forwarded to dnorm() and is documented as a list.
  stat_function(
    fun = dnorm,
    args = list(mean = median_mean, sd = median_sd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.