Data Preparation

library(psych)
library(ggplot2)
# Load data from the Longitudinal Employer-Household Dynamics (LEHD) program
# website at the U.S. Census Bureau.
data_source <- 'https://lehd.ces.census.gov/data/pseo/graduate_earnings_all.csv'

# Download the file once; the full table stays available in `data`.
data <- read.csv(data_source)

# There are a lot of columns in this data frame. We select only the degree
# level, its numeric code, the major title and the three earnings percentiles.
salary_data <- data[, c('deglevl', 'deglevl_code', 'ciptitle',
                        'p25_earnings', 'p50_earnings', 'p75_earnings')]

# LEHD reports the 25th, 50th and 75th earnings percentiles. We use the mean of
# these three values as a single proxy for the center of the distribution.
# A row with any missing percentile gets an NA Salary.
salary_data$Salary <- (salary_data$p25_earnings +
                         salary_data$p50_earnings +
                         salary_data$p75_earnings) / 3

# Drop the raw percentile columns and give the remaining columns more
# descriptive names.
salary_data <- salary_data[, c('deglevl', 'deglevl_code', 'ciptitle', 'Salary')]
names(salary_data) <- c('Degree', 'DegreeCode', 'Major', 'Salary')

Research question

Is education level and/or area of study predictive of future earnings?

Cases

Each case represents a student who received a degree or certificate from a Colorado or Texas educational institution. There are 19958 observations in the given data set.

Data collection

These statistics are generated by matching university transcript data with a national database of jobs. The PSEO are made possible through data-sharing partnerships between universities, university systems, State Departments of Education, State Labor Market Information offices, and the U.S. Census Bureau. PSEO data are currently only available for post-secondary institutions whose transcript data has been made available to the Census Bureau through a data-sharing agreement.

Type of study

This is an observational study.

Data Source

Post-Secondary Employment Outcomes (PSEO) (Beta)

Post-Secondary Employment Outcomes (PSEO) are experimental tabulations developed by the Longitudinal Employer-Household Dynamics (LEHD) program at the U.S. Census Bureau.

https://lehd.ces.census.gov/data/pseo_beta.html

Response

The response variable is annual salary and is numerical.

Explanatory

The explanatory variables are degree level and major; both are categorical.

Relevant summary statistics

# Overall descriptive statistics for the salary proxy.
describe(salary_data[['Salary']])
##    vars    n     mean       sd median  trimmed      mad   min    max
## X1    1 9417 51571.09 22715.17  47149 48626.81 17299.47 15990 329249
##     range skew kurtosis     se
## X1 313259 2.42    14.05 234.08
# Descriptive statistics of the salary proxy, broken down by degree level.
# The grouping column is part of the described data frame, so it shows up as
# its own row in each group's table.
salary_data_degree_salary <- salary_data[c('Degree', 'Salary')]
describeBy(salary_data_degree_salary, group = 'Degree')
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## 
##  Descriptive statistics by group 
## group: Associates
##         vars    n    mean       sd   median trimmed     mad   min    max
## Degree*    1 2755     1.0     0.00     1.00     1.0     0.0     1      1
## Salary     2  837 43242.5 13308.65 42051.33 42453.7 13922.6 19903 107224
##         range skew kurtosis     se
## Degree*     0  NaN      NaN   0.00
## Salary  87321 0.67     0.72 460.01
## -------------------------------------------------------- 
## group: Baccalaureate
##         vars    n     mean       sd   median  trimmed      mad      min
## Degree*    1 9277     2.00     0.00     2.00     2.00     0.00     2.00
## Salary     2 5814 48875.38 19447.51 45357.67 46602.01 15552.97 18319.33
##            max    range skew kurtosis     se
## Degree*      2      0.0  NaN      NaN   0.00
## Salary  329249 310929.7 2.62    20.06 255.05
## -------------------------------------------------------- 
## group: Certificate, <1 year
##         vars    n     mean       sd   median  trimmed      mad   min
## Degree*    1 2429     3.00     0.00     3.00     3.00     0.00     3
## Salary     2  615 39988.87 14332.36 37367.33 38489.81 13668.58 17668
##              max    range skew kurtosis     se
## Degree*      3.0      0.0  NaN      NaN   0.00
## Salary  118519.7 100851.7 1.17     2.14 577.94
## -------------------------------------------------------- 
## group: Certificate,1-2 years
##         vars    n     mean       sd   median  trimmed      mad   min
## Degree*    1 2436     4.00     0.00     4.00     4.00     0.00     4
## Salary     2  435 42372.38 13293.13 40114.33 41540.79 13593.96 15990
##              max    range skew kurtosis     se
## Degree*     4.00     0.00  NaN      NaN   0.00
## Salary  98569.33 82579.33 0.72     0.64 637.36
## -------------------------------------------------------- 
## group: Certificate,2-4 years
##         vars  n mean sd median trimmed mad min  max range skew kurtosis se
## Degree*    1 13    5  0      5       5   0   5    5     0  NaN      NaN  0
## Salary     2  0  NaN NA     NA     NaN  NA Inf -Inf  -Inf   NA       NA NA
## -------------------------------------------------------- 
## group: Doctoral - Professional Practice
##         vars   n     mean       sd   median  trimmed      mad      min
## Degree*    1 138      6.0     0.00     6.00     6.00     0.00     6.00
## Salary     2 103 106951.9 53946.33 94736.33 99118.67 48812.13 50521.33
##              max    range skew kurtosis      se
## Degree*      6.0      0.0  NaN      NaN    0.00
## Salary  295536.7 245015.3 1.66     3.33 5315.49
## -------------------------------------------------------- 
## group: Doctoral - Research/Scholarship
##         vars   n     mean       sd   median  trimmed      mad      min
## Degree*    1 931     7.00     0.00     7.00     7.00     0.00     7.00
## Salary     2 382 82360.57 26715.19 78024.17 79728.81 23203.93 38687.67
##            max    range skew kurtosis      se
## Degree*      7      0.0  NaN      NaN    0.00
## Salary  174588 135900.3 0.94     0.64 1366.87
## -------------------------------------------------------- 
## group: Masters
##         vars    n     mean       sd   median trimmed      mad   min
## Degree*    1 1979     8.00     0.00     8.00     8.0     0.00     8
## Salary     2 1231 64814.41 22573.19 59357.67 62320.8 19113.18 24482
##              max    range skew kurtosis     se
## Degree*      8.0      0.0  NaN      NaN   0.00
## Salary  171280.7 146798.7 1.13     1.54 643.37
# Histograms of the salary proxy: first for all rows, then one per degree-level
# subset. Inside aes() the column is referenced by its bare name so ggplot2
# looks it up in the data frame passed to ggplot(); this also gives a clean
# "Salary" axis label. bins = 30 is ggplot2's default bin count, stated
# explicitly so the histogram is unchanged but no longer emits the
# "pick better value" message. Rows with a missing Salary are dropped by
# ggplot2 with a warning.
ggplot(salary_data, aes(x = Salary)) + geom_histogram(bins = 30)

# Each subset below is selected by DegreeCode; which() skips rows whose code is
# NA. The variable names record the degree level each code is taken to mean.
salary_data_Certificate_lessthan_year <- salary_data[which(salary_data$DegreeCode == 1), ]
ggplot(salary_data_Certificate_lessthan_year, aes(x = Salary)) + geom_histogram(bins = 30)

Certificate_1_2_years <- salary_data[which(salary_data$DegreeCode == 2), ]
ggplot(Certificate_1_2_years, aes(x = Salary)) + geom_histogram(bins = 30)

salary_data_Associates <- salary_data[which(salary_data$DegreeCode == 3), ]
ggplot(salary_data_Associates, aes(x = Salary)) + geom_histogram(bins = 30)

Baccalaureate <- salary_data[which(salary_data$DegreeCode == 5), ]
ggplot(Baccalaureate, aes(x = Salary)) + geom_histogram(bins = 30)

Masters <- salary_data[which(salary_data$DegreeCode == 7), ]
ggplot(Masters, aes(x = Salary)) + geom_histogram(bins = 30)

Doctoral_Research_Scholarship <- salary_data[which(salary_data$DegreeCode == 17), ]
ggplot(Doctoral_Research_Scholarship, aes(x = Salary)) + geom_histogram(bins = 30)

Doctoral_Professional_Practice <- salary_data[which(salary_data$DegreeCode == 18), ]
ggplot(Doctoral_Professional_Practice, aes(x = Salary)) + geom_histogram(bins = 30)