library(psych)
library(ggplot2)
# Load data from the Longitudinal Employer-Household Dynamics (LEHD) program
# website at the U.S. Census Bureau.
data_source <- 'https://lehd.ces.census.gov/data/pseo/graduate_earnings_all.csv'
# Read the file once: it is fetched over the network, so a second read.csv()
# call on the same URL only repeats the download.
data <- read.csv(data_source)
# There are a lot of columns in this data frame. We will select only the
# relevant ones.
salary_data <- data[, c("deglevl", "deglevl_code", "ciptitle",
                        "p25_earnings", "p50_earnings", "p75_earnings")]
# LEHD reports the 25th, 50th and 75th earnings percentiles. We use the mean of
# these three as a proxy for the center of the distribution. A row with an NA
# in any of the three percentiles gets an NA Salary.
salary_data$Salary <- (salary_data$p25_earnings + salary_data$p50_earnings +
                         salary_data$p75_earnings) / 3
# Drop the raw percentile columns, then give the remaining columns more
# descriptive names.
salary_data <- salary_data[, c("deglevl", "deglevl_code", "ciptitle", "Salary")]
names(salary_data) <- c("Degree", "DegreeCode", "Major", "Salary")
Is education level and/or area of study predictive of future earnings?
Each case represents a student who received a degree or certificate from a Colorado or Texas educational institution. There are 19958 observations in the given data set.
These statistics are generated by matching university transcript data with a national database of jobs. The PSEO are made possible through data-sharing partnerships between universities, university systems, State Departments of Education, State Labor Market Information offices, and the U.S. Census Bureau. PSEO data are currently only available for post-secondary institutions whose transcript data has been made available to the Census Bureau through a data-sharing agreement.
This is an observational study.
Post-Secondary Employment Outcomes (PSEO) (Beta)
Post-Secondary Employment Outcomes (PSEO) are experimental tabulations developed by the Longitudinal Employer-Household Dynamics (LEHD) program at the U.S. Census Bureau.
The response variable is annual salary and is numerical.
The explanatory variables are degree/major and are categorical.
# Overall descriptive statistics for the Salary column.
describe(salary_data[["Salary"]])
## vars n mean sd median trimmed mad min max
## X1 1 9417 51571.09 22715.17 47149 48626.81 17299.47 15990 329249
## range skew kurtosis se
## X1 313259 2.42 14.05 234.08
# Descriptive statistics of Salary broken down by Degree. Only the grouping
# column and the Salary column are passed to describeBy().
salary_data_degree_salary <- subset(salary_data, select = c(Degree, Salary))
describeBy(salary_data_degree_salary, group = "Degree")
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##
## Descriptive statistics by group
## group: Associates
## vars n mean sd median trimmed mad min max
## Degree* 1 2755 1.0 0.00 1.00 1.0 0.0 1 1
## Salary 2 837 43242.5 13308.65 42051.33 42453.7 13922.6 19903 107224
## range skew kurtosis se
## Degree* 0 NaN NaN 0.00
## Salary 87321 0.67 0.72 460.01
## --------------------------------------------------------
## group: Baccalaureate
## vars n mean sd median trimmed mad min
## Degree* 1 9277 2.00 0.00 2.00 2.00 0.00 2.00
## Salary 2 5814 48875.38 19447.51 45357.67 46602.01 15552.97 18319.33
## max range skew kurtosis se
## Degree* 2 0.0 NaN NaN 0.00
## Salary 329249 310929.7 2.62 20.06 255.05
## --------------------------------------------------------
## group: Certificate, <1 year
## vars n mean sd median trimmed mad min
## Degree* 1 2429 3.00 0.00 3.00 3.00 0.00 3
## Salary 2 615 39988.87 14332.36 37367.33 38489.81 13668.58 17668
## max range skew kurtosis se
## Degree* 3.0 0.0 NaN NaN 0.00
## Salary 118519.7 100851.7 1.17 2.14 577.94
## --------------------------------------------------------
## group: Certificate,1-2 years
## vars n mean sd median trimmed mad min
## Degree* 1 2436 4.00 0.00 4.00 4.00 0.00 4
## Salary 2 435 42372.38 13293.13 40114.33 41540.79 13593.96 15990
## max range skew kurtosis se
## Degree* 4.00 0.00 NaN NaN 0.00
## Salary 98569.33 82579.33 0.72 0.64 637.36
## --------------------------------------------------------
## group: Certificate,2-4 years
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Degree* 1 13 5 0 5 5 0 5 5 0 NaN NaN 0
## Salary 2 0 NaN NA NA NaN NA Inf -Inf -Inf NA NA NA
## --------------------------------------------------------
## group: Doctoral - Professional Practice
## vars n mean sd median trimmed mad min
## Degree* 1 138 6.0 0.00 6.00 6.00 0.00 6.00
## Salary 2 103 106951.9 53946.33 94736.33 99118.67 48812.13 50521.33
## max range skew kurtosis se
## Degree* 6.0 0.0 NaN NaN 0.00
## Salary 295536.7 245015.3 1.66 3.33 5315.49
## --------------------------------------------------------
## group: Doctoral - Research/Scholarship
## vars n mean sd median trimmed mad min
## Degree* 1 931 7.00 0.00 7.00 7.00 0.00 7.00
## Salary 2 382 82360.57 26715.19 78024.17 79728.81 23203.93 38687.67
## max range skew kurtosis se
## Degree* 7 0.0 NaN NaN 0.00
## Salary 174588 135900.3 0.94 0.64 1366.87
## --------------------------------------------------------
## group: Masters
## vars n mean sd median trimmed mad min
## Degree* 1 1979 8.00 0.00 8.00 8.0 0.00 8
## Salary 2 1231 64814.41 22573.19 59357.67 62320.8 19113.18 24482
## max range skew kurtosis se
## Degree* 8.0 0.0 NaN NaN 0.00
## Salary 171280.7 146798.7 1.13 1.54 643.37
# Histograms of Salary: first for all records, then one per DegreeCode subset.
# Each aes() maps the bare column name `Salary`, which ggplot2 looks up in the
# data frame passed to ggplot(). Writing `df$Salary` inside aes() bypasses that
# lookup and is discouraged by ggplot2.
ggplot(salary_data, aes(x = Salary)) + geom_histogram()

# Subsets are selected by DegreeCode; which() also drops rows whose code is NA.
salary_data_Certificate_lessthan_year <-
  salary_data[which(salary_data$DegreeCode == 1), ]
ggplot(salary_data_Certificate_lessthan_year, aes(x = Salary)) +
  geom_histogram()

Certificate_1_2_years <- salary_data[which(salary_data$DegreeCode == 2), ]
ggplot(Certificate_1_2_years, aes(x = Salary)) + geom_histogram()

salary_data_Associates <- salary_data[which(salary_data$DegreeCode == 3), ]
ggplot(salary_data_Associates, aes(x = Salary)) + geom_histogram()

Baccalaureate <- salary_data[which(salary_data$DegreeCode == 5), ]
ggplot(Baccalaureate, aes(x = Salary)) + geom_histogram()

Masters <- salary_data[which(salary_data$DegreeCode == 7), ]
ggplot(Masters, aes(x = Salary)) + geom_histogram()

Doctoral_Research_Scholarship <-
  salary_data[which(salary_data$DegreeCode == 17), ]
ggplot(Doctoral_Research_Scholarship, aes(x = Salary)) + geom_histogram()

Doctoral_Professional_Practice <-
  salary_data[which(salary_data$DegreeCode == 18), ]
ggplot(Doctoral_Professional_Practice, aes(x = Salary)) + geom_histogram()