Data preparation :
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(stringr)
# Load the all-ages college-majors table (first row holds the column names).
all_ages_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv",
  header = TRUE
)
# Show the column names available in the table.
colnames(all_ages_df)
## [1] "Major_code" "Major"
## [3] "Major_category" "Total"
## [5] "Employed" "Employed_full_time_year_round"
## [7] "Unemployed" "Unemployment_rate"
## [9] "Median" "P25th"
## [11] "P75th"
# Keep only the rows whose major category is Engineering, then preview them.
all_ages_df_eng <- filter(all_ages_df, Major_category == "Engineering")
head(all_ages_df_eng)
## Major_code Major Major_category Total Employed
## 1 1401 ARCHITECTURE Engineering 294692 216770
## 2 2400 GENERAL ENGINEERING Engineering 503080 359172
## 3 2401 AEROSPACE ENGINEERING Engineering 65734 44944
## 4 2402 BIOLOGICAL ENGINEERING Engineering 32748 24270
## 5 2403 ARCHITECTURAL ENGINEERING Engineering 19587 13713
## 6 2404 BIOMEDICAL ENGINEERING Engineering 18347 12876
## Employed_full_time_year_round Unemployed Unemployment_rate Median P25th
## 1 163020 20394 0.08599113 63000 40400
## 2 312023 17986 0.04768824 75000 50000
## 3 38491 1969 0.04197131 80000 58000
## 4 18621 1521 0.05897406 62000 40000
## 5 11180 1017 0.06904277 78000 50000
## 6 9202 1105 0.07903583 65000 40000
## P75th
## 1 93500
## 2 100000
## 3 110000
## 4 91000
## 5 102000
## 6 96000
# Load the grad-students college-majors table (first row holds the column
# names).
grad_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv",
  header = TRUE
)
# Show the column names available in the table.
colnames(grad_df)
## [1] "Major_code" "Major"
## [3] "Major_category" "Grad_total"
## [5] "Grad_sample_size" "Grad_employed"
## [7] "Grad_full_time_year_round" "Grad_unemployed"
## [9] "Grad_unemployment_rate" "Grad_median"
## [11] "Grad_P25" "Grad_P75"
## [13] "Nongrad_total" "Nongrad_employed"
## [15] "Nongrad_full_time_year_round" "Nongrad_unemployed"
## [17] "Nongrad_unemployment_rate" "Nongrad_median"
## [19] "Nongrad_P25" "Nongrad_P75"
## [21] "Grad_share" "Grad_premium"
# Keep only the rows whose major category is Engineering, then preview them.
grad_df_eng <- filter(grad_df, Major_category == "Engineering")
head(grad_df_eng)
## Major_code Major Major_category
## 1 2504 MECHANICAL ENGINEERING RELATED TECHNOLOGIES Engineering
## 2 2599 MISCELLANEOUS ENGINEERING TECHNOLOGIES Engineering
## 3 2503 INDUSTRIAL PRODUCTION TECHNOLOGIES Engineering
## 4 2502 ELECTRICAL ENGINEERING TECHNOLOGY Engineering
## 5 2500 ENGINEERING TECHNOLOGIES Engineering
## 6 2403 ARCHITECTURAL ENGINEERING Engineering
## Grad_total Grad_sample_size Grad_employed Grad_full_time_year_round
## 1 6065 111 4442 3669
## 2 14816 315 12433 11146
## 3 19885 408 14752 12467
## 4 28155 521 22501 19707
## 5 11724 219 9471 7958
## 6 6466 143 4857 4264
## Grad_unemployed Grad_unemployment_rate Grad_median Grad_P25 Grad_P75
## 1 310 0.06523569 78000 50000 103000
## 2 407 0.03169782 80000 54000 105000
## 3 603 0.03927060 84500 60000 111000
## 4 1296 0.05446065 85000 60000 110000
## 5 450 0.04535833 74000 48400 105000
## 6 304 0.05890331 78000 56000 110000
## Nongrad_total Nongrad_employed Nongrad_full_time_year_round
## 1 27999 23069 20418
## 2 60571 50092 44199
## 3 81076 64389 56559
## 4 90886 71204 62854
## 5 35992 29092 25129
## 6 18500 12772 10648
## Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25
## 1 998 0.04146757 61000 42000
## 2 3316 0.06208808 65000 43000
## 3 3431 0.05058980 70000 48000
## 4 4210 0.05582518 68000 48000
## 5 1475 0.04825465 65000 40000
## 6 889 0.06507576 80000 52000
## Nongrad_P75 Grad_share Grad_premium
## 1 85000 0.1780472 0.2786885
## 2 90000 0.1965326 0.2307692
## 3 99000 0.1969572 0.2071429
## 4 92000 0.2365152 0.2500000
## 5 94000 0.2457037 0.1384615
## 6 106000 0.2589922 -0.0250000
# Load the recent-grads college-majors table (first row holds the column
# names).
recent_grad_df <- read.csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv",
  header = TRUE
)
# Show the column names available in the table.
colnames(recent_grad_df)
## [1] "Rank" "Major_code" "Major"
## [4] "Total" "Men" "Women"
## [7] "Major_category" "ShareWomen" "Sample_size"
## [10] "Employed" "Full_time" "Part_time"
## [13] "Full_time_year_round" "Unemployed" "Unemployment_rate"
## [16] "Median" "P25th" "P75th"
## [19] "College_jobs" "Non_college_jobs" "Low_wage_jobs"
# Keep only the rows whose major category is Engineering, then preview them.
recent_grad_df_eng <- filter(recent_grad_df, Major_category == "Engineering")
head(recent_grad_df_eng)
## Rank Major_code Major Total Men
## 1 1 2419 PETROLEUM ENGINEERING 2339 2057
## 2 2 2416 MINING AND MINERAL ENGINEERING 756 679
## 3 3 2415 METALLURGICAL ENGINEERING 856 725
## 4 4 2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING 1258 1123
## 5 5 2405 CHEMICAL ENGINEERING 32260 21239
## 6 6 2418 NUCLEAR ENGINEERING 2573 2200
## Women Major_category ShareWomen Sample_size Employed Full_time Part_time
## 1 282 Engineering 0.1205643 36 1976 1849 270
## 2 77 Engineering 0.1018519 7 640 556 170
## 3 131 Engineering 0.1530374 3 648 558 133
## 4 135 Engineering 0.1073132 16 758 1069 150
## 5 11021 Engineering 0.3416305 289 25694 23170 5180
## 6 373 Engineering 0.1449670 17 1857 2038 264
## Full_time_year_round Unemployed Unemployment_rate Median P25th P75th
## 1 1207 37 0.01838053 110000 95000 125000
## 2 388 85 0.11724138 75000 55000 90000
## 3 340 16 0.02409639 73000 50000 105000
## 4 692 40 0.05012531 70000 43000 80000
## 5 16697 1672 0.06109771 65000 50000 75000
## 6 1449 400 0.17722641 65000 50000 102000
## College_jobs Non_college_jobs Low_wage_jobs
## 1 1534 364 193
## 2 350 257 50
## 3 456 176 0
## 4 529 102 0
## 5 18314 4440 972
## 6 1142 657 244
Research Question :
Which college majors offer the best opportunities in terms of unemployment rate and salary?
Cases :
In all three data sources (All Ages, Grad Students, and Recent Grads), each case represents a major offered by colleges and universities. Each data source contains 173 majors.
Data Collection :
Every year, the U.S. Census Bureau contacts over 3.5 million households across the country to participate in the American Community Survey (ACS). The information that the Census Bureau collects helps to determine how more than $675 billion of federal funding each year is spent on infrastructure and services. Through the ACS, data about jobs and occupations, educational attainment, veterans, whether people own or rent their home, and many other topics are available. The American Community Survey (ACS) Public Use Microdata Sample (PUMS) files are a set of untabulated records about individual people or housing units. The Census Bureau produces the PUMS files so that data users can create custom tables that are not available through pretabulated (or summary) ACS data products.
Type of study :
This is an observational study.
Data Source :
The data sets were compiled by FiveThirtyEight, which in turn used data from:
Major categories are from Carnevale et al., “What’s It Worth?: The Economic Value of College Majors.” Georgetown University Center on Education and the Workforce, 2011. http://cew.georgetown.edu/whatsitworth
Response Variable :
The response variable is the college major, which is categorical.
Explanatory Variables :
The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.
Relevant Summary Statistics :
First we will look at overall unemployment rate for the 3 categories: all ages, recent grads, and grad students.
# Five-number summary plus mean of the unemployment rate, all-ages table.
summary(all_ages_df[["Unemployment_rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
# Five-number summary plus mean of the unemployment rate, grad-students table.
summary(grad_df[["Grad_unemployment_rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851
# Five-number summary plus mean of the unemployment rate, recent-grads table.
summary(recent_grad_df[["Unemployment_rate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
# Compare the mean unemployment rate across the three data sets.
#
# `unempl` holds one column per data set (all ages, recent grads, grad
# students) and one row per major. cbind() relies on the three tables having
# the same number of rows; the rows are not necessarily aligned by major, which
# is fine here because only per-column means are used.
unempl <- cbind(
  all_ages_df$Unemployment_rate,
  recent_grad_df$Unemployment_rate,
  grad_df$Grad_unemployment_rate
)

# Plot one bar per data set whose height is the column mean. Passing the whole
# matrix divided by nrow() to barplot() draws stacked bars with one coloured
# segment per major; the total height is the same mean, but the plot is hard
# to read. The rate is on the y-axis, so it is labelled there.
mean_unempl <- colMeans(unempl, na.rm = TRUE)
barplot(
  mean_unempl,
  names.arg = c("All", "Recent Grad", "Grad Student"),
  xlab = "Data set",
  ylab = "Mean unemployment rate",
  col = rainbow(length(mean_unempl))
)
