# load data
library(tidyr)
library(dplyr)
library(tidyverse)
library(stringr)
library(ggplot2)
# Read the three college-majors tables straight from the FiveThirtyEight
# GitHub repository, keeping text columns as character vectors.

# temp1: all ages
temp1 <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv",
  stringsAsFactors = FALSE
)

# temp2: graduate students with ages>25
temp2 <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv",
  stringsAsFactors = FALSE
)

# temp3: recent graduates with ages<28
temp3 <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv",
  stringsAsFactors = FALSE
)
# see the list of all 16 major categories
# unique() prints each category once; the bare column would print one entry
# per row of temp1 instead of the category list.
unique(temp1$Major_category)

# categorize stem, business and liberal arts
# These vectors drive the Type code assigned in the pipelines below.
stem <- c(
  "Agriculture & Natural Resources",
  "Biology & Life Science",
  "Health",
  "Computers & Mathematics",
  "Engineering",
  "Physical Sciences",
  "Social Science"
)
business <- c("Business")
liberalarts <- c(
  "Communications & Journalism",
  "Education",
  "Industrial Arts & Consumer Services",
  "Law & Public Policy",
  "Psychology & Social Work",
  "Arts",
  "Humanities & Liberal Arts",
  "Interdisciplinary"
)
# all_ages
# Build the all-ages analysis table from temp1:
#   * Type        - "L" for liberal-arts categories, "S" for STEM categories,
#                   "B" for everything else.
#   * Mean_median - mean of Median within each Major_category, rounded to a
#                   whole number and repeated on every row of that category.
# Only the columns used later are kept.
all_ages <- temp1 %>%
  #drop_na() %>%
  mutate(Type = case_when(
    Major_category %in% liberalarts ~ "L",
    Major_category %in% stem ~ "S",
    TRUE ~ "B" # fallback: any category not listed above is coded as business
  )) %>%
  arrange(Major_category) %>%
  group_by(Major_category) %>%
  mutate(Mean_median = round(mean(Median), 0)) %>%
  # Drop the grouping once the per-category mean exists, so later verbs applied
  # to all_ages do not silently operate per Major_category.
  ungroup() %>%
  select(
    Major_category, Type, Employed, Unemployed, Unemployment_rate,
    Median, P25th, P75th, Mean_median
  )
all_ages
# grad_students
# Build the graduate-student analysis table from temp2:
#   * Type             - "L" for liberal-arts categories, "S" for STEM
#                        categories, "B" for everything else.
#   * Mean_grad_median - mean of Grad_median within each Major_category,
#                        rounded to a whole number and repeated on every row
#                        of that category.
# Only the columns used later are kept.
grad_students <- temp2 %>%
  #drop_na() %>%
  mutate(Type = case_when(
    Major_category %in% liberalarts ~ "L",
    Major_category %in% stem ~ "S",
    TRUE ~ "B" # fallback: any category not listed above is coded as business
  )) %>%
  arrange(Major_category) %>%
  group_by(Major_category) %>%
  mutate(Mean_grad_median = round(mean(Grad_median), 0)) %>%
  # Drop the grouping once the per-category mean exists, so later verbs applied
  # to grad_students do not silently operate per Major_category.
  ungroup() %>%
  select(
    Major_category, Type, Grad_employed, Grad_unemployed,
    Grad_unemployment_rate, Grad_median, Grad_P25, Grad_P75, Mean_grad_median
  )
grad_students
# Build the recent-graduates analysis table from temp3:
#   * Type          - "L" for liberal-arts categories, "S" for STEM categories,
#                     "B" for everything else.
#   * Mean_median   - mean of Median within each Major_category, rounded to a
#                     whole number.
#   * Mean_jobindex - mean within each Major_category of
#                     Non_college_jobs / (Non_college_jobs + College_jobs),
#                     rounded to 4 decimals.
# Both means are repeated on every row of their category. Only the columns
# used later are kept.
recent_grads <- temp3 %>%
  #drop_na() %>%
  mutate(Type = case_when(
    Major_category %in% liberalarts ~ "L",
    Major_category %in% stem ~ "S",
    TRUE ~ "B" # fallback: any category not listed above is coded as business
  )) %>%
  arrange(Major_category) %>%
  group_by(Major_category) %>%
  mutate(
    Mean_median = round(mean(Median), 0),
    # The ifelse() guard yields 0 when Non_college_jobs is 0, which avoids a
    # 0 / 0 = NaN ratio for rows where both job counts are zero.
    Mean_jobindex = round(
      mean(ifelse(
        Non_college_jobs == 0,
        0,
        Non_college_jobs / (Non_college_jobs + College_jobs)
      )),
      4
    )
  ) %>%
  # Drop the grouping once the per-category means exist, so later verbs
  # applied to recent_grads do not silently operate per Major_category.
  ungroup() %>%
  select(
    Rank, Major_category, Type, Employed, Unemployed, Unemployment_rate,
    Median, P25th, P75th, College_jobs, Non_college_jobs,
    Mean_jobindex, Mean_median
  )
recent_grads
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
As STEM majors are becoming more popular, I would like to study the relationship between median income and the major categories, grouped into STEM, business, and liberal arts.
Also, among recent graduates, I would like to study the relationship between median income and the share of non-college jobs, Non_college_jobs / (Non_college_jobs + College_jobs), averaged within each major category (Mean_jobindex), across the major categories in STEM, business, and liberal arts.
What are the cases, and how many are there?
All_ages: There are 173 cases in total, with no age restriction.
Grad_students: There are 173 cases in total, restricted to ages >25.
Recent_grads: There are 173 cases in total, restricted to ages <28. Although one row has some NA values, the columns that contain NA are not needed for this analysis, so I do not have to drop that row.
Each case represents one unique major (identified by its major_code) offered by colleges in the United States.
Describe the method of data collection.
Every year, the U.S. Census Bureau contacts over 3.5 million households across the country to participate in the American Community Survey (ACS). ACS is an ongoing survey that provides vital information on a yearly basis about our nation and its people. Information from the survey generates data that help determine how more than $675 billion in federal and state funds are distributed each year.
Through the ACS, we know more about jobs and occupations, educational attainment, veterans, whether people own or rent their homes, and other topics. Public officials, planners, and entrepreneurs use this information to assess the past and plan the future. (https://www.census.gov/programs-surveys/acs/about.html)
All three main .csv datasets are from American Community Survey 2010-2012 Public Use Microdata Series. They contain basic earnings and labor force information.
Download data here: http://www.census.gov/programs-surveys/acs/data/pums.html
Documentation here: http://www.census.gov/programs-surveys/acs/technical-documentation/pums.html
What type of study is this (observational/experiment)?
This study is observational.
If you collected the data, state self-collected. If not, provide a citation/link.
The datasets were obtained from github: https://github.com/fivethirtyeight/data/tree/master/college-majors.
What is the response variable? Is it quantitative or qualitative?
My response variable is Median income. It is quantitative.
You should have two independent variables, one quantitative and one qualitative.
Major_category is qualitative. Mean_jobindex is quantitative.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plots, boxplots, etc.). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
## Major_category Type Employed Unemployed
## Length:173 Length:173 Min. : 1492 Min. : 0
## Class :character Class :character 1st Qu.: 17281 1st Qu.: 1101
## Mode :character Mode :character Median : 56564 Median : 3619
## Mean : 166162 Mean : 9725
## 3rd Qu.: 142879 3rd Qu.: 8862
## Max. :2354398 Max. :147261
## Unemployment_rate Median P25th P75th
## Min. :0.00000 Min. : 35000 Min. :24900 Min. : 45800
## 1st Qu.:0.04626 1st Qu.: 46000 1st Qu.:32000 1st Qu.: 70000
## Median :0.05472 Median : 53000 Median :36000 Median : 80000
## Mean :0.05736 Mean : 56816 Mean :38697 Mean : 82506
## 3rd Qu.:0.06904 3rd Qu.: 65000 3rd Qu.:42000 3rd Qu.: 95000
## Max. :0.15615 Max. :125000 Max. :78000 Max. :210000
## Mean_median
## Min. :43000
## 1st Qu.:46080
## Median :53222
## Mean :56816
## 3rd Qu.:62400
## Max. :77759
# Median Income from All Ages
# Histogram (30 bins) of Median over the rows of all_ages; each bar is shaded
# by its own count. after_stat(count) replaces the deprecated ..count..
# notation.
ggplot(all_ages, aes(x = Median, fill = after_stat(count))) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  ggtitle("Median Income from All Ages") +
  xlab("Median Salary") +
  ylab("Count")
# Mean of Median Income from All Ages by Major Category
# Horizontal bar chart of Mean_median per Major_category, ordered by value.
# all_ages repeats Mean_median on every major of a category, so plotting it
# directly draws each bar and label once per major on top of itself.
# distinct() reduces the data to one row per category first.
all_ages %>%
  distinct(Major_category, Mean_median) %>%
  ggplot(aes(
    x = reorder(Major_category, Mean_median),
    y = Mean_median,
    fill = Mean_median
  )) +
  geom_col(width = 0.8) +
  coord_flip() +
  ggtitle("Mean of Median Income from All Ages by Major Category") +
  xlab("Major Category") +
  ylab("Median Salary") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  # White value label placed just inside the end of each bar.
  geom_text(
    aes(label = Mean_median),
    color = "white", size = 3, vjust = 0.4, hjust = 1.1
  )
## Major_category Type Grad_employed Grad_unemployed
## Length:173 Length:173 Min. : 1008 Min. : 0
## Class :character Class :character 1st Qu.: 12659 1st Qu.: 453
## Mode :character Mode :character Median : 28930 Median : 1179
## Mean : 94037 Mean : 3506
## 3rd Qu.:109944 3rd Qu.: 3329
## Max. :915341 Max. :35718
## Grad_unemployment_rate Grad_median Grad_P25 Grad_P75
## Min. :0.00000 Min. : 47000 Min. :24500 Min. : 65000
## 1st Qu.:0.02607 1st Qu.: 65000 1st Qu.:45000 1st Qu.: 93000
## Median :0.03665 Median : 75000 Median :50000 Median :108000
## Mean :0.03934 Mean : 76756 Mean :52597 Mean :112087
## 3rd Qu.:0.04805 3rd Qu.: 90000 3rd Qu.:60000 3rd Qu.:130000
## Max. :0.13851 Max. :135000 Max. :85000 Max. :294000
## Mean_grad_median
## Min. :55000
## 1st Qu.:66571
## Median :80292
## Mean :76756
## 3rd Qu.:85545
## Max. :94328
# Median Income from Graduate Students
# Histogram (30 bins) of Grad_median over the rows of grad_students; each bar
# is shaded by its own count. after_stat(count) replaces the deprecated
# ..count.. notation.
ggplot(grad_students, aes(x = Grad_median, fill = after_stat(count))) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  ggtitle("Median Income from Grad Students") +
  xlab("Median Salary") +
  ylab("Count")
# Mean of Median Income from Grad Students by Major Category
# Horizontal bar chart of Mean_grad_median per Major_category, ordered by
# value. grad_students repeats Mean_grad_median on every major of a category,
# so plotting it directly draws each bar and label once per major on top of
# itself. distinct() reduces the data to one row per category first.
grad_students %>%
  distinct(Major_category, Mean_grad_median) %>%
  ggplot(aes(
    x = reorder(Major_category, Mean_grad_median),
    y = Mean_grad_median,
    fill = Mean_grad_median
  )) +
  geom_col(width = 0.8) +
  coord_flip() +
  ggtitle("Mean of Median Income from Grad Students by Major Category") +
  xlab("Major Category") +
  ylab("Median Salary") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  # White value label placed just inside the end of each bar.
  geom_text(
    aes(label = Mean_grad_median),
    color = "white", size = 3, vjust = 0.4, hjust = 1.1
  )
## Rank Major_category Type Employed
## Min. : 1 Length:173 Length:173 Min. : 0
## 1st Qu.: 44 Class :character Class :character 1st Qu.: 3608
## Median : 87 Mode :character Mode :character Median : 11797
## Mean : 87 Mean : 31193
## 3rd Qu.:130 3rd Qu.: 31433
## Max. :173 Max. :307933
## Unemployed Unemployment_rate Median P25th
## Min. : 0 Min. :0.00000 Min. : 22000 Min. :18500
## 1st Qu.: 304 1st Qu.:0.05031 1st Qu.: 33000 1st Qu.:24000
## Median : 893 Median :0.06796 Median : 36000 Median :27000
## Mean : 2416 Mean :0.06819 Mean : 40151 Mean :29501
## 3rd Qu.: 2393 3rd Qu.:0.08756 3rd Qu.: 45000 3rd Qu.:33000
## Max. :28169 Max. :0.17723 Max. :110000 Max. :95000
## P75th College_jobs Non_college_jobs Mean_jobindex
## Min. : 22000 Min. : 0 Min. : 0 Min. :0.2863
## 1st Qu.: 42000 1st Qu.: 1675 1st Qu.: 1591 1st Qu.:0.3201
## Median : 47000 Median : 4390 Median : 4595 Median :0.4839
## Mean : 51494 Mean : 12323 Mean : 13284 Mean :0.4894
## 3rd Qu.: 60000 3rd Qu.: 14444 3rd Qu.: 11783 3rd Qu.:0.6145
## Max. :125000 Max. :151643 Max. :148395 Max. :0.7027
## Mean_median
## Min. :30100
## 1st Qu.:33062
## Median :36900
## Mean :40151
## 3rd Qu.:42745
## Max. :57383
# Median Income from Recent Graduates
# Histogram (30 bins) of Median over the rows of recent_grads; each bar is
# shaded by its own count. after_stat(count) replaces the deprecated
# ..count.. notation.
ggplot(recent_grads, aes(x = Median, fill = after_stat(count))) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  ggtitle("Median Income from Recent Grads") +
  xlab("Median Salary") +
  ylab("Count")
# Mean of Median Income from Recent Graduates by Major Category
# Horizontal bar chart of Mean_median per Major_category, ordered by value.
# recent_grads repeats Mean_median on every major of a category, so plotting
# it directly draws each bar and label once per major on top of itself.
# distinct() reduces the data to one row per category first.
recent_grads %>%
  distinct(Major_category, Mean_median) %>%
  ggplot(aes(
    x = reorder(Major_category, Mean_median),
    y = Mean_median,
    fill = Mean_median
  )) +
  geom_col(width = 0.8) +
  coord_flip() +
  ggtitle("Mean of Median Income from Recent Graduates by Major Category") +
  xlab("Major Category") +
  ylab("Median Salary") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  # White value label placed just inside the end of each bar.
  geom_text(
    aes(label = Mean_median),
    color = "white", size = 3, vjust = 0.4, hjust = 1.1
  )
# Mean of Job Index from Recent Graduates by Major Category
# Horizontal bar chart of Mean_jobindex per Major_category, ordered by value.
# recent_grads repeats Mean_jobindex on every major of a category, so plotting
# it directly draws each bar and label once per major on top of itself.
# distinct() reduces the data to one row per category first.
recent_grads %>%
  distinct(Major_category, Mean_jobindex) %>%
  ggplot(aes(
    x = reorder(Major_category, Mean_jobindex),
    y = Mean_jobindex,
    fill = Mean_jobindex
  )) +
  geom_col(width = 0.8) +
  coord_flip() +
  ggtitle("Mean of Job Index from Recent Graduates by Major Category") +
  xlab("Major Category") +
  ylab("Job Index") +
  scale_fill_gradient(low = "royalblue4", high = "royalblue1") +
  # White value label placed just inside the end of each bar.
  geom_text(
    aes(label = Mean_jobindex),
    color = "white", size = 3, vjust = 0.4, hjust = 1.1
  )