# load data
library(tidyverse)
library(stringr)
library(ggpubr)
library(DATA606)
##
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics
## This package is designed to support this course. The text book used
## is OpenIntro Statistics, 3rd Edition. You can read this by typing
## vignette('os3') or visit www.OpenIntro.org.
##
## The getLabs() function will return a list of the labs available.
##
## The demo(package='DATA606') will list the demos that are available.
# Download the FiveThirtyEight "recent-grads" college-majors CSV, keeping text
# columns as character vectors rather than factors.
raw_data <- read.csv(
  file = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv',
  stringsAsFactors = FALSE
)
# Print the raw table for inspection.
raw_data
Major_Category
, and create a character class storing all STEM majors in the raw dataset.
# Distinct major categories that occur in the raw data.
majors <- unique(raw_data$Major_category)
# Major categories that this analysis classifies as STEM.
stem_majors <- c(
  'Engineering',
  'Physical Sciences',
  'Computers & Mathematics',
  'Agriculture & Natural Resources',
  'Health',
  'Social Science',
  'Biology & Life Science'
)
Is_STEM
, if the major is STEM then STEM
else Non-STEM
. Select column as Major
, Major_category
, Is_STEM
, Unemployment_rate
as output cleaned data.
# Build the analysis table:
#   1. drop every row that has a missing value in any column;
#   2. label a major 'STEM' when its category is listed in stem_majors or its
#      name contains 'TECHNOLOG', otherwise 'Non-STEM';
#   3. keep only the columns used later and sort by major name.
clean_data <- raw_data %>%
  drop_na() %>%
  mutate(
    Is_STEM = case_when(
      Major_category %in% stem_majors | str_detect(Major, 'TECHNOLOG') ~ 'STEM',
      TRUE ~ 'Non-STEM'
    )
  ) %>%
  select(Major, Major_category, Is_STEM, Median, Unemployment_rate) %>%
  arrange(Major)
# Print the cleaned table.
clean_data
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Answer:
Research Question:
Is the average unemployment rate of STEM majors different than that of non-STEM majors?
Can the unemployment rate be predicted by the type of major and the median income?
What are the cases, and how many are there?
Answer: The cases are the employment statistics of each major. After rows with missing values are dropped there are 172 cases (97 STEM and 75 Non-STEM majors).
Describe the method of data collection.
Answer: The data is from the American Community Survey 2010-2012 Public Use Microdata Series.
The American Community Survey (ACS) is an ongoing survey that provides vital information on a yearly basis about our nation and its people. Information from the survey generates data that help determine how more than $675 billion in federal and state funds are distributed each year.
Through the ACS, we know more about jobs and occupations, educational attainment, veterans, whether people own or rent their homes, and other topics.
*Reference: https://www.census.gov/programs-surveys/acs/about.html
What type of study is this (observational/experiment)?
Answer: This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
Answer: The data is from the source below:
https://github.com/fivethirtyeight/data/tree/master/college-majors
What is the response variable? Is it quantitative or qualitative?
Answer: The response variable is Unemployment Rate
. It’s quantitative
You should have two independent variables, one quantitative and one qualitative.
Answer:
The first independent variable is Is_STEM
, which is qualitative;
The second independent variable is Median
(median of income), which is quantitative.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
Is_STEM
contains two values STEM
and Non-STEM
# Horizontal bar chart of the number of majors in each Is_STEM group, with the
# count printed in white inside each bar.
ggplot(clean_data, aes(x = Is_STEM, fill = Is_STEM)) +
  # The constant fill here overrides the Is_STEM fill mapping above.
  geom_bar(color = 'black', fill = 'cyan3') +
  # geom_text() expects `fontface`; the previous `face` argument was ignored
  # with an "unknown parameters" warning, so the labels were not bold.
  geom_text(stat = 'count', aes(label = ..count..), vjust = 0, hjust = 1.5,
            color = 'white', fontface = 'bold') +
  coord_flip() +
  labs(title = 'Number of Cases by Type of Majors') +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position = "none")
median
(median income) is as below:
# Min / quartiles / mean / max of the Median column.
summary(clean_data[['Median']])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22000 33000 36000 40077 45000 110000
# p1: single horizontal boxplot of Median. factor(0) is a dummy x value so all
# rows fall into one box; the y-axis title, labels and ticks are blanked out.
p1 <- ggplot(clean_data, aes(x = factor(0), y = Median)) +
  geom_boxplot(color = 'black', fill = 'cyan1') +
  coord_flip() +
  labs(title = 'Boxplot: Median Income') +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# p2: 30-bin histogram of Median, bars shaded by their count, legend hidden.
p2 <- ggplot(clean_data, aes(x = Median, fill = ..count..)) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_gradient(low = 'cyan4', high = 'cyan1') +
  labs(title = 'Histogram: Median Income') +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )

# Stack the histogram above the boxplot.
ggarrange(p2, p1, nrow = 2)
# The 20 STEM majors with the highest unemployment rate.
# `wt` is given explicitly: without it top_n() ranks on whichever column
# happens to be last in clean_data, so reordering the select() upstream would
# silently change which majors are picked. Ties may yield more than 20 rows.
data_stem_top_20 <- clean_data %>%
  filter(Is_STEM == 'STEM') %>%
  top_n(20, wt = Unemployment_rate)

# Horizontal bars ordered by unemployment rate, labelled with the rate as a
# percentage; darker-to-lighter pink follows the rate.
ggplot(data_stem_top_20,
       aes(x = reorder(Major, Unemployment_rate),
           y = Unemployment_rate,
           fill = Unemployment_rate)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = paste0(round(Unemployment_rate, 4) * 100, '%')),
            vjust = 0.4, hjust = 1, position = position_dodge(width = 1),
            color = "white", size = 3) +
  coord_flip() +
  labs(title = 'STEM Major Unemployment Rate', subtitle = 'Top 20') +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  xlab('STEM Major') +
  ylab('Unemployment Rate') +
  theme(legend.position = "none") +
  scale_fill_gradient(low = 'deeppink4', high = 'deeppink1')
# The 20 STEM majors with the lowest unemployment rate (negative n selects
# from the bottom), i.e. the highest employment rate.
# `wt` is given explicitly: without it top_n() ranks on whichever column
# happens to be last in clean_data, so reordering the select() upstream would
# silently change which majors are picked. Ties may yield more than 20 rows.
data_stem_bottom_20 <- clean_data %>%
  filter(Is_STEM == 'STEM') %>%
  top_n(-20, wt = Unemployment_rate)

# Horizontal bars of employment rate (1 - unemployment rate), ordered by that
# rate and labelled as a percentage.
ggplot(data_stem_bottom_20,
       aes(x = reorder(Major, 1 - Unemployment_rate),
           y = 1 - Unemployment_rate,
           fill = 1 - Unemployment_rate)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = paste0(round(1 - Unemployment_rate, 4) * 100, '%')),
            vjust = 0.4, hjust = 1, position = position_dodge(width = 1),
            color = "white", size = 3) +
  coord_flip() +
  labs(title = 'STEM Major Employment Rate', subtitle = 'Top 20') +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  xlab('STEM Major') +
  ylab('Employment Rate') +
  theme(legend.position = "none")
# The 20 Non-STEM majors with the highest unemployment rate.
# `wt` is given explicitly: without it top_n() ranks on whichever column
# happens to be last in clean_data, so reordering the select() upstream would
# silently change which majors are picked. Ties may yield more than 20 rows.
data_non_stem_top_20 <- clean_data %>%
  filter(Is_STEM == 'Non-STEM') %>%
  top_n(20, wt = Unemployment_rate)

# Horizontal bars ordered by unemployment rate, labelled with the rate as a
# percentage; darker-to-lighter pink follows the rate.
ggplot(data_non_stem_top_20,
       aes(x = reorder(Major, Unemployment_rate),
           y = Unemployment_rate,
           fill = Unemployment_rate)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = paste0(round(Unemployment_rate, 4) * 100, '%')),
            vjust = 0.4, hjust = 1, position = position_dodge(width = 1),
            color = "white", size = 3) +
  coord_flip() +
  labs(title = 'Non-STEM Major Unemployment Rate', subtitle = 'Top 20') +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  xlab('Non-STEM Major') +
  ylab('Unemployment Rate') +
  theme(legend.position = "none") +
  scale_fill_gradient(low = 'deeppink4', high = 'deeppink1')
# The 20 Non-STEM majors with the lowest unemployment rate (negative n selects
# from the bottom), i.e. the highest employment rate.
# `wt` is given explicitly: without it top_n() ranks on whichever column
# happens to be last in clean_data, so reordering the select() upstream would
# silently change which majors are picked. Ties may yield more than 20 rows.
data_non_stem_bottom_20 <- clean_data %>%
  filter(Is_STEM == 'Non-STEM') %>%
  top_n(-20, wt = Unemployment_rate)

# Horizontal bars of employment rate (1 - unemployment rate), ordered by that
# rate and labelled as a percentage.
ggplot(data_non_stem_bottom_20,
       aes(x = reorder(Major, 1 - Unemployment_rate),
           y = 1 - Unemployment_rate,
           fill = 1 - Unemployment_rate)) +
  geom_bar(stat = 'identity') +
  geom_text(aes(label = paste0(round(1 - Unemployment_rate, 4) * 100, '%')),
            vjust = 0.4, hjust = 1, position = position_dodge(width = 1),
            color = "white", size = 3) +
  coord_flip() +
  labs(title = 'Non-STEM Employment Rate', subtitle = 'Top 20') +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5)) +
  xlab('Non-STEM Major') +
  ylab('Employment Rate') +
  theme(legend.position = "none")
# Two-sided hypothesis test on the difference in mean unemployment rate
# between the Is_STEM groups (H0: difference = 0), using the `inference()`
# helper with the theoretical method. The printed output that follows reports
# the group summaries, standard error, test statistic and p-value.
# NOTE(review): `inference()` is presumably the one provided by the DATA606
# package loaded at the top of the file -- confirm.
inference(y = clean_data$Unemployment_rate,
x= clean_data$Is_STEM,
est = 'mean',
type = 'ht',
null = 0,
alternative = 'twosided',
method = 'theoretical',
conflevel = 0.95)
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_Non-STEM = 75, mean_Non-STEM = 0.072, sd_Non-STEM = 0.0278
## n_STEM = 97, mean_STEM = 0.065, sd_STEM = 0.032
## Observed difference between means (Non-STEM-STEM) = 0.007
##
## H0: mu_Non-STEM - mu_STEM = 0
## HA: mu_Non-STEM - mu_STEM != 0
## Standard error = 0.005
## Test statistic: Z = 1.531
## p-value = 0.1256