Data 606-Project Proposal

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(psych)
library(stringr)

# Read the all-ages CSV from GitHub, convert it to a tibble and sort the rows
# by major category.
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/all-ages.csv"
all_ages <- data_url %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # tbl_df() is deprecated in dplyr; as_tibble() replaces it
  arrange(Major_category)

# Keep a single major category and preview its first rows.
all_ages_degree <- all_ages %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(all_ages_degree)

## # A tibble: 6 x 11
##   Major_code Major Major_category  Total Employed Employed_full_t~
##        <int> <chr> <chr>           <int>    <int>            <int>
## 1       1100 GENE~ Agriculture &~ 128148    90245            74078
## 2       1101 AGRI~ Agriculture &~  95326    76865            64240
## 3       1102 AGRI~ Agriculture &~  33955    26321            22810
## 4       1103 ANIM~ Agriculture &~ 103549    81177            64937
## 5       1104 FOOD~ Agriculture &~  24280    17281            12722
## 6       1105 PLAN~ Agriculture &~  79409    63043            51077
## # ... with 5 more variables: Unemployed <int>, Unemployment_rate <dbl>,
## #   Median <int>, P25th <int>, P75th <dbl>

# Read the grad-students CSV from GitHub, convert it to a tibble and sort the
# rows by major category.
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/grad-students.csv"
grad_stdnt <- data_url %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # tbl_df() is deprecated in dplyr; as_tibble() replaces it
  arrange(Major_category)

# Keep a single major category and preview its first rows.
grad_college_degree <- grad_stdnt %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(grad_college_degree)

## # A tibble: 6 x 22
##   Major_code Major Major_category Grad_total Grad_sample_size Grad_employed
##        <int> <chr> <chr>               <int>            <int>         <int>
## 1       1101 AGRI~ Agriculture &~      17488              386         13104
## 2       1100 GENE~ Agriculture &~      44306              764         28930
## 3       1302 FORE~ Agriculture &~      24713              487         16831
## 4       1303 NATU~ Agriculture &~      29357              659         23394
## 5       1105 PLAN~ Agriculture &~      30983              624         22782
## 6       1102 AGRI~ Agriculture &~      14800              305         10592
## # ... with 16 more variables: Grad_full_time_year_round <int>,
## #   Grad_unemployed <int>, Grad_unemployment_rate <dbl>,
## #   Grad_median <dbl>, Grad_P25 <int>, Grad_P75 <dbl>,
## #   Nongrad_total <int>, Nongrad_employed <int>,
## #   Nongrad_full_time_year_round <int>, Nongrad_unemployed <int>,
## #   Nongrad_unemployment_rate <dbl>, Nongrad_median <dbl>,
## #   Nongrad_P25 <int>, Nongrad_P75 <dbl>, Grad_share <dbl>,
## #   Grad_premium <dbl>

# Read the recent-grads CSV from GitHub, convert it to a tibble and sort the
# rows by major category.
data_url <- "https://raw.githubusercontent.com/jgarcia71/Data-606-Assignments/master/recent-grads.csv"
rct_grad <- data_url %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # tbl_df() is deprecated in dplyr; as_tibble() replaces it
  arrange(Major_category)

# Keep a single major category and preview its first rows.
rct_grad_degrees <- rct_grad %>%
  filter(Major_category == "Agriculture & Natural Resources")
head(rct_grad_degrees)

## # A tibble: 6 x 21
##    Rank Major_code Major Major_category Total Sample_size   Men Women
##   <int>      <int> <chr> <chr>          <int>       <int> <int> <int>
## 1    22       1104 FOOD~ Agriculture &~  4361          36 99743 28576
## 2    64       1101 AGRI~ Agriculture &~ 14240         273  7426 10874
## 3    65       1100 GENE~ Agriculture &~ 10399         158  1761  1874
## 4    72       1102 AGRI~ Agriculture &~  2439          44 10624 15270
## 5   108       1303 NATU~ Agriculture &~ 13773         152 27015 35037
## 6   112       1302 FORE~ Agriculture &~  3607          48 32041 71439
## # ... with 13 more variables: ShareWomen <dbl>, Employed <int>,
## #   Full_time <int>, Part_time <int>, Full_time_year_round <int>,
## #   Unemployed <int>, Unemployment_rate <dbl>, Median <int>, P25th <int>,
## #   P75th <int>, College_jobs <int>, Non_college_jobs <int>,
## #   Low_wage_jobs <int>

Research Question

Which college majors offer the best opportunities in terms of unemployment rate and salary?

Cases

What are the cases, and how many are there? All Ages: Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include both undergraduates and graduate students.

Grad Students: Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only grad students aged 25+ years.

Recent Grads: Each case represents a major offered by colleges and universities in the US. There are 173 majors represented. These data include only undergraduate students aged <28 years. These data also include gender statistics.

Data collection

Describe the method of data collection. These data were collated by the following website: https://data.world/.

Type of study

What type of study is this (observational/experiment)? This is an observational study.

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)? The explanatory variables are the counts of employed and unemployed college degree holders and the statistics of their income. These data are numerical.

Relevant summary statistics

Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed. First we will look at overall unemployment rate for the 3 categories: all ages, recent grads, and grad students.

# Five-number summary (plus mean) of the all-ages unemployment rate.
all_ages %>%
  pull(Unemployment_rate) %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615

# Five-number summary (plus mean and NA count) of the recent-grad
# unemployment rate.
rct_grad %>%
  pull(Unemployment_rate) %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.05072 0.06827 0.06859 0.08760 0.17723       1

# Five-number summary (plus mean) of the grad-student unemployment rate.
grad_stdnt %>%
  pull(Grad_unemployment_rate) %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851

# Per-major unemployment rates for the three data sets, one column each.
unempl <- cbind(
  all_ages$Unemployment_rate,
  rct_grad$Unemployment_rate,
  grad_stdnt$Grad_unemployment_rate
)

# Plot one bar per data set showing the mean rate across majors.
# Passing the whole matrix to barplot() stacks every row into each bar, and the
# NA in the recent-grad column cuts that bar's cumulative stack short, so the
# column means are computed explicitly with the NA dropped.
unempl_mean <- colMeans(unempl, na.rm = TRUE)
barplot(
  unempl_mean,
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Unemployment Rate", # the measure belongs on the value axis
  col = terrain.colors(length(unempl_mean))
)

# Five-number summary (plus mean) of the all-ages median income.
all_ages %>%
  pull(Median) %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35000   46000   53000   56816   65000  125000

# Histogram of the per-major median income in the all-ages data.
hist(
  x = all_ages$Median,
  main = "Histogram for Median Income All Ages",
  xlab = "Median Income by Major All Ages (USD)",
  border = "blue",
  col = "green"
)

# Five-number summary (plus mean) of the grad-student median income.
grad_stdnt %>%
  pull(Grad_median) %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000

# Histogram of the per-major median income in the grad-student data.
hist(
  x = grad_stdnt$Grad_median,
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  border = "green",
  col = "yellow"
)

# Per-major median salaries for the three data sets, one column each.
medsal <- cbind(
  all_ages$Median,
  rct_grad$Median,
  grad_stdnt$Grad_median
)

# Plot one bar per data set showing the mean of the per-major medians.
# Passing the whole matrix to barplot() stacks every row into each bar, and any
# NA would cut a bar's cumulative stack short, so the column means are computed
# explicitly with NAs dropped.
medsal_mean <- colMeans(medsal, na.rm = TRUE)
barplot(
  medsal_mean,
  names.arg = c("All", "Recent Grad", "Grad Student"),
  ylab = "Median Salary", # the measure belongs on the value axis
  col = topo.colors(length(medsal_mean))
)