KLEBER PEREZ

DATA 606 - PROJECT PROPOSAL
SOURCE FILES AND RMARKDOWN ON RPUBS AND GITHUB



Load Libraries

# load libraries

library(kableExtra)
#suppressMessages(suppressWarnings(library(tidyverse)))
library(tidyr)
library(dplyr)
library(psych)
library(stringr)

Data Wrangling

# Load the all-ages majors data set and order its rows by major category.
url1 <- "https://raw.githubusercontent.com/kleberperez1/CUNY-SPS-Data606-Project-Proposal/master/all_ages.csv"
all_ages <- url1 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # replaces the deprecated dplyr::tbl_df()
  arrange(Major_category)

# Preview the first 20 rows in a scrollable table.
# Both styles are Bootstrap options, so they go together in
# `bootstrap_options`; the hover style is spelled "hover".
kable(head(all_ages, 20)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
  scroll_box(height = "500px")
Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th
1100 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 74078 2423 0.0261471 50000 34000 80000
1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 64240 2266 0.0286361 54000 36000 80000
1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 22810 821 0.0302483 63000 40000 98000
1103 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 64937 3619 0.0426789 46000 30000 72000
1104 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 12722 894 0.0491884 62000 38500 90000
1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources 79409 63043 51077 2070 0.0317909 50000 35000 75000
1106 SOIL SCIENCE Agriculture & Natural Resources 6586 4926 4042 264 0.0508671 63000 39400 88000
1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources 8549 6392 5074 261 0.0392304 52000 35000 75000
1302 FORESTRY Agriculture & Natural Resources 69447 48228 39613 2144 0.0425633 58000 40500 80000
1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources 83188 65937 50595 3789 0.0543413 52000 37100 75000
6000 FINE ARTS Arts 571961 386961 256747 29912 0.0717533 45000 30000 70000
6001 DRAMA AND THEATER ARTS Arts 174817 135071 81519 11789 0.0802737 42000 29000 62000
6002 MUSIC Arts 276262 192704 116142 11155 0.0547192 45000 30000 67000
6003 VISUAL AND PERFORMING ARTS Arts 55141 41098 23479 4297 0.0946580 40000 27000 59000
6004 COMMERCIAL ART AND GRAPHIC DESIGN Arts 504657 379980 266671 30330 0.0739197 46600 32000 70000
6005 FILM VIDEO AND PHOTOGRAPHIC ARTS Arts 133508 107651 69303 10080 0.0856189 47000 30000 70000
6007 STUDIO ARTS Arts 81008 58799 36943 5372 0.0837138 37600 24900 58000
6099 MISCELLANEOUS FINE ARTS Arts 8511 6431 3802 1190 0.1561475 45000 30000 60000
1301 ENVIRONMENTAL SCIENCE Biology & Life Science 106106 87602 65238 4736 0.0512898 52000 38000 75000
3600 BIOLOGY Biology & Life Science 839454 583079 422788 36757 0.0593012 51000 35000 80000

Subset by Category

# Keep only the majors in the "Agriculture & Natural Resources" category.
all_ages_ag <- all_ages %>%
  filter(Major_category == "Agriculture & Natural Resources")

# Show up to 20 rows of the subset in a scrollable table.
# Both styles are Bootstrap options, so they go together in
# `bootstrap_options`; the hover style is spelled "hover".
kable(head(all_ages_ag, 20)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
  scroll_box(height = "500px")
Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th
1100 GENERAL AGRICULTURE Agriculture & Natural Resources 128148 90245 74078 2423 0.0261471 50000 34000 80000
1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 95326 76865 64240 2266 0.0286361 54000 36000 80000
1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 33955 26321 22810 821 0.0302483 63000 40000 98000
1103 ANIMAL SCIENCES Agriculture & Natural Resources 103549 81177 64937 3619 0.0426789 46000 30000 72000
1104 FOOD SCIENCE Agriculture & Natural Resources 24280 17281 12722 894 0.0491884 62000 38500 90000
1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources 79409 63043 51077 2070 0.0317909 50000 35000 75000
1106 SOIL SCIENCE Agriculture & Natural Resources 6586 4926 4042 264 0.0508671 63000 39400 88000
1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources 8549 6392 5074 261 0.0392304 52000 35000 75000
1302 FORESTRY Agriculture & Natural Resources 69447 48228 39613 2144 0.0425633 58000 40500 80000
1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources 83188 65937 50595 3789 0.0543413 52000 37100 75000

Apply Filter

# Keep only the majors whose category appears in `value_list`.
value_list <- c("Arts", "Biology & Life Science", "Business")
# Categories currently left out of the filter (add them to `value_list` to
# include them):
#   "Communications & Journalism", "Computers & Mathematics",
#   "Education", "Engineering", "Health", "Humanities & Liberal Arts",
#   "Industrial Arts & Consumer Services", "Law & Public Policy",
#   "Physical Sciences", "Psychology & Social Work", "Social Science"
all_ages_value <- all_ages %>%
  filter(Major_category %in% value_list)

# Show every matching row in a scrollable table.
# Both styles are Bootstrap options, so they go together in
# `bootstrap_options`; the hover style is spelled "hover".
kable(all_ages_value) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
  scroll_box(height = "500px")
Major_code Major Major_category Total Employed Employed_full_time_year_round Unemployed Unemployment_rate Median P25th P75th
6000 FINE ARTS Arts 571961 386961 256747 29912 0.0717533 45000 30000 70000
6001 DRAMA AND THEATER ARTS Arts 174817 135071 81519 11789 0.0802737 42000 29000 62000
6002 MUSIC Arts 276262 192704 116142 11155 0.0547192 45000 30000 67000
6003 VISUAL AND PERFORMING ARTS Arts 55141 41098 23479 4297 0.0946580 40000 27000 59000
6004 COMMERCIAL ART AND GRAPHIC DESIGN Arts 504657 379980 266671 30330 0.0739197 46600 32000 70000
6005 FILM VIDEO AND PHOTOGRAPHIC ARTS Arts 133508 107651 69303 10080 0.0856189 47000 30000 70000
6007 STUDIO ARTS Arts 81008 58799 36943 5372 0.0837138 37600 24900 58000
6099 MISCELLANEOUS FINE ARTS Arts 8511 6431 3802 1190 0.1561475 45000 30000 60000
1301 ENVIRONMENTAL SCIENCE Biology & Life Science 106106 87602 65238 4736 0.0512898 52000 38000 75000
3600 BIOLOGY Biology & Life Science 839454 583079 422788 36757 0.0593012 51000 35000 80000
3601 BIOCHEMICAL SCIENCES Biology & Life Science 75322 52594 37103 4056 0.0715975 53000 33000 82000
3602 BOTANY Biology & Life Science 14135 9284 6333 327 0.0340235 50000 32000 75000
3603 MOLECULAR BIOLOGY Biology & Life Science 28197 20221 13366 1303 0.0605371 45000 30000 70000
3604 ECOLOGY Biology & Life Science 45368 36708 25677 1888 0.0489170 47500 32000 73000
3605 GENETICS Biology & Life Science 6362 4747 3498 206 0.0415910 48000 33000 80000
3606 MICROBIOLOGY Biology & Life Science 68885 45422 33990 2435 0.0508807 60000 40000 85000
3607 PHARMACOLOGY Biology & Life Science 5015 3481 2579 57 0.0161108 60000 35000 105000
3608 PHYSIOLOGY Biology & Life Science 43984 31394 20207 1692 0.0511395 50000 30000 75000
3609 ZOOLOGY Biology & Life Science 55395 35714 26152 1815 0.0483626 55000 34000 85000
3611 NEUROSCIENCE Biology & Life Science 13676 8987 5446 665 0.0688976 35000 28000 52000
3699 MISCELLANEOUS BIOLOGY Biology & Life Science 29389 22298 16508 1114 0.0475824 52000 33500 72800
4006 COGNITIVE SCIENCE AND BIOPSYCHOLOGY Biology & Life Science 6898 5527 3639 284 0.0488728 53000 31500 93000
6200 GENERAL BUSINESS Business 2148712 1580978 1304646 85626 0.0513775 60000 40000 95000
6201 ACCOUNTING Business 1779219 1335825 1095027 75379 0.0534147 65000 42500 100000
6202 ACTUARIAL SCIENCE Business 9763 7846 6880 466 0.0560635 72000 53000 115000
6203 BUSINESS MANAGEMENT AND ADMINISTRATION Business 3123510 2354398 1939384 147261 0.0588653 58000 39500 86000
6204 OPERATIONS LOGISTICS AND E-COMMERCE Business 57200 47341 41104 2141 0.0432683 65000 45000 90000
6205 BUSINESS ECONOMICS Business 75547 57983 48471 3816 0.0617486 65000 45000 100000
6206 MARKETING AND MARKETING RESEARCH Business 1114624 890125 704912 51839 0.0550329 56000 38500 90000
6207 FINANCE Business 816548 670681 561073 34166 0.0484729 65000 45000 100000
6209 HUMAN RESOURCES AND PERSONNEL MANAGEMENT Business 187274 142879 116466 9241 0.0607481 54000 38000 80000
6210 INTERNATIONAL BUSINESS Business 86064 66453 51012 5106 0.0713537 54000 38600 80000
6211 HOSPITALITY MANAGEMENT Business 200854 163393 122499 8862 0.0514470 49000 33000 70000
6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business 156673 134478 118249 6186 0.0439771 72000 50000 100000
6299 MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION Business 102753 77471 61603 4308 0.0526786 53000 36000 83000

Graduate Students

# Load the graduate students data set and order its rows by major category.
url2 <- "https://raw.githubusercontent.com/kleberperez1/CUNY-SPS-Data606-Project-Proposal/master/grad_students.csv"
grad_stdnt <- url2 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # replaces the deprecated dplyr::tbl_df()
  arrange(Major_category)

# Keep only the majors in the "Agriculture & Natural Resources" category.
grad_ag <- grad_stdnt %>%
  filter(Major_category == "Agriculture & Natural Resources")

# Show up to 20 rows of the subset in a scrollable table.
# Both styles are Bootstrap options, so they go together in
# `bootstrap_options`; the hover style is spelled "hover".
kable(head(grad_ag, 20)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
  scroll_box(height = "500px")
Major_code Major Major_category Grad_total Grad_sample_size Grad_employed Grad_full_time_year_round Grad_unemployed Grad_unemployment_rate Grad_median Grad_P25 Grad_P75 Nongrad_total Nongrad_employed Nongrad_full_time_year_round Nongrad_unemployed Nongrad_unemployment_rate Nongrad_median Nongrad_P25 Nongrad_P75 Grad_share Grad_premium
1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources 17488 386 13104 11207 473 0.0348383 67000 41600 100000 89169 71781 61335 1869 0.0253768 55000 38000 80000 0.1639649 0.2181818
1100 GENERAL AGRICULTURE Agriculture & Natural Resources 44306 764 28930 23024 874 0.0293249 68000 45000 104000 123984 86631 72409 2352 0.0264320 50000 34000 80000 0.2632717 0.3600000
1302 FORESTRY Agriculture & Natural Resources 24713 487 16831 14102 725 0.0412964 78000 52000 110000 67649 46815 39048 1885 0.0387064 59000 42000 80000 0.2675667 0.3220339
1303 NATURAL RESOURCES MANAGEMENT Agriculture & Natural Resources 29357 659 23394 19087 711 0.0294960 70000 50000 100000 77101 60690 48256 3413 0.0532424 53000 38000 75000 0.2757613 0.3207547
1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources 30983 624 22782 18312 735 0.0312540 67000 45000 100000 76190 60241 49506 1899 0.0305600 50000 35000 75000 0.2890933 0.3400000
1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources 14800 305 10592 8768 216 0.0199852 80000 53000 120000 33049 25557 22496 734 0.0279183 63000 40000 99000 0.3093064 0.2698413
1106 SOIL SCIENCE Agriculture & Natural Resources 3335 61 2284 1641 34 0.0146678 65000 50000 91000 6242 4654 3917 264 0.0536804 65000 41000 89000 0.3482301 0.0000000
1103 ANIMAL SCIENCES Agriculture & Natural Resources 56807 1335 47755 39047 596 0.0123265 70300 48000 104000 94910 74896 61629 3101 0.0397579 48000 32000 75000 0.3744274 0.4645833
1199 MISCELLANEOUS AGRICULTURE Agriculture & Natural Resources 5032 98 2758 2276 261 0.0864525 54000 45000 81000 8092 5978 4707 239 0.0384430 55000 39000 78000 0.3834197 -0.0181818
1104 FOOD SCIENCE Agriculture & Natural Resources 14521 266 10857 8074 370 0.0329563 72000 50000 110000 22853 16298 12431 681 0.0401084 63000 40000 92000 0.3885321 0.1428571

Recent Graduates

# Load the recent graduates data set and order its rows by major category.
url3 <- "https://raw.githubusercontent.com/kleberperez1/CUNY-SPS-Data606-Project-Proposal/master/recent_grads.csv"
rct_grad <- url3 %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>% # replaces the deprecated dplyr::tbl_df()
  arrange(Major_category)

# Keep only the majors in the "Agriculture & Natural Resources" category.
rct_ag <- rct_grad %>%
  filter(Major_category == "Agriculture & Natural Resources")

# Show up to 20 rows of the subset in a scrollable table.
# Both styles are Bootstrap options, so they go together in
# `bootstrap_options`; the hover style is spelled "hover".
kable(head(rct_ag, 20)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), font_size = 12) %>%
  scroll_box(height = "500px")
Rank Major_code Major Total Men Women Major_category ShareWomen Sample_size Employed Full_time Part_time Full_time_year_round Unemployed Unemployment_rate Median P25th P75th College_jobs Non_college_jobs Low_wage_jobs
22 1104 FOOD SCIENCE NA NA NA Agriculture & Natural Resources NA 36 3149 2558 1121 1735 338 0.0969315 53000 32000 70000 1183 1274 485
64 1101 AGRICULTURE PRODUCTION AND MANAGEMENT 14240 9658 4582 Agriculture & Natural Resources 0.3217697 273 12323 11119 2196 9093 649 0.0500308 40000 25000 50000 1925 6221 1362
65 1100 GENERAL AGRICULTURE 10399 6053 4346 Agriculture & Natural Resources 0.4179248 158 8884 7589 2031 5888 178 0.0196425 40000 30000 50000 2418 4717 839
72 1102 AGRICULTURAL ECONOMICS 2439 1749 690 Agriculture & Natural Resources 0.2829028 44 2174 1819 620 1528 182 0.0772496 40000 27000 54000 535 893 94
108 1303 NATURAL RESOURCES MANAGEMENT 13773 8617 5156 Agriculture & Natural Resources 0.3743556 152 11797 10722 2613 6954 842 0.0666192 35000 25000 42000 4333 5808 1405
112 1302 FORESTRY 3607 3156 451 Agriculture & Natural Resources 0.1250347 48 3007 2473 891 1763 322 0.0967257 35000 28600 48000 1096 1692 327
113 1106 SOIL SCIENCE 685 476 209 Agriculture & Natural Resources 0.3051095 4 613 488 185 383 0 0.0000000 35000 18500 44000 355 144 0
144 1105 PLANT SCIENCE AND AGRONOMY 7416 4897 2519 Agriculture & Natural Resources 0.3396710 110 6594 5798 1246 4522 314 0.0454545 32000 22900 40000 2089 3545 1231
153 1103 ANIMAL SCIENCES 21573 5347 16226 Agriculture & Natural Resources 0.7521439 255 17112 14479 5353 10824 917 0.0508625 30000 22000 40000 5443 9571 2125
162 1199 MISCELLANEOUS AGRICULTURE 1488 404 1084 Agriculture & Natural Resources 0.7284946 24 1290 1098 335 936 82 0.0597668 29000 23000 42100 483 626 31

Research question

Cases

Data collection

Type of study

Explanatory Variables

Relevant summary statistics

# Summary of the unemployment rate in each of the three data sets.
summary(all_ages[["Unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.04626 0.05472 0.05736 0.06904 0.15615
summary(rct_grad[["Unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.05031 0.06796 0.06819 0.08756 0.17723
summary(grad_stdnt[["Grad_unemployment_rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02607 0.03665 0.03934 0.04805 0.13851

# Per-column summaries rendered as interactive tables, 7 rows per page.
all_ages %>%
  summary() %>%
  DT::datatable(options = list(pageLength = 7))
all_ages_ag %>%
  summary() %>%
  DT::datatable(options = list(pageLength = 7))
rct_grad %>%
  summary() %>%
  DT::datatable(options = list(pageLength = 7))

Unemployment Rate

# One column of unemployment rates per data set, bound side by side.
unempl <- cbind(
  all_ages[["Unemployment_rate"]],
  rct_grad[["Unemployment_rate"]],
  grad_stdnt[["Grad_unemployment_rate"]]
)

# Dividing by the row count makes each stacked bar's total height equal to
# that column's mean rate.
barplot(
  unempl / nrow(unempl),
  names.arg = c("All", "Recent Grad", "Grad Student"),
  xlab = "Unemployment Rate",
  col = heat.colors(nrow(unempl))
)

Median Income by Major All Ages (USD)

# Distribution of median income across majors for all ages.
summary(all_ages[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35000   46000   53000   56816   65000  125000

# Fill colours for the histogram bars; the later histograms reuse this vector.
colors <- c(
  "red", "yellow", "green", "violet",
  "orange", "blue", "pink", "cyan"
)
hist(
  all_ages[["Median"]],
  main = "Histogram for Median Income All Ages",
  xlab = "Median Income by Major All Ages (USD)",
  col = colors
)

Median Income by Major Recent Graduates (USD)

# Strongly prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)

# Distribution of median income across majors for recent graduates.
summary(rct_grad[["Median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22000   33000   36000   40151   45000  110000
hist(
  rct_grad[["Median"]],
  main = "Histogram for Median Income Recent Grads",
  xlab = "Median Income by Major Recent Grads (USD)",
  col = colors
)

Median Income by Graduate Students (USD)

# Distribution of median income across majors for graduate degree holders.
summary(grad_stdnt[["Grad_median"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47000   65000   75000   76756   90000  135000
hist(
  grad_stdnt[["Grad_median"]],
  main = "Histogram for Median Income Grad Students",
  xlab = "Median Income by Major Grad Student (USD)",
  col = colors
)

All, Recent Graduate and Graduate Students Median Salary

# One column of median salaries per data set, bound side by side.
medsal <- cbind(all_ages$Median, rct_grad$Median, grad_stdnt$Grad_median)

# Dividing by the row count makes each stacked bar's total height equal to
# that column's mean of the per-major medians.
# The palette is sized from `medsal` itself rather than from the unemployment
# matrix of the previous chart, so this chunk no longer depends on it.
barplot(medsal / nrow(medsal), names.arg = c("All", "Recent Grad", "Grad Student"),
        xlab = "Median Salary", col = heat.colors(nrow(medsal)))







KLEBER PEREZ

DATA 606 - PROJECT PROPOSAL
SOURCE FILES AND RMARKDOWN ON RPUBS AND GITHUB




Please email Kleber Perez with any suggestions.

    DATA 606 PROJECT PROPOSAL - MSDS 2019 Program.