Source: the recent-grads list, downloaded via FiveThirtyEight: https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/
The readme file that explains the variables: https://github.com/fivethirtyeight/data/blob/master/college-majors/readme.md
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(purrr)
library(stringr)
library(forcats)
# Read the recent-grads CSV directly from the FiveThirtyEight GitHub repository
# into a base data frame.
college_grads <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv"
)
# Print a compact overview of the columns and their types.
str(object = college_grads)
## 'data.frame': 173 obs. of 21 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Major_code : int 2419 2416 2415 2417 2405 2418 6202 5001 2414 2408 ...
## $ Major : chr "PETROLEUM ENGINEERING" "MINING AND MINERAL ENGINEERING" "METALLURGICAL ENGINEERING" "NAVAL ARCHITECTURE AND MARINE ENGINEERING" ...
## $ Total : int 2339 756 856 1258 32260 2573 3777 1792 91227 81527 ...
## $ Men : int 2057 679 725 1123 21239 2200 2110 832 80320 65511 ...
## $ Women : int 282 77 131 135 11021 373 1667 960 10907 16016 ...
## $ Major_category : chr "Engineering" "Engineering" "Engineering" "Engineering" ...
## $ ShareWomen : num 0.121 0.102 0.153 0.107 0.342 ...
## $ Sample_size : int 36 7 3 16 289 17 51 10 1029 631 ...
## $ Employed : int 1976 640 648 758 25694 1857 2912 1526 76442 61928 ...
## $ Full_time : int 1849 556 558 1069 23170 2038 2924 1085 71298 55450 ...
## $ Part_time : int 270 170 133 150 5180 264 296 553 13101 12695 ...
## $ Full_time_year_round: int 1207 388 340 692 16697 1449 2482 827 54639 41413 ...
## $ Unemployed : int 37 85 16 40 1672 400 308 33 4650 3895 ...
## $ Unemployment_rate : num 0.0184 0.1172 0.0241 0.0501 0.0611 ...
## $ Median : int 110000 75000 73000 70000 65000 65000 62000 62000 60000 60000 ...
## $ P25th : int 95000 55000 50000 43000 50000 50000 53000 31500 48000 45000 ...
## $ P75th : int 125000 90000 105000 80000 75000 102000 72000 109000 70000 72000 ...
## $ College_jobs : int 1534 350 456 529 18314 1142 1768 972 52844 45829 ...
## $ Non_college_jobs : int 364 257 176 102 4440 657 314 500 16384 10874 ...
## $ Low_wage_jobs : int 193 50 0 0 972 244 259 220 3253 3170 ...
# Using the map() function from the purrr package.
# map() applies a function to every element of its input and returns a list;
# given a data frame it visits each column in turn. Here every column is
# sorted in decreasing order on its own, so position i in one vector no longer
# refers to the same row as position i in the other vector.
college_majors_final <- college_grads |>
  count(Major_category, name = "no_of_majors") |>
  map(\(column) sort(column, decreasing = TRUE))
# Two vectors are returned: the major categories and the number of majors.
head(college_majors_final)
## $Major_category
## [1] "Social Science" "Psychology & Social Work"
## [3] "Physical Sciences" "Law & Public Policy"
## [5] "Interdisciplinary" "Industrial Arts & Consumer Services"
## [7] "Humanities & Liberal Arts" "Health"
## [9] "Engineering" "Education"
## [11] "Computers & Mathematics" "Communications & Journalism"
## [13] "Business" "Biology & Life Science"
## [15] "Arts" "Agriculture & Natural Resources"
##
## $no_of_majors
## [1] 29 16 15 14 13 12 11 10 10 9 9 8 7 5 4 1
# map_chr() always returns a character vector, so a string function can be
# applied to each element. Everything from "&" onward is removed together with
# the whitespace in front of it, so shortened names do not keep a trailing
# space (e.g. "Psychology" rather than "Psychology ").
college_majors_final$Major_category <- college_majors_final$Major_category |>
  map_chr(\(category) str_replace_all(category, "\\s*&.+", ""))
# Example using discard(): elements for which the predicate is TRUE are
# dropped, so only counts of 10 or more are kept.
college_majors_final$no_of_majors <- college_majors_final$no_of_majors |>
  discard(\(n_majors) n_majors < 10)
# The vectors post transformation
head(college_majors_final)
## $Major_category
## [1] "Social Science" "Psychology" "Physical Sciences"
## [4] "Law" "Interdisciplinary" "Industrial Arts"
## [7] "Humanities" "Health" "Engineering"
## [10] "Education" "Computers" "Communications"
## [13] "Business" "Biology" "Arts"
## [16] "Agriculture"
##
## $no_of_majors
## [1] 29 16 15 14 13 12 11 10 10
# Aggregate per major category: total number of men and total number of women
# (missing values are ignored in both sums).
major_category <- summarize(
  group_by(college_grads, Major_category),
  count_Men = sum(Men, na.rm = TRUE),
  count_women = sum(Women, na.rm = TRUE)
)
# Keep a plain data-frame copy of the summary table.
major_category_wide <- as.data.frame(major_category)
# Reshape to long form (one row per category and gender), then:
#   - strip everything up to and including the underscore from the gender
#     labels, which removes the "count_" prefix;
#   - express the counts in thousands.
major_category_long <- major_category_wide |>
  pivot_longer(
    cols = c(count_Men, count_women),
    names_to = "gender",
    values_to = "count"
  ) |>
  mutate(
    gender = str_replace_all(gender, ".*_", ""),
    count = count / 1000
  )
# Visualize the data: grouped bar chart of graduates per major category.
# - position = "dodge" shows the men and women bars side by side.
# - The x-axis text is angled by 45 degrees so long category names stay legible.
# - scale_fill_manual() gets *named* values and labels: each colour and legend
#   entry is tied to a specific gender value instead of depending on level
#   order, and the legend shows "Women" capitalised consistently with "Men".
major_category_long %>%
  ggplot(aes(x = Major_category, y = count, fill = gender)) +
  geom_col(position = "dodge") +
  labs(
    title = "Counts of Men and Women by College Major Category",
    x = "College Major Category",
    y = "Count(in thousands)",
    fill = "Gender"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(
    values = c(Men = "blue", women = "pink"),
    labels = c(Men = "Men", women = "Women")
  )
# Same grouped bar chart, flipped with coord_flip() so the category names run
# down the vertical axis and can be read horizontally.
# - No 45-degree rotation of axis.text.x here: after the flip that theme
#   element styles the horizontal axis, which now carries the numeric counts,
#   so the rotation would tilt the numbers rather than the category names.
# - scale_fill_manual() gets *named* values and labels: each colour and legend
#   entry is tied to a specific gender value instead of depending on level
#   order, and the legend shows "Women" capitalised consistently with "Men".
major_category_long %>%
  ggplot(aes(x = Major_category, y = count, fill = gender)) +
  geom_col(position = "dodge") +
  labs(
    title = "Counts of Men and Women by College Major Category",
    x = "College Major Category",
    y = "Count(in thousands)",
    fill = "Gender"
  ) +
  scale_fill_manual(
    values = c(Men = "blue", women = "pink"),
    labels = c(Men = "Men", women = "Women")
  ) +
  coord_flip()
I used the fct_reorder() function from the forcats package to order the major categories by their counts in the next plot.
# Flipped grouped bar chart with the categories reordered by their counts via
# forcats::fct_reorder(), so the bars appear in order of size instead of
# alphabetically.
# - No 45-degree rotation of axis.text.x here: after coord_flip() that theme
#   element styles the horizontal axis, which now carries the numeric counts,
#   so the rotation would tilt the numbers rather than the category names.
# - scale_fill_manual() gets *named* values and labels: each colour and legend
#   entry is tied to a specific gender value instead of depending on level
#   order, and the legend shows "Women" capitalised consistently with "Men".
major_category_long %>%
  ggplot(aes(x = fct_reorder(Major_category, count), y = count, fill = gender)) +
  geom_col(position = "dodge") +
  labs(
    title = "Counts of Men and Women by College Major Category",
    x = "College Major Category",
    y = "Count(in thousands)",
    fill = "Gender"
  ) +
  scale_fill_manual(
    values = c(Men = "blue", women = "pink"),
    labels = c(Men = "Men", women = "Women")
  ) +
  coord_flip()