resume <- scan('res.txt', 'character')
resume <- tm::removePunctuation(
tolower(resume)
)
counts <- sort(
table(
strsplit(
paste(
resume, collapse = ''), '')), decreasing = TRUE)
colors <- c('grey', 'red')
bools <- names(counts) %in% c('e','o','t','h','a','s','i','n','r','d')
bars <- barplot(counts,
space = .7,
border = colors[bools +1],
ylim = c(0,300),
xaxt = 'n',
cex.axis = .8)
text(x = bars,
y = counts,
label = counts,
pos = 3,
cex = .55,
col = "black")
text(x = 35,
y = 200,
cex = .8,
label = "The red outlines represent the 10 most common letters in English")
title("Resume Character Counts")
labs <- names(counts)
text(cex=1, x=bars, y=-6, labs, xpd=TRUE, srt=45)
nyc_jobs <- read.csv('NYC_Jobs.csv', stringsAsFactors = FALSE)
simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
}
nyc_jobs$Agency <- sapply(nyc_jobs$Agency,
function(x) simpleCap(tolower(x))
)
most_openings <- nyc_jobs %>%
group_by(Agency) %>%
summarize('Job.Postings' = n()) %>%
top_n(10, Job.Postings) %>%
arrange(desc(Job.Postings))
tab <- xtable(most_openings)
align(tab) <-"|l|l|r|"
print(tab,
type = "html",
include.rownames = F)
| Agency | Job.Postings |
|---|---|
| Dept Of Health/mental Hygiene | 1305 |
| Dept Of Environment Protection | 426 |
| Dept Of Design & Construction | 241 |
| Dept Of Info Tech & Telecomm | 228 |
| Housing Preservation & Dvlpmnt | 187 |
| Admin For Children’s Svcs | 177 |
| Department Of Transportation | 160 |
| Department Of Buildings | 147 |
| Department Of Investigation | 133 |
| Department Of Finance | 131 |
We can see the Department of Health has far and away the highest demand for jobs right now, with 1305 postings. The agency in second place, the Department of Environmental Protection, has less than a third of that amount.
In order to find out which Agencies are paying the highest and lowest wages, we need to adjust for different pay periods. This table lists Annual, Daily and Hourly wages. We will adjust Daily and Hourly wages to their Annual equivalents based on the publicly available government holiday schedule. There is a “From” and “To” wage given for each entry so I will make an Average Adjusted Salary column for us to filter on.
# Hourly Wage * 8 hours/day * 251 working days (minus weekends and gov holidays) = Annualized Wage
# Daily Wage * 251 working days = Annualized Wage
for(i in 1:nrow(nyc_jobs)){
if(nyc_jobs$Salary.Frequency[i] == "Annual"){
nyc_jobs$Adjusted.Salary_From[i] = nyc_jobs$Salary.Range.From[i]
nyc_jobs$Adjusted.Salary_To[i] = nyc_jobs$Salary.Range.To[i]
} else if (nyc_jobs$Salary.Frequency[i] == "Hourly"){
nyc_jobs$Adjusted.Salary_From[i] = 8*251*nyc_jobs$Salary.Range.From[i]
nyc_jobs$Adjusted.Salary_To[i] = 8*251*nyc_jobs$Salary.Range.To[i]
} else {
nyc_jobs$Adjusted.Salary_From[i] = 251*nyc_jobs$Salary.Range.From[i]
nyc_jobs$Adjusted.Salary_To[i] = 251*nyc_jobs$Salary.Range.To[i]
}
nyc_jobs$Average_Adjusted_Salary[i] <- trunc(
mean(
c(
nyc_jobs$Adjusted.Salary_From[i],
nyc_jobs$Adjusted.Salary_To[i])))
}
The below code chunk will select out the Agency, Salary, and Business Title for the highest paying jobs posted. When I first ran this it looked like there were a number of near duplicate job postings so I also applied a filter to only show unique results.
highest_paying_avg <- nyc_jobs %>%
select(Agency, Average_Adjusted_Salary, Business.Title)%>%
top_n(15, Average_Adjusted_Salary) %>%
arrange(desc(Average_Adjusted_Salary)) %>% unique()
highest_paying_avg$Average_Adjusted_Salary <- prettyNum(
highest_paying_avg$Average_Adjusted_Salary,
big.mark = ",")
tab2 <- xtable(highest_paying_avg)
align(tab2) <-"|l|l|c|l|"
print(tab2,
type = "html",
include.rownames = F)
| Agency | Average_Adjusted_Salary | Business.Title |
|---|---|---|
| Dept Of Environment Protection | 198,518 | Deputy Commissioner, Wastewater Treatment |
| Dept Of Environment Protection | 193,259 | General Counsel |
| Dept Of Health/mental Hygiene | 185,965 | Physician, Bureau of Sexually Transmitted Disease Control |
| Dept Of Health/mental Hygiene | 185,965 | Physician in Charge, Bureau of Sexually Transmitted Disease Control |
| Dept Of Citywide Admin Svcs | 182,500 | Deputy Commissioner of Energy Management |
| Dept Of Health/mental Hygiene | 173,490 | Pediatric Pulmonary Specialist, Bureau of School Health |
| Department Of Correction | 170,000 | Senior Correctional Institution Administrator |
| Dept Of Health/mental Hygiene | 169,065 | Psychiatrist, Bureau of Mental Health |
Looks like the highest paying jobs are upper level management (deputy commissioners) or other professionals such as lawyers and physicians.
lowest_paying_avg <- nyc_jobs %>%
select(Agency, Average_Adjusted_Salary, Business.Title)%>%
top_n(10, -Average_Adjusted_Salary) %>%
arrange(desc(Average_Adjusted_Salary)) %>% unique()
lowest_paying_avg$Average_Adjusted_Salary <- prettyNum(lowest_paying_avg$Average_Adjusted_Salary,
big.mark = ",")
tab3 <- xtable(lowest_paying_avg)
align(tab3) <-"|ll|c|l|"
print(tab3,
type = "html",
include.rownames = F)
| Agency | Average_Adjusted_Salary | Business.Title |
|---|---|---|
| Department Of Finance | 22,088 | Summer College Intern |
| Teachers Retirement System | 22,088 | Benefit Administration Summer Intern |
| Teachers Retirement System | 22,088 | IT- Help Desk/Desktop Support Intern |
| Teachers Retirement System | 22,088 | IT- Administrative Summer Intern |
| Teachers Retirement System | 22,088 | Internal Audit Summer Intern |
| Teachers Retirement System | 22,088 | Risk/Information Security Summer Intern |
| Admin Trials And Hearings | 21,084 | College Aide |
| Office Of Collective Bargainin | 19,076 | COLLEGE AIDE - CLERICAL |
All of these jobs appear to be interns or aides, roles often filled by college students looking to gain exposure to professional settings. Let’s filter out titles including “intern” or “aide”.
toMatch <- c("[Ii][Nn][Tt][Ee][Rr][Nn]",
"[Aa][Ii][Dd][Ee]")
nyc_jobs_internless <- nyc_jobs[!grepl(nyc_jobs$Business.Title,
pattern = paste(toMatch,
collapse = "|")),]
lowest_paying_avg <- nyc_jobs_internless %>%
select(Agency, Average_Adjusted_Salary, Business.Title)%>%
top_n(15, -Average_Adjusted_Salary) %>%
arrange(desc(Average_Adjusted_Salary)) %>% unique()
lowest_paying_avg$Average_Adjusted_Salary <- prettyNum(lowest_paying_avg$Average_Adjusted_Salary,
big.mark = ",")
tab4 <- xtable(lowest_paying_avg)
align(tab4) <-"|ll|c|l|"
print(tab4,
type = "html",
include.rownames = F)
| Agency | Average_Adjusted_Salary | Business.Title |
|---|---|---|
| Dept Of Health/mental Hygiene | 28,645 | Clerical Associate |
| Dept Of Health/mental Hygiene | 28,526 | Communications Specialist |
| Admin Trials And Hearings | 27,823 | Customer Service Representative |
| Dept Of Health/mental Hygiene | 24,096 | Office Assistant, Office of the Agency Chief Contracting Officer |
| Dept Of Health/mental Hygiene | 24,096 | Projects Coordinator, Facilities Planning and Space Management |
| Dept Of Health/mental Hygiene | 24,096 | Jr. Graphic Designer, Office of External Affairs/Publications and Language Access Group |
| Dept Of Health/mental Hygiene | 24,096 | Office Assistant, Bureau of Controller |
| Dept Of Environment Protection | 24,096 | Project Controls Support |
Jobs which require an advanced degree are by definition difficult to fill. According to US Census Bureau information on Education Attainment in the US, approximately 3.3% of people age 25 and over have a “doctorate and/or professional degree.” Right off the bat, that gives you a very small pool of people to pull from.
toMatch <- c('[pP][hH][dD]',
'^[Mm][Dd]$',
'^[Jj][Dd]$')
phds_pref <- nyc_jobs[grepl(nyc_jobs$Preferred.Skills,
pattern = paste(toMatch,
collapse = '|')),]
Now I want to add a column to see how many days the job posting is outstanding. While the pool of people for these highly qualified jobs is small, it its fiercely competitive and compensated well. Advanced degrees are a large investment (time and money) on the behalf of the degree holder who generally expect to recoup that investment through a larger salary. Jobs like these naturally take a longer amount of time to fill. This would lead me to conclude that a job which requires an advanced degree, but doesnt pay as well would be difficult to fill and thus be posted a long time. These jobs are probably very niche in their focus and so the pool of reasonable applicants is even tinier. Public sector jobs are also seen by many to pay less than their private sector equivalents, further reducing your talent pool.
phds_pref$Posting.Date <- as.Date(phds_pref$Posting.Date, format = "%m/%d/%Y")
phds_pref$Days.Posted <- as.numeric(
difftime(
as.Date(Sys.Date()),
phds_pref$Posting.Date, 'days'))
phds_pref$Avg_Sal_Scaled <- as.numeric(scale(phds_pref$Average_Adjusted_Salary))
phds_pref$Days.Posted.Scaled <- as.numeric(scale(phds_pref$Days.Posted))
phds_pref <- phds_pref %>%
select(Agency,
Business.Title,
Average_Adjusted_Salary,
Days.Posted,
Avg_Sal_Scaled,
Days.Posted.Scaled) %>%
filter(Average_Adjusted_Salary > 90000) %>%
top_n(10, -Average_Adjusted_Salary) %>%
arrange(Average_Adjusted_Salary) %>%
unique
phds_pref$Average_Adjusted_Salary <- prettyNum(phds_pref$Average_Adjusted_Salary,
big.mark = ",")
tab5 <- xtable(phds_pref)
align(tab5) <-"|l|l|l|c|c|c|c|"
print(tab5,
type = "html",
include.rownames = F)
| Agency | Business.Title | Average_Adjusted_Salary | Days.Posted | Avg_Sal_Scaled | Days.Posted.Scaled |
|---|---|---|---|---|---|
| Dept Of Health/mental Hygiene | Chief, Environmental Legionella, Bureau of the Public Health Laboratory | 97,866 | 165.00 | -0.02 | 0.35 |
| Fire Department | Director of Operations Research | 99,521 | 135.00 | 0.10 | 0.11 |
| Dept Of Health/mental Hygiene | Assistant Commissioner, Bureau of Children, Youth and Families | 104,072 | 68.00 | 0.45 | -0.42 |
| Dept Of Health/mental Hygiene | Chief of Molecular Core, Bureau of the Public Health Laboratory | 104,676 | 261.00 | 0.49 | 1.11 |
| Dept Of Health/mental Hygiene | Chief of Molecular Microbiology, Bureau of the Public Health Laboratory | 107,293 | 139.00 | 0.69 | 0.14 |
| Dept Of Health/mental Hygiene | HIV Phylogenetic Unit Chief, Bureau of the Public Health Laboratory | 107,293 | 68.00 | 0.69 | -0.42 |
I normalized the Adjusted Salary and Days Posted column for the advanced degree cohort and looked at the lowest paying of the job postings. Reading through the Business Title column there are some very niche areas, as predicted. On the other hand, my hypothesis that low paying jobs that mention an advanced degree would be posted for the longest amount of time is not supported by the data.
There is a lot of nuance in this data which could allow for further exploration. Having acess to historical data about when jobs were finally filled could be very helpful. Possible next steps are: