Challenge 1: Counting Vitae

resume <- scan('res.txt', 'character')
resume <- tm::removePunctuation(
                               tolower(resume)
                               )

counts <- sort(
               table(
                    strsplit(
                            paste(
                                  resume, collapse = ''), '')),    decreasing = TRUE)
colors <- c('grey', 'red')
bools <- names(counts) %in% c('e','o','t','h','a','s','i','n','r','d')
bars <- barplot(counts, 
                space = .7, 
                border = colors[bools +1], 
                ylim = c(0,300),
                xaxt = 'n',
                cex.axis = .8)
text(x = bars, 
     y = counts, 
     label = counts, 
     pos = 3, 
     cex = .55, 
     col = "black")
text(x = 35, 
     y = 200,
     cex = .8,
     label = "The red outlines represent the 10 most common letters in English")
title("Resume Character Counts")
labs <- names(counts)
text(cex=1, x=bars, y=-6, labs, xpd=TRUE, srt=45)


Challenge 2: Friendly Competition


Most Openings


nyc_jobs <- read.csv('NYC_Jobs.csv', stringsAsFactors = FALSE)

simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
      sep="", collapse=" ")
}

nyc_jobs$Agency <- sapply(nyc_jobs$Agency, 
                          function(x) simpleCap(tolower(x))
                          )
 
most_openings <-  nyc_jobs                        %>%
                  group_by(Agency)                %>%
                  summarize('Job.Postings' = n()) %>%
                  top_n(10, Job.Postings)         %>%
                  arrange(desc(Job.Postings))
tab <- xtable(most_openings)
align(tab) <-"|l|l|r|"
print(tab, 
     type = "html",
     include.rownames = F)
Agency Job.Postings
Dept Of Health/mental Hygiene 1305
Dept Of Environment Protection 426
Dept Of Design & Construction 241
Dept Of Info Tech & Telecomm 228
Housing Preservation & Dvlpmnt 187
Admin For Children’s Svcs 177
Department Of Transportation 160
Department Of Buildings 147
Department Of Investigation 133
Department Of Finance 131


We can see the Department of Health has far and away the highest demand for jobs right now, with 1305 postings. The agency in second place, the Department of Environmental Protection, has less than a third of that amount.


Highest and Lowest Paying


In order to find out which Agencies are paying the highest and lowest wages, we need to adjust for different pay periods. This table lists Annual, Daily and Hourly wages. We will adjust Daily and Hourly wages to their Annual equivalents based on the publicly available government holiday schedule. There is a “From” and “To” wage given for each entry so I will make an Average Adjusted Salary column for us to filter on.


# Hourly Wage * 8 hours/day * 251 working days (minus weekends and gov holidays) = Annualized Wage
# Daily Wage * 251 working days = Annualized Wage

for(i in 1:nrow(nyc_jobs)){
  if(nyc_jobs$Salary.Frequency[i] == "Annual"){
    nyc_jobs$Adjusted.Salary_From[i] = nyc_jobs$Salary.Range.From[i]
    nyc_jobs$Adjusted.Salary_To[i] = nyc_jobs$Salary.Range.To[i]
  } else if (nyc_jobs$Salary.Frequency[i] == "Hourly"){
    nyc_jobs$Adjusted.Salary_From[i] = 8*251*nyc_jobs$Salary.Range.From[i]
    nyc_jobs$Adjusted.Salary_To[i] = 8*251*nyc_jobs$Salary.Range.To[i]
  } else {
    nyc_jobs$Adjusted.Salary_From[i] = 251*nyc_jobs$Salary.Range.From[i]
    nyc_jobs$Adjusted.Salary_To[i] = 251*nyc_jobs$Salary.Range.To[i]
  }
 nyc_jobs$Average_Adjusted_Salary[i] <- trunc(
                                             mean(
                                                  c(
                                             nyc_jobs$Adjusted.Salary_From[i],
                                             nyc_jobs$Adjusted.Salary_To[i])))    
     
}


The below code chunk will select out the Agency, Salary, and Business Title for the highest paying jobs posted. When I first ran this it looked like there were a number of near duplicate job postings so I also applied a filter to only show unique results.


highest_paying_avg <- nyc_jobs %>%
  select(Agency, Average_Adjusted_Salary, Business.Title)%>% 
  top_n(15, Average_Adjusted_Salary) %>%
  arrange(desc(Average_Adjusted_Salary)) %>% unique()

highest_paying_avg$Average_Adjusted_Salary <- prettyNum(
                                        highest_paying_avg$Average_Adjusted_Salary, 
                                        big.mark = ",")

tab2 <- xtable(highest_paying_avg)
align(tab2) <-"|l|l|c|l|"
print(tab2, 
     type = "html",
     include.rownames = F)
Agency Average_Adjusted_Salary Business.Title
Dept Of Environment Protection 198,518 Deputy Commissioner, Wastewater Treatment
Dept Of Environment Protection 193,259 General Counsel
Dept Of Health/mental Hygiene 185,965 Physician, Bureau of Sexually Transmitted Disease Control
Dept Of Health/mental Hygiene 185,965 Physician in Charge, Bureau of Sexually Transmitted Disease Control
Dept Of Citywide Admin Svcs 182,500 Deputy Commissioner of Energy Management
Dept Of Health/mental Hygiene 173,490 Pediatric Pulmonary Specialist, Bureau of School Health
Department Of Correction 170,000 Senior Correctional Institution Administrator
Dept Of Health/mental Hygiene 169,065 Psychiatrist, Bureau of Mental Health


Looks like the highest paying jobs are upper level management (deputy commissioners) or other professionals such as lawyers and physicians.

lowest_paying_avg <- nyc_jobs %>%
  select(Agency, Average_Adjusted_Salary, Business.Title)%>% 
  top_n(10, -Average_Adjusted_Salary) %>%
  arrange(desc(Average_Adjusted_Salary)) %>% unique()

lowest_paying_avg$Average_Adjusted_Salary <- prettyNum(lowest_paying_avg$Average_Adjusted_Salary, 
                                                       big.mark = ",")


tab3 <- xtable(lowest_paying_avg)
align(tab3) <-"|ll|c|l|"
print(tab3, 
     type = "html",
     include.rownames = F)
Agency Average_Adjusted_Salary Business.Title
Department Of Finance 22,088 Summer College Intern
Teachers Retirement System 22,088 Benefit Administration Summer Intern
Teachers Retirement System 22,088 IT- Help Desk/Desktop Support Intern
Teachers Retirement System 22,088 IT- Administrative Summer Intern
Teachers Retirement System 22,088 Internal Audit Summer Intern
Teachers Retirement System 22,088 Risk/Information Security Summer Intern
Admin Trials And Hearings 21,084 College Aide
Office Of Collective Bargainin 19,076 COLLEGE AIDE - CLERICAL


All of these jobs appear to be interns or aides, roles often filled by college students looking to gain exposure to professional settings. Let’s filter out titles including “intern” or “aide”.


toMatch <- c("[Ii][Nn][Tt][Ee][Rr][Nn]", 
             "[Aa][Ii][Dd][Ee]")

nyc_jobs_internless <- nyc_jobs[!grepl(nyc_jobs$Business.Title, 
                                       pattern = paste(toMatch, 
                                                       collapse = "|")),]

lowest_paying_avg <- nyc_jobs_internless %>%
  select(Agency, Average_Adjusted_Salary, Business.Title)%>% 
  top_n(15, -Average_Adjusted_Salary) %>%
  arrange(desc(Average_Adjusted_Salary)) %>% unique()

lowest_paying_avg$Average_Adjusted_Salary <- prettyNum(lowest_paying_avg$Average_Adjusted_Salary, 
                                                       big.mark = ",")


tab4 <- xtable(lowest_paying_avg)
align(tab4) <-"|ll|c|l|"
print(tab4, 
     type = "html",
     include.rownames = F)
Agency Average_Adjusted_Salary Business.Title
Dept Of Health/mental Hygiene 28,645 Clerical Associate
Dept Of Health/mental Hygiene 28,526 Communications Specialist
Admin Trials And Hearings 27,823 Customer Service Representative
Dept Of Health/mental Hygiene 24,096 Office Assistant, Office of the Agency Chief Contracting Officer
Dept Of Health/mental Hygiene 24,096 Projects Coordinator, Facilities Planning and Space Management
Dept Of Health/mental Hygiene 24,096 Jr. Graphic Designer, Office of External Affairs/Publications and Language Access Group
Dept Of Health/mental Hygiene 24,096 Office Assistant, Bureau of Controller
Dept Of Environment Protection 24,096 Project Controls Support


Hardest to Fill


Jobs which require an advanced degree are by definition difficult to fill. According to US Census Bureau information on Education Attainment in the US, approximately 3.3% of people age 25 and over have a “doctorate and/or professional degree.” Right off the bat, that gives you a very small pool of people to pull from.


toMatch <- c('[pP][hH][dD]', 
             '^[Mm][Dd]$', 
             '^[Jj][Dd]$')

phds_pref <- nyc_jobs[grepl(nyc_jobs$Preferred.Skills,
                            pattern = paste(toMatch, 
                                            collapse = '|')),]


Now I want to add a column to see how many days the job posting is outstanding. While the pool of people for these highly qualified jobs is small, it its fiercely competitive and compensated well. Advanced degrees are a large investment (time and money) on the behalf of the degree holder who generally expect to recoup that investment through a larger salary. Jobs like these naturally take a longer amount of time to fill. This would lead me to conclude that a job which requires an advanced degree, but doesnt pay as well would be difficult to fill and thus be posted a long time. These jobs are probably very niche in their focus and so the pool of reasonable applicants is even tinier. Public sector jobs are also seen by many to pay less than their private sector equivalents, further reducing your talent pool.

phds_pref$Posting.Date <- as.Date(phds_pref$Posting.Date, format = "%m/%d/%Y")
phds_pref$Days.Posted <- as.numeric(
                                   difftime(
                                           as.Date(Sys.Date()),
                                           phds_pref$Posting.Date, 'days')) 
phds_pref$Avg_Sal_Scaled <- as.numeric(scale(phds_pref$Average_Adjusted_Salary))
phds_pref$Days.Posted.Scaled <- as.numeric(scale(phds_pref$Days.Posted))

phds_pref <- phds_pref                   %>%
     select(Agency, 
            Business.Title, 
            Average_Adjusted_Salary, 
            Days.Posted,
            Avg_Sal_Scaled,
            Days.Posted.Scaled)          %>%
     filter(Average_Adjusted_Salary > 90000) %>%
     top_n(10, -Average_Adjusted_Salary) %>% 
     arrange(Average_Adjusted_Salary) %>%
     unique                               



phds_pref$Average_Adjusted_Salary <- prettyNum(phds_pref$Average_Adjusted_Salary, 
                                               big.mark = ",")


tab5 <- xtable(phds_pref)
align(tab5) <-"|l|l|l|c|c|c|c|"
print(tab5, 
     type = "html",
     include.rownames = F)
Agency Business.Title Average_Adjusted_Salary Days.Posted Avg_Sal_Scaled Days.Posted.Scaled
Dept Of Health/mental Hygiene Chief, Environmental Legionella, Bureau of the Public Health Laboratory 97,866 165.00 -0.02 0.35
Fire Department Director of Operations Research 99,521 135.00 0.10 0.11
Dept Of Health/mental Hygiene Assistant Commissioner, Bureau of Children, Youth and Families 104,072 68.00 0.45 -0.42
Dept Of Health/mental Hygiene Chief of Molecular Core, Bureau of the Public Health Laboratory 104,676 261.00 0.49 1.11
Dept Of Health/mental Hygiene Chief of Molecular Microbiology, Bureau of the Public Health Laboratory 107,293 139.00 0.69 0.14
Dept Of Health/mental Hygiene HIV Phylogenetic Unit Chief, Bureau of the Public Health Laboratory 107,293 68.00 0.69 -0.42


I normalized the Adjusted Salary and Days Posted column for the advanced degree cohort and looked at the lowest paying of the job postings. Reading through the Business Title column there are some very niche areas, as predicted. On the other hand, my hypothesis that low paying jobs that mention an advanced degree would be posted for the longest amount of time is not supported by the data.

Next Steps


There is a lot of nuance in this data which could allow for further exploration. Having acess to historical data about when jobs were finally filled could be very helpful. Possible next steps are: