1 Introduction

1.0.1 Background

The following is summary information on complete cases with salaries over $10,000.

1.0.2 Libraries

Load required libraries.

library(proto)
library(gsubfn)
library(RSQLite)
library(knitr)
library(data.table)
library(plyr)
library(dplyr)
library(sqldf)
library(ggplot2)
library(RColorBrewer)
library(grid)
library(ggrepel)
library(plotly)
library(kableExtra)
library(tidyr)

2 Load data

2.0.1 recreate df10 - all complete cases over 10K

#run alone first to generate file
dfc<- data.frame(read.csv("dfc.csv"),stringsAsFactors = FALSE, strip.white=TRUE)

df10<-subset(dfc, sal>=10000)

2.0.2 export df10 - all complete cases over 10K

#export df10 ( csv of all complete cases over 10k)
write.csv(df10, file = "df10.csv")

2.0.3 import df10, remove community boards, clean

#import df10 ( csv of all complete cases over 10k)
df10<- data.frame(read.csv("df10.csv"),stringsAsFactors = FALSE, strip.white=TRUE)

#rename dataframe
df<-df10

#remove community boards
df<-df[- grep("COMMUNITY BOARD", df$agency),]

#change class to character
df$Description <- as.character(df$Description)

#clean title description to remove parenthesis
df$Description[df$Description == "ADMINISTRATIVE STAFF ANALYST ("] <- "ADMINISTRATIVE STAFF ANALYST"

#clean spaces
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
df$agency <- trim(df$agency)


#df 414504 observations, all years, 10K+
df %>%
  group_by(Description) %>%
  summarise(count = n_distinct(agency))%>% 
  arrange(desc(count))%>%
  top_n(20)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Selecting by count
## # A tibble: 21 x 2
##    Description                    count
##    <chr>                          <int>
##  1 COMMUNITY ASSOCIATE               63
##  2 ADMINISTRATIVE STAFF ANALYST      62
##  3 COMMUNITY COORDINATOR             61
##  4 PRINCIPAL ADMINISTRATIVE ASSOC    60
##  5 CLERICAL ASSOCIATE                59
##  6 COMPUTER SYSTEMS MANAGER          55
##  7 EXECUTIVE AGENCY COUNSEL          55
##  8 ADM MANAGER-NON-MGRL FRM M1/M2    50
##  9 ASSOCIATE STAFF ANALYST           49
## 10 COMMUNITY ASSISTANT               47
## # ... with 11 more rows

2.0.4 Create df17 - 2017 only, 10K+

dfallyears<-df

df17<- dfallyears[dfallyears$year==2017,]

topsal<-df17 %>%
  group_by(Description) %>%
  mutate(Count = n()) %>%
  group_by(title, Description,  Count) %>%
  summarise_at(vars(sal), funs(mean(., na.rm=TRUE)  ))%>%
  arrange(desc(sal))
#export df17 ( csv of all complete cases over 10k, 2017)
write.csv(df17, file = "df17.csv")

3 Plots

3.0.1 Plot - Select titles - percent salary increases by agency - all years

dfraises<- data.frame(read.csv("bestraises-selecttitles.csv"),stringsAsFactors = FALSE, strip.white=TRUE)
    
         
dfraises$title <- factor(dfraises$title, levels = unique(dfraises$title)[order(dfraises$pctincrease, decreasing = TRUE)])


p1a <- plot_ly(
  data = dfraises, 
  x = ~title, 
  y = ~pctincrease, 
  type = 'bar', 
  hoverinfo='text',
  marker=list(color='#82E0AA'),
  text = ~paste(
    'Agency: ', title,
    '<br> Salary Percent Increase: ', pctincrease) ) %>%
  layout(title="Salary increases 2014-2017, top 10 titles, select titles", margin = list(b = 200), xaxis = list(tickangle = 90))

p1a

3.0.2 Plot - Select titles - percent headcount increases by agency - all years

dfheadcount<- data.frame(read.csv("headcountbyagency-pct-increase-top10.csv"),stringsAsFactors = FALSE, strip.white=TRUE)

#p2 <- ggplot(df12, aes(x = reorder(agency, -pctincrease), y = pctincrease)) +
         # geom_bar(stat = "identity")
#p2
         
         
dfheadcount$abbreviation <- factor(dfheadcount$abbreviation, levels = unique(dfheadcount$abbreviation)[order(dfheadcount$pctincrease, decreasing = TRUE)])


p1 <- plot_ly(
  data = dfheadcount, 
  x = ~abbreviation, 
  y = ~pctincrease, 
  type = 'bar', 
  hoverinfo='text',
  marker=list(color='#45B39D'),
  text = ~paste(
    'Agency: ', agency,
    '<br> Headcount Percent Increase: ', pctincrease) ) %>%
  layout(title="Headcount increases 2014-2017, top 10 agencies, select titles", margin = list(b = 200), xaxis = list(tickangle = 90))

p1

3.0.3 Plot - All Titles - Top 10 agencies with the most jobs over $10K, 2017

df3<- data.frame(read.csv("agency17high.csv"),stringsAsFactors = FALSE)

df3 %>% mutate_if(is.factor, as.character) -> df3

p2 <- df3 %>%
  plot_ly(labels = ~ADDRESS, values = ~n) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "Agencies with the most jobs over $10K",  showlegend = T,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

p2

3.0.4 Plot - All Titles - Top 10 titles with the most jobs over $10K, 2017

df4<- data.frame(read.csv("agency17high-titles.csv"),stringsAsFactors = FALSE)

df4 %>% mutate_if(is.factor, as.character) -> df4

p3 <- df4 %>%
  plot_ly(labels = ~Description, values = ~n) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "Titles with the most jobs over $10K",  showlegend = T,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

p3

3.0.5 Plot - All Titles - Histogram: Distribution of 10K+ salaries, 2017

#plot p4

hist(df17$sal, 
     main ="Distribution of salaries over $10K, 2017",
     xlab="Salary",
     border="black",
     col="#58D68D",
     xlim=c(0,200000),
     ylim=c(0,35000),
     breaks = seq(0, 350000, by = 5000)
     #prob=TRUE
     #breaks=c(0,10000,20000,30000,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,350000)
     )

3.0.6 Plot - All Titles - Salaries between 85K and 90K

What constitutes the spike in the histogram between $85,000 and $95,000?

df5<- data.frame(read.csv("titles17-8590k.csv"),stringsAsFactors = FALSE)

df5 %>% mutate_if(is.factor, as.character) -> df5

p5 <- df5 %>%
  plot_ly(labels = ~Description, values = ~n) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "Titles between $85K and $90K",  showlegend = T,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

p5

3.0.7 Plot - Scatterplot of 2017 salaries with count of title

sal17a<-df17 %>%
  group_by(Description) %>%
  mutate(Count = n()) %>%
  group_by(title, Description, Count) %>%
  summarise_at(vars(sal), funs(round(mean(., na.rm=TRUE),0)  ))%>%
  arrange(desc(Count))

dfp5<-head(sal17a,100)

p6 <- plot_ly(
  data = dfp5, 
  x = ~Count, 
  y = ~sal, 
  type = 'scatter', 
  mode = 'markers',
  hoverinfo='text',
  text = ~paste(
    'Title: ', Description,
    '<br> Employees: ', Count,
    '<br> Avg Salary: ', sal)) %>%
  layout(title="100 most popular titles with average salary by title, 2017")
p6

3.0.8 Plot - Scatterplot of 2017 salaries with count of title, <5000 title headcount

p7 <- plot_ly(
  data = dfp5, 
  x = ~Count, 
  y = ~sal, 
  type = 'scatter', 
  mode = 'markers',
  hoverinfo='text',
  text = ~paste(
    'Title: ', Description,
    '<br> Employees: ', Count,
    '<br> Avg Salary: ', sal)) %>%
  layout(
    xaxis = list(range = c(0, 5000)),
    yaxis = list(range = c(0, 170000)),
    title="Popular titles with average salary by title, <5000 serving in title, 2017")
p7

3.0.9 Plot - Which titles have employees across the most agencies, excluding Community Boards?

dfcommontitles<- data.frame(read.csv("commontitles.csv"),stringsAsFactors = FALSE)

dfcommontitles$Description <- factor(dfcommontitles$Description, levels  =unique(dfcommontitles$Description)[order(dfcommontitles$count, decreasing = TRUE)])


p8 <- plot_ly(
  data = dfcommontitles, 
  x = ~Description, 
  y = ~count, 
  type = 'bar', 
  hoverinfo='text',
  marker=list(color='#5DADE2'),
  text = ~paste(
    'Title: ', Description,
    '<br> Agencies: ', count) ) %>%
  layout(title="Titles across the most agencies, excluding Community Boards", margin = list(b = 200), xaxis = list(tickangle = 90))

p8

3.0.10 Table - For select titles, do some agencies pay more? (Head only)

Best agencies to apply to if you want to work in these titles.

#import dfsub1
dfsub1<- data.frame(read.csv("dfsub1.csv"),stringsAsFactors = FALSE)

#subset only 2017
dfsub17<- dfsub1[dfsub1$year==2017,]

#group by agency and title
bestagencies<-dfsub17 %>%
  group_by(agency, Description) %>%
  mutate(Count = n()) %>%
  group_by(agency, Description, Count) %>%
  summarise_at(vars(sal), funs(mean(., na.rm=TRUE)  ))%>%
  arrange(agency)

kable(head(bestagencies))
agency Description Count sal
ADMINISTRATION FOR CHILDRE ACCOUNTANT 13 62636.00
ADMINISTRATION FOR CHILDRE ADM MANAGER-NON-MGRL FRM M1/M2 67 69026.58
ADMINISTRATION FOR CHILDRE ADMINISTRATIVE PUBLIC INFORMAT 4 114382.00
ADMINISTRATION FOR CHILDRE ADMINISTRATIVE STAFF ANALYST 100 92039.42
ADMINISTRATION FOR CHILDRE AGENCY ATTORNEY 271 77521.29
ADMINISTRATION FOR CHILDRE ASSOCIATE STAFF ANALYST 79 80490.30
#export best agencies
write.csv(bestagencies, file = "bestagencies.csv")

3.0.11 Plot - For select titles, do some agencies pay more?

#2017 best paying agencies for select titles, number of times the agency was in the top 5 highest paying agency for the select 20 titles

bestagencies5<- data.frame(read.csv("bestagencies5a.csv"),stringsAsFactors = FALSE)

bestagencies5$Agency <- factor(bestagencies5$Agency, levels = unique(bestagencies5$Agency)[order(bestagencies5$COUNT, decreasing = TRUE)])


p9 <- plot_ly(
  data = bestagencies5, 
  x = ~Agency, 
  y = ~COUNT, 
  type = 'bar', 
  hoverinfo='text',
  fill="#76D7C4",
  text = ~paste(
    'Agency: ', Agency,
    '<br> Times Appearing in Top 5 Best Paying Agencies: ', COUNT) ) %>%
  layout(title="Highest Paying Agencies for Select Titles, Excluding Community Boards", margin = list(b = 200), xaxis = list(tickangle = 90, title=""))

p9
## Warning: 'bar' objects don't have these attributes: 'fill'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'

3.0.12 Plot - Titles with the greatest salary ranges, select titles

Best opportunities for promotion within the same title

p <- plot_ly(dfsub17, y = ~sal, color = ~Description, type = "box")

p
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors