The following is summary information on complete cases with salaries over $10,000.
Load required libraries.
library(proto)
library(gsubfn)
library(RSQLite)
library(knitr)
library(data.table)
library(plyr)
library(dplyr)
library(sqldf)
library(ggplot2)
library(RColorBrewer)
library(grid)
library(ggrepel)
library(plotly)
library(kableExtra)
library(tidyr)
#run alone first to generate file
dfc<- data.frame(read.csv("dfc.csv"),stringsAsFactors = FALSE, strip.white=TRUE)
df10<-subset(dfc, sal>=10000)
#export df10 ( csv of all complete cases over 10k)
write.csv(df10, file = "df10.csv")
#import df10 ( csv of all complete cases over 10k)
df10<- data.frame(read.csv("df10.csv"),stringsAsFactors = FALSE, strip.white=TRUE)
#rename dataframe
df<-df10
#remove community boards
df<-df[- grep("COMMUNITY BOARD", df$agency),]
#change class to character
df$Description <- as.character(df$Description)
#clean title description to remove parenthesis
df$Description[df$Description == "ADMINISTRATIVE STAFF ANALYST ("] <- "ADMINISTRATIVE STAFF ANALYST"
#clean spaces
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
df$agency <- trim(df$agency)
#df 414504 observations, all years, 10K+
df %>%
group_by(Description) %>%
summarise(count = n_distinct(agency))%>%
arrange(desc(count))%>%
top_n(20)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## Selecting by count
## # A tibble: 21 x 2
## Description count
## <chr> <int>
## 1 COMMUNITY ASSOCIATE 63
## 2 ADMINISTRATIVE STAFF ANALYST 62
## 3 COMMUNITY COORDINATOR 61
## 4 PRINCIPAL ADMINISTRATIVE ASSOC 60
## 5 CLERICAL ASSOCIATE 59
## 6 COMPUTER SYSTEMS MANAGER 55
## 7 EXECUTIVE AGENCY COUNSEL 55
## 8 ADM MANAGER-NON-MGRL FRM M1/M2 50
## 9 ASSOCIATE STAFF ANALYST 49
## 10 COMMUNITY ASSISTANT 47
## # ... with 11 more rows
dfallyears<-df
df17<- dfallyears[dfallyears$year==2017,]
topsal<-df17 %>%
group_by(Description) %>%
mutate(Count = n()) %>%
group_by(title, Description, Count) %>%
summarise_at(vars(sal), funs(mean(., na.rm=TRUE) ))%>%
arrange(desc(sal))
#export df17 ( csv of all complete cases over 10k, 2017)
write.csv(df17, file = "df17.csv")
dfraises<- data.frame(read.csv("bestraises-selecttitles.csv"),stringsAsFactors = FALSE, strip.white=TRUE)
dfraises$title <- factor(dfraises$title, levels = unique(dfraises$title)[order(dfraises$pctincrease, decreasing = TRUE)])
p1a <- plot_ly(
data = dfraises,
x = ~title,
y = ~pctincrease,
type = 'bar',
hoverinfo='text',
marker=list(color='#82E0AA'),
text = ~paste(
'Agency: ', title,
'<br> Salary Percent Increase: ', pctincrease) ) %>%
layout(title="Salary increases 2014-2017, top 10 titles, select titles", margin = list(b = 200), xaxis = list(tickangle = 90))
p1a
dfheadcount<- data.frame(read.csv("headcountbyagency-pct-increase-top10.csv"),stringsAsFactors = FALSE, strip.white=TRUE)
#p2 <- ggplot(df12, aes(x = reorder(agency, -pctincrease), y = pctincrease)) +
# geom_bar(stat = "identity")
#p2
dfheadcount$abbreviation <- factor(dfheadcount$abbreviation, levels = unique(dfheadcount$abbreviation)[order(dfheadcount$pctincrease, decreasing = TRUE)])
p1 <- plot_ly(
data = dfheadcount,
x = ~abbreviation,
y = ~pctincrease,
type = 'bar',
hoverinfo='text',
marker=list(color='#45B39D'),
text = ~paste(
'Agency: ', agency,
'<br> Headcount Percent Increase: ', pctincrease) ) %>%
layout(title="Headcount increases 2014-2017, top 10 agencies, select titles", margin = list(b = 200), xaxis = list(tickangle = 90))
p1
df3<- data.frame(read.csv("agency17high.csv"),stringsAsFactors = FALSE)
df3 %>% mutate_if(is.factor, as.character) -> df3
p2 <- df3 %>%
plot_ly(labels = ~ADDRESS, values = ~n) %>%
add_pie(hole = 0.6) %>%
layout(title = "Agencies with the most jobs over $10K", showlegend = T,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p2
df4<- data.frame(read.csv("agency17high-titles.csv"),stringsAsFactors = FALSE)
df4 %>% mutate_if(is.factor, as.character) -> df4
p3 <- df4 %>%
plot_ly(labels = ~Description, values = ~n) %>%
add_pie(hole = 0.6) %>%
layout(title = "Titles with the most jobs over $10K", showlegend = T,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p3
#plot p4
hist(df17$sal,
main ="Distribution of salaries over $10K, 2017",
xlab="Salary",
border="black",
col="#58D68D",
xlim=c(0,200000),
ylim=c(0,35000),
breaks = seq(0, 350000, by = 5000)
#prob=TRUE
#breaks=c(0,10000,20000,30000,40000,50000,60000,70000,80000,90000,100000,110000,120000,130000,350000)
)
What constitutes the spike in the histogram between $85,000 and $95,000?
df5<- data.frame(read.csv("titles17-8590k.csv"),stringsAsFactors = FALSE)
df5 %>% mutate_if(is.factor, as.character) -> df5
p5 <- df5 %>%
plot_ly(labels = ~Description, values = ~n) %>%
add_pie(hole = 0.6) %>%
layout(title = "Titles between $85K and $90K", showlegend = T,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p5
sal17a<-df17 %>%
group_by(Description) %>%
mutate(Count = n()) %>%
group_by(title, Description, Count) %>%
summarise_at(vars(sal), funs(round(mean(., na.rm=TRUE),0) ))%>%
arrange(desc(Count))
dfp5<-head(sal17a,100)
p6 <- plot_ly(
data = dfp5,
x = ~Count,
y = ~sal,
type = 'scatter',
mode = 'markers',
hoverinfo='text',
text = ~paste(
'Title: ', Description,
'<br> Employees: ', Count,
'<br> Avg Salary: ', sal)) %>%
layout(title="100 most popular titles with average salary by title, 2017")
p6
p7 <- plot_ly(
data = dfp5,
x = ~Count,
y = ~sal,
type = 'scatter',
mode = 'markers',
hoverinfo='text',
text = ~paste(
'Title: ', Description,
'<br> Employees: ', Count,
'<br> Avg Salary: ', sal)) %>%
layout(
xaxis = list(range = c(0, 5000)),
yaxis = list(range = c(0, 170000)),
title="Popular titles with average salary by title, <5000 serving in title, 2017")
p7
dfcommontitles<- data.frame(read.csv("commontitles.csv"),stringsAsFactors = FALSE)
dfcommontitles$Description <- factor(dfcommontitles$Description, levels =unique(dfcommontitles$Description)[order(dfcommontitles$count, decreasing = TRUE)])
p8 <- plot_ly(
data = dfcommontitles,
x = ~Description,
y = ~count,
type = 'bar',
hoverinfo='text',
marker=list(color='#5DADE2'),
text = ~paste(
'Title: ', Description,
'<br> Agencies: ', count) ) %>%
layout(title="Titles across the most agencies, excluding Community Boards", margin = list(b = 200), xaxis = list(tickangle = 90))
p8
Best agencies to apply to if you want to work in these titles.
#import dfsub1
dfsub1<- data.frame(read.csv("dfsub1.csv"),stringsAsFactors = FALSE)
#subset only 2017
dfsub17<- dfsub1[dfsub1$year==2017,]
#group by agency and title
bestagencies<-dfsub17 %>%
group_by(agency, Description) %>%
mutate(Count = n()) %>%
group_by(agency, Description, Count) %>%
summarise_at(vars(sal), funs(mean(., na.rm=TRUE) ))%>%
arrange(agency)
kable(head(bestagencies))
| agency | Description | Count | sal |
|---|---|---|---|
| ADMINISTRATION FOR CHILDRE | ACCOUNTANT | 13 | 62636.00 |
| ADMINISTRATION FOR CHILDRE | ADM MANAGER-NON-MGRL FRM M1/M2 | 67 | 69026.58 |
| ADMINISTRATION FOR CHILDRE | ADMINISTRATIVE PUBLIC INFORMAT | 4 | 114382.00 |
| ADMINISTRATION FOR CHILDRE | ADMINISTRATIVE STAFF ANALYST | 100 | 92039.42 |
| ADMINISTRATION FOR CHILDRE | AGENCY ATTORNEY | 271 | 77521.29 |
| ADMINISTRATION FOR CHILDRE | ASSOCIATE STAFF ANALYST | 79 | 80490.30 |
#export best agencies
write.csv(bestagencies, file = "bestagencies.csv")
#2017 best paying agencies for select titles, number of times the agency was in the top 5 highest paying agency for the select 20 titles
bestagencies5<- data.frame(read.csv("bestagencies5a.csv"),stringsAsFactors = FALSE)
bestagencies5$Agency <- factor(bestagencies5$Agency, levels = unique(bestagencies5$Agency)[order(bestagencies5$COUNT, decreasing = TRUE)])
p9 <- plot_ly(
data = bestagencies5,
x = ~Agency,
y = ~COUNT,
type = 'bar',
hoverinfo='text',
fill="#76D7C4",
text = ~paste(
'Agency: ', Agency,
'<br> Times Appearing in Top 5 Best Paying Agencies: ', COUNT) ) %>%
layout(title="Highest Paying Agencies for Select Titles, Excluding Community Boards", margin = list(b = 200), xaxis = list(tickangle = 90, title=""))
p9
## Warning: 'bar' objects don't have these attributes: 'fill'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'hoverinfo', 'hoverlabel', 'stream', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'textposition', 'textfont', 'insidetextfont', 'outsidetextfont', 'orientation', 'base', 'offset', 'width', 'marker', 'r', 't', 'error_y', 'error_x', '_deprecated', 'xaxis', 'yaxis', 'xcalendar', 'ycalendar', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule'
Best opportunities for promotion within the same title
p <- plot_ly(dfsub17, y = ~sal, color = ~Description, type = "box")
p
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors