1 Introduction

1.0.1 Background

The City of New York updated their payroll dataset in late April 2018.

The payroll dataset contained the following information.

1.0.2 Libraries

library(proto)
library(gsubfn)
library(RSQLite)
library(knitr)
library(data.table)
library(plyr)
library(dplyr)
library(sqldf)
library(ggplot2)
library(RColorBrewer)
library(grid)
library(ggrepel)
library(plotly)
library(kableExtra)

# required files chunk 1
# 20180305titles.csv
# 20180425CivilList.csv

2 Import, clean, preliminary statistics

2.0.1 Import payroll and titles datasets

#import titles
dft <- data.frame(read.csv("20180305titles.csv"),stringsAsFactors = FALSE)

#import payroll
df <- data.frame(read.csv("20180425CivilList.csv"),stringsAsFactors = FALSE)

#payroll change factors to character
df %>% mutate_if(is.factor, as.character) -> df
## Warning: package 'bindrcpp' was built under R version 3.4.4
#payroll show head
df %>%
  head(5)%>%
  kable
CALENDAR.YEAR AGENCY.CODE EMPLOYEE.NAME AGENCY.NAME TITLE.CODE PAY.CLASS SALARY.RATE
2014 868 B J SANDIFORD DEPARTMENT OF CITYWIDE ADM 12702 X $ 5.00
2014 868 C A WIGFALL DEPARTMENT OF CITYWIDE ADM 12702 X $ 5.00
2014 69 A E A-AWOSOGBA HRA/DEPARTMENT OF SOCIAL S 52311 A $ 51,955.00
2014 868 K D AABY DEPARTMENT OF CITYWIDE ADM 10209 X $ 12.00
2014 56 I D AADIL POLICE DEPARTMENT 71012 A $ 46,953.00
#titles show head
dft %>%
  head(5)%>%
  kable
UniqueTitle Description CountSubTitles Min.MinRate Max.MaxRate SalaryMinMaxValid MultipleSubtitles PayType PayPeriod UnionCode UnionName BargUnit BargDesc TitleStatus
56099 FIELD SUPERVISOR (SUMMER YOUTH 1 14 14 ValidSalary SingleTitle HourlySalary Seasonal 172 SEASONALS DC37 District Council 37 ActiveTitle
56101 SENIOR FIELD SUPERVISOR (SUMME 1 15 15 ValidSalary SingleTitle HourlySalary Seasonal 172 SEASONALS DC37 District Council 37 ActiveTitle
47 BOARD OF EDUCATION 1 0 0 ZeroMinMax SingleTitle NoSalListed Annual 998 PENDING ASSIGNMENT TO CBU N/U Non-Union ActiveTitle
91406 CITY SEASONAL AIDE 21 11 16 ValidSalary MultipleTitles HourlySalary Seasonal 172 SEASONALS DC37 District Council 37 ActiveTitle
71205 LIFE GUARD 4 109 131 ValidSalary MultipleTitles HourlySalary Seasonal 172 SEASONALS DC37 District Council 37 ActiveTitle

2.0.2 Show structure and set column names

#payroll show structure
str(df)
## 'data.frame':    1161078 obs. of  7 variables:
##  $ CALENDAR.YEAR: int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ AGENCY.CODE  : int  868 868 69 868 56 69 464 998 998 826 ...
##  $ EMPLOYEE.NAME: chr  "B J  SANDIFORD" "C A  WIGFALL" "A E A-AWOSOGBA" "K D AABY" ...
##  $ AGENCY.NAME  : chr  "DEPARTMENT OF CITYWIDE ADM" "DEPARTMENT OF CITYWIDE ADM" "HRA/DEPARTMENT OF SOCIAL S" "DEPARTMENT OF CITYWIDE ADM" ...
##  $ TITLE.CODE   : chr  "12702" "12702" "52311" "10209" ...
##  $ PAY.CLASS    : chr  "X" "X" "A" "X" ...
##  $ SALARY.RATE  : chr  "$      5.00" "$      5.00" "$ 51,955.00" "$     12.00" ...
#title set column names
setnames(dft, "UniqueTitle", "title")

#payroll set column names
setnames(df, "CALENDAR.YEAR", "year")
setnames(df, "AGENCY.CODE", "agencycode")
setnames(df, "EMPLOYEE.NAME", "empname")
setnames(df, "AGENCY.NAME", "agency")
setnames(df, "TITLE.CODE", "title")
setnames(df, "PAY.CLASS", "payclass")
setnames(df, "SALARY.RATE", "sal")

2.0.3 Show count of records by year

#payroll show count by year
df %>%
     group_by(year) %>%
     summarise(total.count=n()) %>%
     arrange((year))
## # A tibble: 4 x 2
##    year total.count
##   <int>       <int>
## 1  2014      269838
## 2  2015      300653
## 3  2016      278869
## 4  2017      311718

2.0.4 Join Payroll and Titles tables

#left outer join to match by title
dfm<-merge(x = df, y = dft, by = "title", all.x = TRUE)
dfm %>% mutate_if(is.factor, as.character) -> dfm

#show structure of merged dataset. Should have same number of observations as payroll dataset -- 1161078
str(dfm)
## 'data.frame':    1161078 obs. of  20 variables:
##  $ title            : chr  "" "" "" "" ...
##  $ year             : int  2014 2014 2014 2014 2014 2014 2015 2014 2014 2016 ...
##  $ agencycode       : int  993 993 993 993 993 993 993 993 998 998 ...
##  $ empname          : chr  "L A CUBANO" "C   DIELE" "R O ODIDI" "S J PUGLIESE" ...
##  $ agency           : chr  "TRIBOROUGH BRIDGE AND TUNN" "TRIBOROUGH BRIDGE AND TUNN" "TRIBOROUGH BRIDGE AND TUNN" "TRIBOROUGH BRIDGE AND TUNN" ...
##  $ payclass         : chr  "WK" "BW" "BW" "WK" ...
##  $ sal              : chr  "$     12.86" "$ 44,488.00" "$ 37,722.00" "$     12.86" ...
##  $ Description      : chr  NA NA NA NA ...
##  $ CountSubTitles   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Min.MinRate      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Max.MaxRate      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SalaryMinMaxValid: chr  NA NA NA NA ...
##  $ MultipleSubtitles: chr  NA NA NA NA ...
##  $ PayType          : chr  NA NA NA NA ...
##  $ PayPeriod        : chr  NA NA NA NA ...
##  $ UnionCode        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ UnionName        : chr  NA NA NA NA ...
##  $ BargUnit         : chr  NA NA NA NA ...
##  $ BargDesc         : chr  NA NA NA NA ...
##  $ TitleStatus      : chr  NA NA NA NA ...

2.0.5 All years combined - show most common descriptions

dfm %>%
     group_by(Description) %>%
     summarise(total.count=n()) %>%
     arrange(desc(total.count))
## # A tibble: 1,297 x 2
##    Description                    total.count
##    <chr>                                <int>
##  1 <NA>                                313043
##  2 POLICE OFFICER (RECURRING NIGH       94447
##  3 MONITOR                              43711
##  4 CORRECTION OFFICER                   35845
##  5 FIREFIGHTER                          33088
##  6 SANITATION WORKER                    26144
##  7 COMMUNITY ASSOCIATE                  20395
##  8 SCHOOL SAFETY AGENT                  20208
##  9 COLLEGE ASSISTANT (NO LEAVE AC       19617
## 10 CLERICAL ASSOCIATE                   18815
## # ... with 1,287 more rows

2.0.6 All years combined - show incomplete cases to be excluded

dfna <- dfm[is.na(dfm$Description),]

dfna %>%
     group_by(agency) %>%
     summarise(total.count=n()) %>%
     arrange(desc(total.count))%>%
  head(70)%>%
  kable
agency total.count
N.Y.C. TRANSIT AUTHORITY 164234
N.Y.C. TRANSIT AUTHORITY 55587
CUNY KINGSBOROUGH COMMMUNI 12143
CUNY LAGUARDIA COMMUNITY C 11787
CUNY MANHATTAN COMMUNITY C 10557
DEPARTMENT OF EDUCATION 8969
CUNY QUEENSBOROUGH COMMUNI 7802
CUNY BRONX COMMUNITY COLLE 6320
TRIBOROUGH BRIDGE AND TUNN 6261
CUNY HOSTOS COMMUNITY COLL 4508
DEPARTMENT OF EDUCATION 3883
DEPARTMENT OF PARKS & RECR 2305
POLICE DEPARTMENT 1546
POLICE DEPARTMENT 1438
OFFICE OF MANAGEMENT AND B 1165
CUNY HUNTER COLLEGE HIGH S 1139
CUNY COLLEGE OF STATEN ISL 1137
DEPARTMENT FOR THE AGING 1135
ADMINISTRATION FOR CHILDRE 818
N.Y.C. HOUSING AUTHORITY 694
HRA/DEPARTMENT OF SOCIAL S 686
DEPARTMENT OF ENVIRONMENTA 628
DEPARTMENT OF HEALTH AND M 620
OFFICE OF EMERGENCY MANAGE 589
DEPARTMENT OF TRANSPORTATI 587
DEPARTMENT FOR THE AGING 509
DEPARTMENT OF CITYWIDE ADM 480
DEPARTMENT OF DESIGN AND C 430
OFFICE OF THE MAYOR 425
DEPARTMENT OF CORRECTION 304
DEPARTMENT OF BUILDINGS 299
HOUSING PRESERVATION & DEV 292
N.Y.C. HOUSING AUTHORITY 224
DEPARTMENT OF INFORMATION 201
FIRE DEPARTMENT 201
DEPARTMENT OF HOMELESS SER 186
DEPARTMENT OF FINANCE 177
DEPARTMENT OF INFORMATION 161
DEPARTMENT OF BUILDINGS 151
CAMPAIGN FINANCE BOARD 150
DEPARTMENT OF CORRECTION 136
OFFICE OF THE COMPTROLLER 118
DISTRICT ATTORNEY-BRONX CO 108
DEPARTMENT OF SANITATION 105
DEPARTMENT OF YOUTH AND CO 105
OFFICE OF THE MAYOR 100
FIRE DEPARTMENT 97
DISTRICT ATTORNEY - SPECIA 91
BOROUGH PRESIDENT-RICHMOND 82
MAYOR’S OFFICE OF CONTRACT 81
INDEPENDENT BUDGET OFFICE 76
OFFICE OF THE COMPTROLLER 73
BOROUGH PRESIDENT-BROOKLYN 69
LAW DEPARTMENT 69
OFFICE OF LABOR RELATIONS 69
CAMPAIGN FINANCE BOARD 68
DEPARTMENT OF FINANCE 68
DEPARTMENT OF SANITATION 60
FINANCIAL INFORMATION SERV 56
DEPARTMENT OF PROBATION 37
OFFICE OF PAYROLL ADMINIST 37
MUNICIPAL WATER FINANCE AU 36
LAW DEPARTMENT 35
DISTRICT ATTORNEY-QUEENS C 31
CUNY CENTRAL OFFICE 30
BOROUGH PRESIDENT-QUEENS 28
DEPARTMENT OF BUSINESS SER 28
CITY CLERK 27
BOROUGH PRESIDENT-BRONX 26
INDEPENDENT BUDGET OFFICE 24
dfna %>%
     group_by(year) %>%
     summarise(total.count=n()) %>%
  head(5)%>%
  kable
year total.count
2014 75607
2015 77878
2016 76009
2017 83549

2.0.7 View joined table; complete cases only

#show complete cases only-- do not include NA rows
dfc<-dfm[complete.cases(dfm), ]

#convert salary to numeric, remove currency symbol
dfc$sal <- as.numeric(gsub('[$,]', '', dfc$sal))

#show head by descending salary
dfc %>%
    arrange(desc(sal))%>%
  head(5)%>%
  kable
title year agencycode empname agency payclass sal Description CountSubTitles Min.MinRate Max.MaxRate SalaryMinMaxValid MultipleSubtitles PayType PayPeriod UnionCode UnionName BargUnit BargDesc TitleStatus
12707 2015 15 S C EVANS OFFICE OF THE COMPTROLLER A 350000 PENSION INVESTMENT ADVISOR 1 53373 212614 ValidSalary SingleTitle AnnualSalary Annual 772 NON COMPETITIVE MANAGER N/U Non-Union ActiveTitle
12707 2017 15 S C EVANS OFFICE OF THE COMPTROLLER A 350000 PENSION INVESTMENT ADVISOR 1 53373 212614 ValidSalary SingleTitle AnnualSalary Annual 772 NON COMPETITIVE MANAGER N/U Non-Union ActiveTitle
12707 2016 15 S C EVANS OFFICE OF THE COMPTROLLER A 350000 PENSION INVESTMENT ADVISOR 1 53373 212614 ValidSalary SingleTitle AnnualSalary Annual 772 NON COMPETITIVE MANAGER N/U Non-Union ActiveTitle
40735 2016 8 S S CHAN OFFICE OF THE ACTUARY A 290331 CHIEF ACTUARY 1 53373 212614 ValidSalary SingleTitle AnnualSalary Annual 890 MGR-WELFARE FUND/MGR LEAVE/NON N/U Non-Union ActiveTitle
40735 2017 8 S S CHAN OFFICE OF THE ACTUARY A 290331 CHIEF ACTUARY 1 53373 212614 ValidSalary SingleTitle AnnualSalary Annual 890 MGR-WELFARE FUND/MGR LEAVE/NON N/U Non-Union ActiveTitle
str(dfc)
## 'data.frame':    848035 obs. of  20 variables:
##  $ title            : chr  "0527A" "0527A" "0527A" "0527A" ...
##  $ year             : int  2014 2016 2015 2016 2015 2017 2014 2017 2017 2017 ...
##  $ agencycode       : int  2 2 2 2 2 2 2 82 2 2 ...
##  $ empname          : chr  "K H GREIG" "J   CHIU" "S   KAJEEPETA" "A   MCLAUCHLAN" ...
##  $ agency           : chr  "OFFICE OF THE MAYOR" "OFFICE OF THE MAYOR" "OFFICE OF THE MAYOR" "OFFICE OF THE MAYOR" ...
##  $ payclass         : chr  "A" "A" "A" "A" ...
##  $ sal              : num  125000 165609 65000 56650 128125 ...
##  $ Description      : chr  "Research Projects Coor(MA)-MGR" "Research Projects Coor(MA)-MGR" "Research Projects Coor(MA)-MGR" "Research Projects Coor(MA)-MGR" ...
##  $ CountSubTitles   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Min.MinRate      : int  53373 53373 53373 53373 53373 53373 53373 53373 53373 53373 ...
##  $ Max.MaxRate      : int  212614 212614 212614 212614 212614 212614 212614 212614 212614 212614 ...
##  $ SalaryMinMaxValid: chr  "ValidSalary" "ValidSalary" "ValidSalary" "ValidSalary" ...
##  $ MultipleSubtitles: chr  "SingleTitle" "SingleTitle" "SingleTitle" "SingleTitle" ...
##  $ PayType          : chr  "AnnualSalary" "AnnualSalary" "AnnualSalary" "AnnualSalary" ...
##  $ PayPeriod        : chr  "Annual" "Annual" "Annual" "Annual" ...
##  $ UnionCode        : int  883 883 883 883 883 883 883 883 883 883 ...
##  $ UnionName        : chr  "MGRL TEMP - PROP EX" "MGRL TEMP - PROP EX" "MGRL TEMP - PROP EX" "MGRL TEMP - PROP EX" ...
##  $ BargUnit         : chr  "N/U" "N/U" "N/U" "N/U" ...
##  $ BargDesc         : chr  "Non-Union" "Non-Union" "Non-Union" "Non-Union" ...
##  $ TitleStatus      : chr  "ActiveTitle" "ActiveTitle" "ActiveTitle" "ActiveTitle" ...

2.0.8 Export complete cases

#changed to eval=FALSE after first run.
write.csv(dfc, file = "dfc.csv")

3 Investigate Complete Cases

The following analysis relates to complete observations only. These observations are where the title code in the payroll list had a matching title code in the titles list. The incomplete cases were dropped from this analysis.

Incomplete cases primarily include CUNY, Triboro Bridge and Tunnel Authority, MTA, HHC, and other “non-mayoral” agencies. A small percentage of incomplete cases fall into mayoral agencies.

3.0.1 Pay Types

bypaytype<- dfc %>% count(PayType, year) %>% ungroup() %>% arrange(desc(n))

kable(head(bypaytype,10))
PayType year n
AnnualSalary 2017 190871
AnnualSalary 2015 183806
AnnualSalary 2016 174258
AnnualSalary 2014 165513
HourlySalary 2015 38107
HourlySalary 2017 36286
HourlySalary 2014 27901
HourlySalary 2016 27773
EitherAnnualHourly 2017 1007
EitherAnnualHourly 2015 857

EitherAnnualHourly indicates the min and max for the title are a broad range encompassing both hourly rates and annual rates. For example, the min salary may be $10, and the max salary may be $30,000. This indicates there may be multiple subtitles within the broader title, where some subtitles are paid hourly, and some are paid annually. Alternately, the data may have errors or may be outdated. It is not allowable to list a salary range in a job posting that is both hourly and annual.

3.0.2 Unique values in the SalaryMinMaxValid Column

byvalidsal<- dfc %>% count(SalaryMinMaxValid, year) %>% ungroup() %>% arrange(desc(n))

kable(head(byvalidsal,10))
SalaryMinMaxValid year n
ValidSalary 2017 228164
ValidSalary 2015 222770
ValidSalary 2016 202856
ValidSalary 2014 194226
ZeroMinMax 2014 5
ZeroMinMax 2015 5
ZeroMinMax 2017 5
ZeroMinMax 2016 4

We see that the majority of these complete cases have a valid salary listed from the Titles dataset. A very small percentage have $0 listed in the min and max possible for the salary as determined by the title designation. Since these employees received a non-zero salary, this indicates an error in the Titles dataset for these 10 observations.

3.0.3 Unique values in the TitleStatus Column

bytitlestatus<- dfc %>% count(TitleStatus, year) %>% ungroup() %>% arrange(desc(n))
kable(head(bytitlestatus,10))
TitleStatus year n
ActiveTitle 2017 217765
ActiveTitle 2015 211673
ActiveTitle 2016 192687
ActiveTitle 2014 183452
DeletedTitle 2015 11102
DeletedTitle 2014 10779
DeletedTitle 2017 10404
DeletedTitle 2016 10173

According to the Titles dataset, some of the employees in the payroll have titles that are considered deleted titles. This could be due to the Titles dataset being more current than the Payroll dataset, in which case the employees in the defunct titles will likely have been transfered to equivalent active titles. It could also indicate errors in the datasets.

3.0.4 Unique values in the UnionName Column

byunion<- dfc %>% count(UnionName, year) %>% ungroup() %>% arrange(desc(n))
kable(head(byunion,10))
UnionName year n
POLICE OFFICERS PBA 2017 24443
POLICE OFFICERS PBA 2015 24049
POLICE OFFICERS PBA 2016 23768
SOCIAL SERVICE TITLES DC 37L37 2017 22669
POLICE OFFICERS PBA 2014 22197
SOCIAL SERVICE TITLES DC 37L37 2015 20640
SOCIAL SERVICE TITLES DC 37L37 2016 19846
SOCIAL SERVICE TITLES DC 37L37 2014 17860
CLERICAL TITLES DC37 2015 13723
CLERICAL TITLES DC37 2014 13069

4 Salaries Under 10K

4.0.1 Split dataset into Over 10K and under 10K groups

For easier analysis, the dataset is split into salaries under 10K and over 10K.

df10<-subset(dfc, sal>=10000)
dfsub10<-subset(dfc, sal<10000)

The following section relates to salaries under 10K only. This includes trades workers such as carpenters, painters, and mechanics, which are full-time, but are listed with an hourly salary instead of annual. This causes these employees to appear as low-wage workers when they may actually earn a mid- or high-range annual salary. For clarity, these have been separated instead of attempting to calculate an equivalent annual rate.

4.0.2 Union Representation Under 10K

byunionsub10<- dfsub10 %>% count(UnionName, year) %>% ungroup() %>% arrange(desc(n))
kable(head(byunionsub10,10))
UnionName year n
COMPETITIVE OJ 2015 11418
COMPETITIVE OJ 2016 11286
COMPETITIVE OJ 2017 11258
COMPETITIVE OJ 2014 9792
JOB TRAINING PARTICIPANTS DC37 2017 6998
JOB TRAINING PARTICIPANTS DC37 2015 6101
PART TIME OJ 2015 5920
UNCLASSIFIED NON OJ & NON MANA 2017 5675
UNCLASSIFIED NON OJ & NON MANA 2016 5129
SEASONALS 2015 5059

4.0.3 Agency Representation Under 10K

byagencysub10<- dfsub10 %>% count(agency, year) %>% ungroup() %>% arrange(desc(n))
kable(head(byagencysub10,10))
agency year n
DEPARTMENT OF CITYWIDE ADM 2015 16061
DEPARTMENT OF CITYWIDE ADM 2014 14135
DEPARTMENT OF CITYWIDE ADM 2017 12664
DEPARTMENT OF CITYWIDE ADM 2016 12378
DEPARTMENT OF PARKS & RECR 2017 9955
DEPARTMENT OF PARKS & RECR 2015 9793
POLICE DEPARTMENT 2017 4013
DEPARTMENT OF PARKS & RECR 2016 3821
DEPARTMENT OF PARKS & RECR 2014 3528
POLICE DEPARTMENT 2016 3505

4.0.4 Title Representation Under 10K

bytitlesub10<- dfsub10 %>% count(Description, year) %>% ungroup() %>% arrange(desc(n))
kable(head(bytitlesub10,10))
Description year n
MONITOR 2015 11400
MONITOR 2016 11277
MONITOR 2017 11251
MONITOR 2014 9783
JOB TRAINING PARTICIPANT 2017 6998
JOB TRAINING PARTICIPANT 2015 6101
COLLEGE ASSISTANT (NO LEAVE AC 2017 5641
COLLEGE ASSISTANT (NO LEAVE AC 2016 5097
COLLEGE ASSISTANT (NO LEAVE AC 2015 4996
COLLEGE AIDE (ALL CITY DEPTS) 2015 4963

5 Salaries over 10K

The following section relates to salaries over 10K only.

5.0.1 Union Representation Over 10K

byunion10plus<- df10 %>% count(UnionName, year) %>% ungroup() %>% arrange(desc(n))
kable(head(byunion10plus,10))
UnionName year n
POLICE OFFICERS PBA 2017 24443
POLICE OFFICERS PBA 2015 24049
POLICE OFFICERS PBA 2016 23768
POLICE OFFICERS PBA 2014 22197
SOCIAL SERVICE TITLES DC 37L37 2017 22033
SOCIAL SERVICE TITLES DC 37L37 2015 19718
SOCIAL SERVICE TITLES DC 37L37 2016 19404
SOCIAL SERVICE TITLES DC 37L37 2014 17333
CLERICAL TITLES DC37 2015 13218
CLERICAL TITLES DC37 2014 12641

5.0.2 Agency Representation Over 10K

byagency10plus<- df10 %>% count(agency, year) %>% ungroup() %>% arrange(desc(n))
kable(head(byagency10plus,10))
agency year n
POLICE DEPARTMENT 2017 52200
POLICE DEPARTMENT 2015 51051
POLICE DEPARTMENT 2016 49815
POLICE DEPARTMENT 2014 48451
FIRE DEPARTMENT 2017 17552
FIRE DEPARTMENT 2015 16839
FIRE DEPARTMENT 2016 16717
FIRE DEPARTMENT 2014 15400
HRA/DEPARTMENT OF SOCIAL S 2015 14023
HRA/DEPARTMENT OF SOCIAL S 2017 13681

5.0.3 Title Representation Over 10K

bytitle10plus<- df10 %>% count(Description, year) %>% ungroup() %>% arrange(desc(n))
kable(head(bytitle10plus,10))
Description year n
POLICE OFFICER (RECURRING NIGH 2017 24443
POLICE OFFICER (RECURRING NIGH 2015 24044
POLICE OFFICER (RECURRING NIGH 2016 23768
POLICE OFFICER (RECURRING NIGH 2014 22192
CORRECTION OFFICER 2017 10159
CORRECTION OFFICER 2016 9235
FIREFIGHTER 2017 8720
CORRECTION OFFICER 2015 8693
FIREFIGHTER 2015 8424
FIREFIGHTER 2016 8325

5.0.4 Agency and Title Distribution Over 10K

summary10<- df10 %>% count(agency, Description) %>% ungroup() %>% arrange(desc(n))

kable(head(summary10,10))
agency Description n
POLICE DEPARTMENT POLICE OFFICER (RECURRING NIGH 70004
DEPARTMENT OF CORRECTION CORRECTION OFFICER 25686
POLICE DEPARTMENT POLICE OFFICER (RECURRING NIGH 24443
FIRE DEPARTMENT FIREFIGHTER 24368
DEPARTMENT OF SANITATION SANITATION WORKER 19474
POLICE DEPARTMENT SCHOOL SAFETY AGENT 14835
POLICE DEPARTMENT SERGEANT (RECURRING NIGHT SHIF 12428
DEPARTMENT OF CORRECTION CORRECTION OFFICER 10159
HRA/DEPARTMENT OF SOCIAL S ELIGIBILITY SPECIALIST 9963
N.Y.C. HOUSING AUTHORITY CARETAKER (HA) 9319

5.0.5 Average Salary by Title Over 10K

10 highest and 10 lowest average salaries by title.

salbytitle<-df10 %>%
  group_by(Description) %>%
  mutate(Count = n()) %>%
  group_by(title, Description,  Count) %>%
  summarise_at(vars(sal), funs(mean(., na.rm=TRUE)  ))%>%
  arrange(desc(sal))

kable(head(salbytitle,10))
title Description Count sal
12707 PENSION INVESTMENT ADVISOR 4 318644.5
40735 CHIEF ACTUARY 3 287512.3
12942 FIRST DEPUTY MAYOR 4 262817.8
94450 CHIEF OF STAFF (CITY COUNCIL) 3 234282.3
12995 MAYOR 4 233437.5
12940 DEPUTY MAYOR 13 227925.8
94465 EXECUTIVE DIRECTOR (CAMPAIGN F 4 226882.0
94488 CHAIR (HA) 4 224576.0
94310 DIRECTOR OF THE OFFICE OF COLL 3 224168.3
94468 EXECUTIVE DIRECTOR (FISA) 3 224168.3
kable(tail(salbytitle,10))
title Description Count sal
10232 SUMMER GRADUATE INTERN 5 27524.80
52406 COMMUNITY SERVICE AIDE 653 27183.45
54501 SCHOOL LUNCH HELPER 2 27031.00
10234 SUMMER COLLEGE INTERN 2 26596.00
51800 PROBATION ASSISTANT 14 26513.36
1299A COMMISSIONER (NOT FULL-TIME) 16 25677.00
94367 TEMPORARY CLERK (BOARD OF ELEC 51 25645.98
91405 CITY SERVICE AIDE 103 23149.38
91406 CITY SEASONAL AIDE 1144 18255.26
10209 COLLEGE AIDE (ALL CITY DEPTS) 14 18067.57

5.0.6 Average Salary by Title Over 10K, 10 Most Common Titles

salbytitle2<-df10 %>%
  group_by(Description) %>%
  mutate(Count = n()) %>%
  group_by(title, Description, Count) %>%
  summarise_at(vars(sal), funs(round(mean(., na.rm=TRUE),0)  ))%>%
  arrange(desc(Count))

kable(head(salbytitle2,10))
title Description Count sal
70210 POLICE OFFICER (RECURRING NIGH 94447 66208
70410 CORRECTION OFFICER 35845 65330
70310 FIREFIGHTER 33088 71896
70112 SANITATION WORKER 26144 62442
60817 SCHOOL SAFETY AGENT 20208 37925
56057 COMMUNITY ASSOCIATE 19440 44111
10251 CLERICAL ASSOCIATE 17609 41081
10124 PRINCIPAL ADMINISTRATIVE ASSOC 16987 55415
70235 SERGEANT (RECURRING NIGHT SHIF 16603 95804
90645 CARETAKER (HA) 12881 38218

5.0.7 Export complete cases over 10K

#changed to eval=FALSE after first run.
write.csv(df10, file = "df10.csv")