# Download the cleaned Philadelphia complaints table from the Financial Times
# repository and keep only the officer identifier, race and sex columns.
x <- select(
  read_csv("https://raw.githubusercontent.com/Financial-Times/police-misconduct-complaints-analysis/main/output/philly_clean.csv"),
  officer_id, po_race, po_sex
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   complaint_id = col_character(),
##   date_received = col_date(format = ""),
##   district_occurrence = col_character(),
##   general_cap_classification = col_character(),
##   summary = col_character(),
##   officer_id = col_double(),
##   po_race = col_character(),
##   po_sex = col_character(),
##   po_assigned_unit = col_character(),
##   allegations_investigated = col_character(),
##   investigative_findings = col_character(),
##   disciplinary_findings = col_character()
## )
# Number of rows (complaints) per officer, largest count first. The id is
# converted to character afterwards -- presumably so it is displayed as an
# identifier rather than as a number.
x %>%
  count(officer_id, name = "total", sort = TRUE) %>%
  mutate(officer_id = as.character(officer_id))
## # A tibble: 3,360 x 2
##    officer_id total
##    <chr>      <int>
##  1 29180642      39
##  2 40251428      27
##  3 65283090      25
##  4 47524892      24
##  5 66221391      24
##  6 47882806      23
##  7 91568567      23
##  8 90705110      22
##  9 64290544      21
## 10 23622762      20
## # ... with 3,350 more rows
# Share of all complaints that falls in each decile of officers.
#   1. count the rows (complaints) per officer;
#   2. split the officers into ten equally sized buckets by that count;
#   3. add up the complaints inside each bucket;
#   4. express each bucket's sum as a fraction of the grand total.
decile_data <- x %>%
  count(officer_id, name = "total") %>%
  mutate(compl_dec = ntile(total, 10)) %>%
  # `wt = total` makes count() sum the per-officer totals instead of
  # counting rows.
  count(compl_dec, wt = total, name = "compl_total") %>%
  mutate(compl_percentage = compl_total / sum(compl_total))
# Bar chart of the share of all complaints attributed to each decile of
# officers (decile 1 = fewest complaints per officer, 10 = most).
ggplot(
  data = decile_data,
  mapping = aes(x = compl_dec, y = compl_percentage)
) +
  geom_col() +
  # One tick per decile; the default continuous breaks land on fractional
  # positions such as 2.5, which are meaningless for a bucket index.
  scale_x_continuous(breaks = decile_data$compl_dec) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_bw() +
  # The buckets come from ntile(total, 10), so they are deciles, not
  # percentiles; the labels say so and the title typos are corrected.
  labs(
    title = "Percentage of Complaints Received by Philadelphia Police, sorted by decile",
    subtitle = "The upper decile contains a large portion of total complaints",
    x = "Decile",
    y = "Complaint Percentage"
  )