First read in data

# Load ccrb.csv into a data frame.
ccrb <- read.csv(file = "/Users/yh101251/Desktop/HU/512/week8/ccrb.csv")

Look at the data structure

# Print the column names, types and first values of the loaded data frame.
str(object = ccrb)
## 'data.frame':    204397 obs. of  16 variables:
##  $ DateStamp                                  : Factor w/ 1 level "11/29/2016": 1 1 1 1 1 1 1 1 1 1 ...
##  $ UniqueComplaintId                          : int  11 18 18 18 18 18 18 18 18 18 ...
##  $ Close.Year                                 : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ Received.Year                              : int  2005 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
##  $ Borough.of.Occurrence                      : Factor w/ 6 levels "Bronx","Brooklyn",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ Is.Full.Investigation                      : logi  FALSE TRUE TRUE TRUE TRUE TRUE ...
##  $ Complaint.Has.Video.Evidence               : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Complaint.Filed.Mode                       : Factor w/ 7 levels "Call Processing System",..: 6 7 7 7 7 7 7 7 7 7 ...
##  $ Complaint.Filed.Place                      : Factor w/ 14 levels "CCRB","Comm. to Combat Police Corruption",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Complaint.Contains.Stop...Frisk.Allegations: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Incident.Location                          : Factor w/ 15 levels "Apartment/house",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ Incident.Year                              : int  2005 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
##  $ Encounter.Outcome                          : Factor w/ 4 levels "Arrest","No Arrest or Summons",..: 2 1 1 1 1 1 1 1 1 1 ...
##  $ Reason.For.Initial.Contact                 : Factor w/ 49 levels "Aided case","Arrest/Complainant",..: 23 32 32 32 32 32 32 32 32 32 ...
##  $ Allegation.FADO.Type                       : Factor w/ 4 levels "Abuse of Authority",..: 1 1 2 2 2 3 3 3 3 3 ...
##  $ Allegation.Description                     : Factor w/ 56 levels "Action","Animal",..: 48 35 56 56 56 27 27 27 27 27 ...

How many incidents that led to complaints happened each year?

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)

# Keep one row per (complaint, incident year) pair, count the rows for each
# incident year, and draw the yearly counts as a line.
ccrb %>%
  distinct(UniqueComplaintId, Incident.Year) %>%
  count(Incident.Year) %>%
  ggplot(mapping = aes(x = Incident.Year, y = n)) +
  geom_line() +
  labs(
    title = "Number of Incidents led to Complaints Each Year",
    x = "Year",
    y = "Number of incidents"
  ) +
  theme_classic()

The number of incidents before 2005 looks a little off. After 2006, the number of incidents each year declined.

How many complaints were received each year? Is the trend increasing or decreasing?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Keep one row per (complaint, received year) pair, count the rows for each
# received year, and draw the yearly counts as a line.
# Received.Year is mapped to x and the count n to y, so the x label is "Year"
# and the y label is the number of complaints.
ggplot(
  data = ccrb %>%
    distinct(UniqueComplaintId, Received.Year) %>%
    group_by(Received.Year) %>%
    tally(),
  mapping = aes(x = Received.Year, y = n)
) +
  geom_line() +
  ggtitle("Number of Complaints Received Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()

Similar to the number of incidents, the number of complaints dropped after 2006.

Within these cases, how have the numbers of complaints in each Borough.of.Occurrence changed?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count distinct complaint IDs for every (received year, borough) pair and
# draw one line per borough.
# Received.Year is mapped to x and the count to y, so the x label is "Year"
# and the y label is the number of complaints.
ggplot(
  data = ccrb %>%
    group_by(Received.Year, Borough.of.Occurrence) %>%
    summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)),
  mapping = aes(
    x = Received.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )
) +
  geom_line() +
  ggtitle("Number of Complaints Received Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()

Numbers before 2005 look very different, possibly because the data are incomplete for those years. What if we look only at complaints received from 2005 onward?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count distinct complaint IDs for every (received year, borough) pair, keep
# only received years greater than 2004, and draw one line per borough.
# Received.Year is mapped to x and the count to y, so the x label is "Year"
# and the y label is the number of complaints.
ggplot(
  data = ccrb %>%
    group_by(Received.Year, Borough.of.Occurrence) %>%
    summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)) %>%
    filter(Received.Year > 2004),
  mapping = aes(
    x = Received.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )
) +
  geom_line() +
  ggtitle("Number of Complaints Received Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
## Warning: package 'bindrcpp' was built under R version 3.4.4

How many complaints were closed each year?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Keep one row per (complaint, close year) pair, count the rows for each
# close year, and draw the yearly counts as a line.
ggplot(
  data = ccrb %>%
    distinct(UniqueComplaintId, Close.Year) %>%
    group_by(Close.Year) %>%
    tally(),
  mapping = aes(x = Close.Year, y = n)
) +
  geom_line() +
  ggtitle("Number of Complaints Closed Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()

How many complaints were closed in each borough

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count distinct complaint IDs for every (close year, borough) pair and draw
# one line per borough.
# Close.Year is mapped to x and the count to y, so the x label is "Year" and
# the y label is the number of complaints.
ggplot(
  data = ccrb %>%
    group_by(Close.Year, Borough.of.Occurrence) %>%
    summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)),
  mapping = aes(
    x = Close.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )
) +
  geom_line() +
  ggtitle("Number of Complaints Closed Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()

What’s the relationship between Received.Year and Close.Year

# Whole years between the year a complaint was received and the year it closed.
ccrb[["process_time"]] <- with(ccrb, Close.Year - Received.Year)
# Keep only the first row seen for each complaint ID.
ccrb_distinct <- ccrb[!duplicated(ccrb[["UniqueComplaintId"]]), ]
library(dplyr)
library(ggplot2)
library(ggthemes)

# Scatter of close year against received year, one point per complaint.
# The low alpha makes heavily overplotted year pairs appear darker.
ccrb_distinct %>%
  ggplot(mapping = aes(x = Received.Year, y = Close.Year)) +
  geom_point(alpha = 0.05) +
  labs(
    title = "Receive and close year",
    x = "Received Year",
    y = "Close Year"
  ) +
  theme_classic()

Because all years are integers, the points overlap one another. This plot would work better for continuous data. Although it isn’t successful in showing the relationship between the closed year and the received year, it was fun to try.

How long does it take to close the complaint?

# Box plot of process_time (close year minus received year) for each borough.
ccrb_distinct %>%
  ggplot(mapping = aes(x = Borough.of.Occurrence, y = process_time)) +
  geom_boxplot() +
  theme_bw()

Because the numbers are small and don’t vary much, the boxplot isn’t showing any apparent difference.

# Histogram of process_time with one bin per whole year.
ccrb_distinct %>%
  ggplot() +
  geom_histogram(mapping = aes(x = process_time), binwidth = 1) +
  theme_bw()

Most complaints were closed within the same year of the report or the next year.

How long does it take to close the complaint for different boroughs?

# Mean process_time for every (received year, borough) pair, printed as a table.
ccrb_distinct %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarize(AverageProcessTime = mean(process_time))
## # A tibble: 96 x 3
## # Groups:   Received.Year [?]
##    Received.Year Borough.of.Occurrence AverageProcessTime
##            <int> <fct>                              <dbl>
##  1          1999 Queens                              9   
##  2          2000 Brooklyn                            6   
##  3          2002 Bronx                               4   
##  4          2002 Manhattan                           4   
##  5          2003 Bronx                               3   
##  6          2003 Brooklyn                            3.5 
##  7          2003 Queens                              3   
##  8          2004 Bronx                               2   
##  9          2004 Brooklyn                            2.02
## 10          2004 Manhattan                           2   
## # ... with 86 more rows
library(dplyr)
library(ggplot2)
library(ggthemes)

# Mean process_time for every (received year, borough) pair, drawn as one
# point per pair and coloured by borough.
# labs() must receive distinct x and y arguments: x labels the received year
# and y labels the average process time.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Borough.of.Occurrence) %>%
    summarize(AverageProcessTime = mean(process_time)),
  mapping = aes(
    x = Received.Year,
    y = AverageProcessTime,
    color = Borough.of.Occurrence
  )
) +
  geom_point(alpha = 0.7) +
  ggtitle("Average Number of Years for Complaints to be Closed") +
  labs(x = "Received Year", y = "Average Process Time in Years") +
  theme_bw()
## Warning: The plyr::rename operation has created duplicates for the
## following name(s): (`x`)

If we exclude the years before 2005

library(dplyr)
library(ggplot2)
library(ggthemes)

# Mean process_time for every (received year, borough) pair, restricted to
# received years greater than 2004. The summary is computed once and reused
# both as the plot data and as the source of the x-axis breaks.
# The result keeps its original name `data` so any later code that reads it
# still works.
data <- ccrb_distinct %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarize(AverageProcessTime = mean(process_time)) %>%
  filter(Received.Year > 2004)

ggplot(data = data) +
  geom_point(
    aes(x = Received.Year, y = AverageProcessTime, color = Borough.of.Occurrence),
    alpha = 0.7,
    size = 4
  ) +
#  geom_line(aes(x=Received.Year, y=AverageProcessTime, color = Borough.of.Occurrence), alpha = 0.7) +
  ggtitle("Average Number of Years It Took for Complaints to be Closed") +
  labs(x = "Received Year", y = "Average Process Time in Years") +
  # One tick per received year present in the summary; each year occurs once
  # per borough there, so duplicates are removed before use as breaks.
  scale_x_continuous(breaks = unique(data$Received.Year), minor_breaks = NULL) +
  theme_bw()

Outside NYC and Staten Island have had the longest average processing times for many years.

However we can’t just conclude that the amount of time it takes for complaints to be closed in all these boroughs dropped to zero in 2016, because there are still unclosed cases that are not in the data.

How has the Complaint Filed Mode changed over time?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, filed mode) pair and draw the
# counts as points joined by lines, one colour per filed mode.
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally() %>%
  ggplot(
    mapping = aes(x = Received.Year, y = n, color = Complaint.Filed.Mode)
  ) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  labs(
    title = "Number of Complaints Filed Through Different Modes",
    x = "Received Year",
    y = "Number of Complaints"
  ) +
  theme_bw()

If we look only at 2005 onward

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, filed mode) pair, keep received
# years greater than 2004, and draw the counts as points joined by lines.
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(
    mapping = aes(x = Received.Year, y = n, color = Complaint.Filed.Mode)
  ) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  labs(
    title = "Number of Complaints Filed Through Different Modes",
    x = "Received Year",
    y = "Number of Complaints"
  ) +
  theme_bw()

What would the trend look like if we looked at it in percentages?

library(dplyr)
library(ggplot2)
library(ggthemes)

# Number of complaints for every (received year, filed mode) pair, printed as
# a table.
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally()
## # A tibble: 98 x 3
## # Groups:   Received.Year [?]
##    Received.Year Complaint.Filed.Mode       n
##            <int> <fct>                  <int>
##  1          1999 Phone                      1
##  2          2000 In-person                  1
##  3          2002 Call Processing System     1
##  4          2002 Phone                      1
##  5          2003 Call Processing System     1
##  6          2003 Mail                       1
##  7          2003 Phone                      3
##  8          2004 Call Processing System    28
##  9          2004 E-mail                     2
## 10          2004 Fax                        2
## # ... with 88 more rows
library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, filed mode) pair and draw them as
# bars stacked to full height, so each year's bar shows the share of every
# filed mode.
# position = "fill" rescales each bar to the 0-1 range, so the y axis is
# labelled as a proportion rather than a percentage.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Complaint.Filed.Mode) %>%
    tally(),
  mapping = aes(x = Received.Year, y = n, fill = Complaint.Filed.Mode)
) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Stacked Bar Chart: Proportion of Complaint Filed Modes") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()

If we just look at 2005 onward

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, filed mode) pair, keep received
# years greater than 2004, and draw them as bars stacked to full height.
# position = "fill" rescales each bar to the 0-1 range, so the y axis is
# labelled as a proportion rather than a percentage.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Complaint.Filed.Mode) %>%
    tally() %>%
    filter(Received.Year > 2004),
  mapping = aes(x = Received.Year, y = n, fill = Complaint.Filed.Mode)
) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Stacked Bar Chart: Proportion of Complaint Filed Modes (After 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()

From 2005 to 2016, the proportion of complaints filed via the on-line website increased, the proportion filed via the call processing system decreased, and the proportion filed via phone decreased slightly.

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, incident location) pair, keep
# received years greater than 2004, and draw the counts as points joined by
# lines, one colour per location.
ccrb_distinct %>%
  group_by(Received.Year, Incident.Location) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(
    mapping = aes(x = Received.Year, y = n, color = Incident.Location)
  ) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  labs(
    title = "Number of Complaints in Different Locations After 2005",
    x = "Received Year",
    y = "Number of Complaints"
  ) +
  theme_bw()

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, incident location) pair, keep
# received years greater than 2004, and draw them as bars stacked to full
# height.
# position = "fill" rescales each bar to the 0-1 range, so the y axis is
# labelled as a proportion rather than a percentage.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Incident.Location) %>%
    tally() %>%
    filter(Received.Year > 2004),
  mapping = aes(x = Received.Year, y = n, fill = Incident.Location)
) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Stacked Bar Chart: Incident Location (After 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()

The proportion of incidents that happened in subway stations and trains decreased from 2005 to 2016. In the meantime, the proportion of incidents in apartments/houses and parks increased.

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, encounter outcome) pair, keep
# received years greater than 2004, and draw them as bars stacked to full
# height.
# position = "fill" rescales each bar to the 0-1 range, so the y axis is
# labelled as a proportion rather than a percentage.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Encounter.Outcome) %>%
    tally() %>%
    filter(Received.Year > 2004),
  mapping = aes(x = Received.Year, y = n, fill = Encounter.Outcome)
) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Stacked Bar Chart: Encounter Outcome (After 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()

library(dplyr)
library(ggplot2)
library(ggthemes)

# Count complaints for every (received year, encounter outcome) pair, keep
# received years greater than 2004, and draw the counts as points joined by
# lines, one colour per outcome.
# The title names the encounter outcome because that is the variable plotted.
ggplot(
  data = ccrb_distinct %>%
    group_by(Received.Year, Encounter.Outcome) %>%
    tally() %>%
    filter(Received.Year > 2004),
  mapping = aes(x = Received.Year, y = n, color = Encounter.Outcome)
) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  ggtitle("Number of Complaints by Encounter Outcome After 2005") +
  labs(x = "Received Year", y = "Number of Complaints") +
  theme_bw()

Conclusions and findings:

- The number of complaints reported declined from 2006 to 2016. Brooklyn accounts for the most complaints among all boroughs of occurrence.
- The number of complaints closed each year declined after 2006, rebounded in 2013, and then dropped again.
- Most complaints were closed within the same year as the report or the next year.
- Outside NYC and Staten Island have had the longest average processing times for many years.
- From 2005 to 2016, the proportion of complaints filed via the on-line website increased, the proportion filed via the call processing system decreased, and the proportion filed via phone decreased slightly.
- The proportion of incidents that happened in subway stations and trains decreased from 2005 to 2016. In the meantime, the proportion of incidents in apartments/houses and parks increased.
- The proportion of complaints that ended with an arrest went down slightly from 2005 to 2013 and rebounded thereafter.