# Load the CCRB complaint export into a data frame and print its structure
# (column names, types, and leading values).
ccrb <- read.csv(file = "/Users/yh101251/Desktop/HU/512/week8/ccrb.csv")
str(object = ccrb)
## 'data.frame': 204397 obs. of 16 variables:
## $ DateStamp : Factor w/ 1 level "11/29/2016": 1 1 1 1 1 1 1 1 1 1 ...
## $ UniqueComplaintId : int 11 18 18 18 18 18 18 18 18 18 ...
## $ Close.Year : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Received.Year : int 2005 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ Borough.of.Occurrence : Factor w/ 6 levels "Bronx","Brooklyn",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ Is.Full.Investigation : logi FALSE TRUE TRUE TRUE TRUE TRUE ...
## $ Complaint.Has.Video.Evidence : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Complaint.Filed.Mode : Factor w/ 7 levels "Call Processing System",..: 6 7 7 7 7 7 7 7 7 7 ...
## $ Complaint.Filed.Place : Factor w/ 14 levels "CCRB","Comm. to Combat Police Corruption",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Complaint.Contains.Stop...Frisk.Allegations: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Incident.Location : Factor w/ 15 levels "Apartment/house",..: 14 14 14 14 14 14 14 14 14 14 ...
## $ Incident.Year : int 2005 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ Encounter.Outcome : Factor w/ 4 levels "Arrest","No Arrest or Summons",..: 2 1 1 1 1 1 1 1 1 1 ...
## $ Reason.For.Initial.Contact : Factor w/ 49 levels "Aided case","Arrest/Complainant",..: 23 32 32 32 32 32 32 32 32 32 ...
## $ Allegation.FADO.Type : Factor w/ 4 levels "Abuse of Authority",..: 1 1 2 2 2 3 3 3 3 3 ...
## $ Allegation.Description : Factor w/ 56 levels "Action","Animal",..: 48 35 56 56 56 27 27 27 27 27 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
# Line chart: number of distinct complaints per incident year.
# The raw data has one row per allegation, so each (complaint, incident year)
# pair is de-duplicated before counting.
ccrb %>%
  distinct(UniqueComplaintId, Incident.Year) %>%
  count(Incident.Year) %>%
  ggplot(mapping = aes(x = Incident.Year, y = n)) +
  geom_line() +
  labs(
    title = "Number of Incidents led to Complaints Each Year",
    x = "Year",
    y = "Number of incidents"
  ) +
  theme_classic()
The number of incidents before 2005 looks suspiciously low, which suggests that the data for those years is incomplete. After 2006, the yearly number of incidents declined.
library(dplyr)
library(ggplot2)
library(ggthemes)
# Line chart: number of distinct complaints per received year.
# The raw data has one row per allegation, so each (complaint, received year)
# pair is de-duplicated before counting.
ccrb %>%
  distinct(UniqueComplaintId, Received.Year) %>%
  group_by(Received.Year) %>%
  tally() %>%
  ggplot() +
  geom_line(aes(x = Received.Year, y = n)) +
  ggtitle("Number of Complaints Received Each Year") +
  # The year is mapped to x and the count to y; the labels must match.
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
Similar to the number of incidents, the number of complaints dropped after 2006.
library(dplyr)
library(ggplot2)
library(ggthemes)
# Line chart: distinct complaints per received year, one line per borough.
# A single group_by() on both keys is enough; an earlier group_by() on
# Received.Year alone would simply be overridden.
ccrb %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)) %>%
  ggplot() +
  geom_line(aes(
    x = Received.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )) +
  ggtitle("Number of Complaints Received Each Year") +
  # The year is mapped to x and the count to y; the labels must match.
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Line chart: distinct complaints per received year and borough, restricted to
# complaints received in 2005 or later (Received.Year > 2004).
ccrb %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)) %>%
  filter(Received.Year > 2004) %>%
  ggplot() +
  geom_line(aes(
    x = Received.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )) +
  ggtitle("Number of Complaints Received Each Year") +
  # The year is mapped to x and the count to y; the labels must match.
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
## Warning: package 'bindrcpp' was built under R version 3.4.4
library(dplyr)
library(ggplot2)
library(ggthemes)
# Line chart: number of distinct complaints per close year.
# The raw data has one row per allegation, so each (complaint, close year)
# pair is de-duplicated before counting.
ccrb %>%
  distinct(UniqueComplaintId, Close.Year) %>%
  group_by(Close.Year) %>%
  tally() %>%
  ggplot() +
  geom_line(aes(x = Close.Year, y = n)) +
  ggtitle("Number of Complaints Closed Each Year") +
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Line chart: distinct complaints per close year, one line per borough.
# A single group_by() on both keys is enough; an earlier group_by() on
# Close.Year alone would simply be overridden.
ccrb %>%
  group_by(Close.Year, Borough.of.Occurrence) %>%
  summarise(number_of_distinct_complaints = n_distinct(UniqueComplaintId)) %>%
  ggplot() +
  geom_line(aes(
    x = Close.Year,
    y = number_of_distinct_complaints,
    colour = Borough.of.Occurrence
  )) +
  ggtitle("Number of Complaints Closed Each Year") +
  # The year is mapped to x and the count to y; the labels must match.
  labs(x = "Year", y = "Number of complaints") +
  theme_classic()
# Processing time in whole years: close year minus received year.
ccrb[["process_time"]] <- ccrb[["Close.Year"]] - ccrb[["Received.Year"]]

# Keep only the first row seen for each complaint id, so every complaint is
# represented once.
is_repeat_row <- duplicated(ccrb[["UniqueComplaintId"]])
ccrb_distinct <- ccrb[!is_repeat_row, ]
library(dplyr)
library(ggplot2)
library(ggthemes)
# Scatter plot of close year against received year, one point per complaint.
# The low alpha lets heavily overplotted year pairs appear darker.
ggplot(
  data = ccrb_distinct,
  mapping = aes(x = Received.Year, y = Close.Year)
) +
  geom_point(alpha = 0.05) +
  labs(
    title = "Receive and close year",
    x = "Received Year",
    y = "Close Year"
  ) +
  theme_classic()
Because all years are integers, the points overlap one another. This plot would work better for continuous data. Although it isn’t successful in showing the relationship between the close year and the received year, it was fun to try.
# Box plot of processing time (in years) for each borough of occurrence.
ggplot(
  data = ccrb_distinct,
  mapping = aes(x = Borough.of.Occurrence, y = process_time)
) +
  geom_boxplot() +
  theme_bw()
Because the numbers are small and don’t vary much, the boxplot isn’t showing any apparent difference.
# Histogram of processing time with one bin per whole year.
ggplot(data = ccrb_distinct) +
  geom_histogram(mapping = aes(x = process_time), binwidth = 1) +
  theme_bw()
Most complaints were closed within the same year of the report or the next year.
# Mean processing time for every (received year, borough) combination.
# The result stays grouped by Received.Year after summarize().
ccrb_distinct %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarize(AverageProcessTime = mean(process_time))
## # A tibble: 96 x 3
## # Groups: Received.Year [?]
## Received.Year Borough.of.Occurrence AverageProcessTime
## <int> <fct> <dbl>
## 1 1999 Queens 9
## 2 2000 Brooklyn 6
## 3 2002 Bronx 4
## 4 2002 Manhattan 4
## 5 2003 Bronx 3
## 6 2003 Brooklyn 3.5
## 7 2003 Queens 3
## 8 2004 Bronx 2
## 9 2004 Brooklyn 2.02
## 10 2004 Manhattan 2
## # ... with 86 more rows
library(dplyr)
library(ggplot2)
library(ggthemes)
# Scatter plot: mean processing time per received year, coloured by borough.
ccrb_distinct %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarize(AverageProcessTime = mean(process_time)) %>%
  ggplot() +
  geom_point(
    aes(
      x = Received.Year,
      y = AverageProcessTime,
      color = Borough.of.Occurrence
    ),
    alpha = 0.7
  ) +
  ggtitle("Average Number of Years for Complaints to be Closed") +
  # The second label belongs to the y axis. Naming both arguments `x` left the
  # y axis with its default label and triggered a duplicate-name warning.
  labs(x = "Received Year", y = "Average Process Time in Years") +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Mean processing time per (received year, borough) for complaints received in
# 2005 or later. Computed once and reused for the plot data and the x-axis
# breaks. The variable name `data` is kept in case later code refers to it.
data <- ccrb_distinct %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarize(AverageProcessTime = mean(process_time)) %>%
  filter(Received.Year > 2004)

ggplot(data = data) +
  geom_point(
    aes(
      x = Received.Year,
      y = AverageProcessTime,
      color = Borough.of.Occurrence
    ),
    alpha = 0.7,
    size = 4
  ) +
  # geom_line(aes(x=Received.Year, y=AverageProcessTime, color = Borough.of.Occurrence), alpha = 0.7) +
  ggtitle("Average Number of Years It Took for Complaints to be Closed") +
  labs(x = "Received Year", y = "Average Process Time in Years") +
  # One tick per year present in the data; each year repeats once per borough,
  # so the breaks are de-duplicated.
  scale_x_continuous(
    breaks = unique(data$Received.Year),
    minor_breaks = NULL
  ) +
  theme_bw()
Outside NYC and Staten Island have had the longest average processing times for many years.
However we can’t just conclude that the amount of time it takes for complaints to be closed in all these boroughs dropped to zero in 2016, because there are still unclosed cases that are not in the data.
library(dplyr)
library(ggplot2)
library(ggthemes)
# Points and lines: number of complaints per received year for each filing
# mode.
ccrb_distinct %>%
  count(Received.Year, Complaint.Filed.Mode) %>%
  ggplot(mapping = aes(
    x = Received.Year,
    y = n,
    color = Complaint.Filed.Mode
  )) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  labs(
    title = "Number of Complaints Filed Through Different Modes",
    x = "Received Year",
    y = "Number of Complaints"
  ) +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Points and lines: number of complaints per received year for each filing
# mode, keeping only rows with Received.Year > 2004.
ccrb_distinct %>%
  count(Received.Year, Complaint.Filed.Mode) %>%
  filter(Received.Year > 2004) %>%
  ggplot(mapping = aes(
    x = Received.Year,
    y = n,
    color = Complaint.Filed.Mode
  )) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  labs(
    title = "Number of Complaints Filed Through Different Modes",
    x = "Received Year",
    y = "Number of Complaints"
  ) +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Table: number of complaints per (received year, filing mode).
# The result stays grouped by Received.Year after tally().
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally()
## # A tibble: 98 x 3
## # Groups: Received.Year [?]
## Received.Year Complaint.Filed.Mode n
## <int> <fct> <int>
## 1 1999 Phone 1
## 2 2000 In-person 1
## 3 2002 Call Processing System 1
## 4 2002 Phone 1
## 5 2003 Call Processing System 1
## 6 2003 Mail 1
## 7 2003 Phone 3
## 8 2004 Call Processing System 28
## 9 2004 E-mail 2
## 10 2004 Fax 2
## # ... with 88 more rows
library(dplyr)
library(ggplot2)
library(ggthemes)
# 100% stacked bar chart: share of each filing mode within every received
# year. position = "fill" rescales each bar to the 0-1 range, so the y axis
# shows a proportion rather than a percentage.
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally() %>%
  ggplot(aes(x = Received.Year, y = n, fill = Complaint.Filed.Mode)) +
  geom_bar(position = "fill", stat = "identity") +
  ggtitle("Stacked Bar Chart: Proportion of Complaint Filed Modes") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# 100% stacked bar chart: share of each filing mode within every received
# year, for complaints received in 2005 or later (Received.Year > 2004).
# position = "fill" rescales each bar to the 0-1 range, so the y axis shows a
# proportion rather than a percentage.
ccrb_distinct %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(aes(x = Received.Year, y = n, fill = Complaint.Filed.Mode)) +
  geom_bar(position = "fill", stat = "identity") +
  # The filter keeps 2005 itself, so the title says "Since 2005".
  ggtitle("Stacked Bar Chart: Proportion of Complaint Filed Modes (Since 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()
From 2005 to 2016, the proportion of complaints filed via the on-line website increased, the proportion filed via the call processing system decreased, and the proportion filed by phone decreased slightly.
library(dplyr)
library(ggplot2)
library(ggthemes)
# Points and lines: number of complaints per received year for each incident
# location, for complaints received in 2005 or later (Received.Year > 2004).
ccrb_distinct %>%
  group_by(Received.Year, Incident.Location) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(aes(x = Received.Year, y = n, color = Incident.Location)) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  # The filter keeps 2005 itself, so the title says "Since 2005".
  ggtitle("Number of Complaints in Different Locations Since 2005") +
  labs(x = "Received Year", y = "Number of Complaints") +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# 100% stacked bar chart: share of each incident location within every
# received year, for complaints received in 2005 or later
# (Received.Year > 2004). position = "fill" rescales each bar to the 0-1
# range, so the y axis shows a proportion rather than a percentage.
ccrb_distinct %>%
  group_by(Received.Year, Incident.Location) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(aes(x = Received.Year, y = n, fill = Incident.Location)) +
  geom_bar(position = "fill", stat = "identity") +
  # The filter keeps 2005 itself, so the title says "Since 2005".
  ggtitle("Stacked Bar Chart: Incident Location (Since 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()
The proportion of incidents that happened in subway stations and trains decreased from 2005 to 2016. In the meantime, the proportion of incidents in apartments/houses and parks increased.
library(dplyr)
library(ggplot2)
library(ggthemes)
# 100% stacked bar chart: share of each encounter outcome within every
# received year, for complaints received in 2005 or later
# (Received.Year > 2004). position = "fill" rescales each bar to the 0-1
# range, so the y axis shows a proportion rather than a percentage.
ccrb_distinct %>%
  group_by(Received.Year, Encounter.Outcome) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(aes(x = Received.Year, y = n, fill = Encounter.Outcome)) +
  geom_bar(position = "fill", stat = "identity") +
  # The filter keeps 2005 itself, so the title says "Since 2005".
  ggtitle("Stacked Bar Chart: Encounter Outcome (Since 2005)") +
  labs(x = "Received Year", y = "Proportion of Complaints") +
  theme_bw()
library(dplyr)
library(ggplot2)
library(ggthemes)
# Points and lines: number of complaints per received year for each encounter
# outcome, for complaints received in 2005 or later (Received.Year > 2004).
ccrb_distinct %>%
  group_by(Received.Year, Encounter.Outcome) %>%
  tally() %>%
  filter(Received.Year > 2004) %>%
  ggplot(aes(x = Received.Year, y = n, color = Encounter.Outcome)) +
  geom_point(alpha = 0.7) +
  geom_line(alpha = 0.7) +
  # This chart breaks complaints down by encounter outcome, not by location.
  ggtitle("Number of Complaints by Encounter Outcome Since 2005") +
  labs(x = "Received Year", y = "Number of Complaints") +
  theme_bw()
- The number of complaints reported declined from 2006 to 2016. Brooklyn accounts for the most complaints among all boroughs of occurrence.
- The number of complaints closed each year declined after 2006, rebounded in 2013, and then dropped again.
- Most complaints were closed within the same year of the report or the next year.
- Outside NYC and Staten Island have had the longest average processing times for many years.
- From 2005 to 2016, the proportion of complaints filed via the on-line website increased, the proportion filed via the call processing system decreased, and the proportion filed by phone decreased slightly.
- The proportion of incidents that happened in subway stations and trains decreased from 2005 to 2016. In the meantime, the proportion of incidents in apartments/houses and parks increased.
- The proportion of complaints that ended with an arrest went down slightly from 2005 to 2013 and rebounded thereafter.