# Point the session at the assignment folder and load the CCRB complaints file.
# NOTE(review): setwd() with an absolute, user-specific path ties this script
# to one machine; consider a project-relative path instead.
setwd("C:/Users/Hannah/Desktop/Assignment/ANLY512/Week8 Visual Data Exploration (VDE)")
data <- read.csv(file = "ccrb.csv", sep = ",")

# Histogram of Received Year
hist(
  data$Received.Year,
  xlab = "Years",
  main = "Histogram of Received Year"
)
Graph 1 shows the distribution of received years. From this graph, we can see which years the data cover and which years had the most complaints.
# Number of Complaints over Time
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Count complaints per received year, then plot the yearly totals as a line.
data2 <- data %>%
  group_by(Received.Year) %>%
  summarise(N = n()) %>%
  as.data.frame()

# The x-axis is limited to 2004-2016; ggplot2 removes the years outside that
# range (with a warning) before drawing the line.
ggplot(data = data2, mapping = aes(x = Received.Year, y = N)) +
  geom_line() +
  scale_x_continuous(limits = c(2004, 2016)) +
  labs(
    title = "Number of Complaints over Time",
    x = "Received Years",
    y = "Number of Complaints"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 4 rows containing missing values (geom_path).
Graph 2 shows the number of complaints received in each year.
# Frequency of Complaint Filed Mode
# Bar chart with one bar per complaint filed mode; bar height is the row count.
ggplot(data = data, mapping = aes(x = Complaint.Filed.Mode)) +
  geom_bar() +
  labs(
    title = "Frequency of Complaint Filed Mode",
    x = "Complaint Filed Mode",
    y = "Frequency"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 3 shows the frequency of each complaint filed mode.
# Frequency of Encounter Outcome
# Bar chart with one bar per encounter outcome; bar height is the row count.
ggplot(data = data, mapping = aes(x = Encounter.Outcome)) +
  geom_bar() +
  labs(
    title = "Frequency of Encounter Outcome",
    x = "Encounter Outcome",
    y = "Frequency"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 4 shows the frequency of each encounter outcome.
# Frequency of Complaint Filed Mode over Time
# Count complaints for every combination of received year and filing mode.
data5 <- data %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  summarise(N = n()) %>%
  as.data.frame()

# Draw one coloured line per filing mode. The x-axis is limited to 2004-2016;
# ggplot2 removes rows outside that range (with a warning).
ggplot(
  data5,
  aes(
    x = Received.Year,
    y = N,
    group = Complaint.Filed.Mode,
    colour = Complaint.Filed.Mode
  )
) +
  geom_line() +
  ggtitle("Frequency of Complaint Filed Mode over Time") +
  xlab("Received Years") +
  ylab("Frequency") +
  # The lines are mapped to the `colour` aesthetic, so the legend title has to
  # be set through `colour`; setting `fill` left the raw column name in place.
  labs(colour = "Complaint Filed Mode") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(limits = c(2004, 2016))
## Warning: Removed 7 rows containing missing values (geom_path).
Graph 5 indicates the trend of each complaint filed mode across years.
# Frequency of Whether Full Investigation over Time
# Count complaints for every combination of received year and
# full-investigation flag.
data6 <- data %>%
  group_by(Received.Year, Is.Full.Investigation) %>%
  summarise(N = n()) %>%
  as.data.frame()

# Draw one coloured line per value of Is.Full.Investigation.
ggplot(
  data = data6,
  mapping = aes(
    x = Received.Year,
    y = N,
    group = Is.Full.Investigation,
    colour = Is.Full.Investigation
  )
) +
  geom_line() +
  labs(
    title = "Frequency of Whether Full Investigation over Time",
    x = "Received Years",
    y = "Frequency",
    colour = "Whether Full Investigation"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 6 shows the trend in the number of complaints across years, disaggregated by whether a full investigation was conducted.
# Distribution of Complaint Filed Mode over Time
# scales supplies percent_format(), used for the y-axis labels below.
library(scales)

# Count complaints for every combination of received year and filing mode.
data7 <- data %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  summarise(N = n()) %>%
  as.data.frame()

# position = "fill" rescales each year's bar to the same height, so the
# segments show each filing mode's share of that year's complaints.
ggplot(
  data = data7,
  mapping = aes(x = Received.Year, y = N, fill = Complaint.Filed.Mode)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Distribution of Complaint Filed Mode over Time",
    x = "Received Years",
    y = "Percentage",
    fill = "Complaint Filed Mode"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 7 shows the distribution of complaint filed mode across years.
# Distribution of Full Investigation over Time
# Count complaints for every combination of received year and
# full-investigation flag.
data8 <- data %>%
  group_by(Received.Year, Is.Full.Investigation) %>%
  summarise(N = n()) %>%
  as.data.frame()

# position = "fill" rescales each year's bar to the same height, so the
# segments show the share of complaints per Is.Full.Investigation value.
ggplot(
  data = data8,
  mapping = aes(x = Received.Year, y = N, fill = Is.Full.Investigation)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "Distribution of Whether Full Investigation over Time",
    x = "Received Years",
    y = "Percentage",
    fill = "Full Investigation"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 8 shows, for each year, the distribution of complaints by whether a full investigation was conducted.
# Number of Complaints by Borough of Occurrence over Time
# Count complaints for every combination of received year and borough.
data9 <- data %>%
  group_by(Received.Year, Borough.of.Occurrence) %>%
  summarise(N = n()) %>%
  as.data.frame()

# Drop rows without a borough. Missing values are tested explicitly with
# is.na() instead of relying on which() discarding the NA produced by
# comparing against the string "NA"; the literal "NA" check is kept so the
# same rows are removed as before.
has_borough <- !is.na(data9$Borough.of.Occurrence) &
  data9$Borough.of.Occurrence != "NA"
data9 <- data9[has_borough, ]

# One panel per borough, three panels per row, panel labels at the bottom.
ggplot(data9, aes(x = Received.Year, y = N)) +
  geom_line() +
  facet_wrap(~Borough.of.Occurrence, ncol = 3, strip.position = "bottom") +
  ggtitle("Number of Complaints by Borough of Occurrence over Time") +
  xlab("Received Years") +
  ylab("Number of Complaints") +
  theme(plot.title = element_text(hjust = 0.5))
Graph 9 shows the trend of complaint numbers by borough of occurrence across years.
# Number of Complaints by Complaint Filed Mode over Time
# Count complaints for every combination of received year and filing mode.
data10 <- data %>%
  group_by(Received.Year, Complaint.Filed.Mode) %>%
  summarise(N = n()) %>%
  as.data.frame()

# Leave the "Fax" filing mode out of the faceted plot.
data10 <- subset(data10, Complaint.Filed.Mode != "Fax")

# One panel per filing mode, three panels per row, panel labels at the bottom.
ggplot(data = data10, mapping = aes(x = Received.Year, y = N)) +
  geom_line() +
  facet_wrap(~Complaint.Filed.Mode, ncol = 3, strip.position = "bottom") +
  labs(
    title = "Number of Complaints by Complaint Filed Mode over Time",
    x = "Received Years",
    y = "Number of Complaints"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Graph 10 shows the trend of complaint numbers by complaint filed mode across years.
In summary, this exploratory data analysis aims to give a general picture of the data. The outcome I am most interested in is the number of complaints. To investigate the trend in complaint numbers across years, I explored the relationship between complaint numbers and several other variables, such as complaint filed mode, encounter outcome, full investigation, and borough of occurrence. Overall, the number of complaints increased from 2004 to 2007 and then decreased from 2007 to 2016. The trend was largely consistent when the number of complaints was disaggregated by complaint filed mode, full investigation, and borough of occurrence.
In terms of complaint filed mode, the most complaints were made by phone, followed by the call processing system. An increasing number of people filed complaints through the online website over the years. When disaggregated by borough of occurrence, the most complaints were made in Brooklyn, followed by the Bronx and Manhattan. However, there were very few complaints outside NYC.