The objective of this assignment is to conduct an exploratory data analysis of a data set that you are not familiar with. In this week’s lecture, we discussed a number of visualization approaches for exploring a data set. This assignment will apply those tools and techniques. An important distinction between class examples and applied data science work is the iterative and repetitive nature of exploring a data set. It takes time to understand what the data is and what is interesting about the data (patterns).
For this week, we will be exploring data from the NYC Data Transparency Initiative. They maintain a database of complaints that fall within the jurisdiction of the Civilian Complaint Review Board (CCRB), an independent municipal agency. Your objective is to identify interesting patterns and trends within the data that may be indicative of large-scale trends.
This link will allow you to download the data set in .xlsx format. The data file has two tabs: one with metadata, and the “Complaints_Allegations” tab with the actual data.
For this assignment, you should submit a link to a knitr rendered html document that shows your exploratory data analysis. Organize your analysis using section headings:
# This is a top section
## This is a subsection
Your final document should include at minimum 10 visualizations. Each should include a brief statement of why you made the graphic.
A final section should summarize what you learned from your EDA. Your grade will be based on the quality of your graphics and the sophistication of your findings.
# Load the complaint records from the CCRB workbook into `ccrb`.
# The second worksheet is read; presumably this is the data tab rather than
# the metadata tab -- TODO confirm against the workbook.
library(readxl)
ccrb <- read_excel(
  path = "ccrb_datatransparencyinitiative.xlsx",
  sheet = 2
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3

# Reusable ggplot2 theme for the plots below: blanks out the panel background
# and both major and minor grid lines, draws black x and y axis lines, gives
# legend keys a white fill, and sets the text size to 12.
cleanup <- theme(
  text = element_text(size = 12),
  legend.key = element_rect(fill = "white"),
  axis.line.x = element_line(color = "black"),
  axis.line.y = element_line(color = "black"),
  panel.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()
)
# Histogram of rows in `ccrb` by incident year, using one bin per year
# (binwidth = 1) and the shared `cleanup` theme.
ggplot(data = ccrb, mapping = aes(x = `Incident Year`)) +
  geom_histogram(binwidth = 1, fill = "red", color = "black") +
  labs(
    title = "Yearly number of incidents that led to complaints",
    x = "Incident Year",
    y = "Frequency"
  ) +
  cleanup
# Stacked bar chart: one bar per borough of occurrence, with each bar split
# (via fill) by incident location.
bar1 <- ggplot(
  data = ccrb,
  mapping = aes(x = `Borough of Occurrence`, fill = `Incident Location`)
)
# geom_bar() counts rows per x value by default (stat = "count").
bar1 +
  geom_bar() +
  labs(
    title = "Incidents that led to complaints by location",
    x = "Borough of Occurrence",
    y = "Frequency"
  ) +
  theme_classic()
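## The pie chart below shows that Brooklyn has the highest percentage of incidents that led to complaints (35.3%). Among the five boroughs, Staten Island has the lowest percentage (4.5%); the "NA" and "Outside NYC" categories are the smallest overall (0.2% and 0.1%).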
# Count the rows of `ccrb` for each borough of occurrence.
pie1 <- table(ccrb[["Borough of Occurrence"]])
pie1
##
## Bronx Brooklyn Manhattan NA Outside NYC
## 49442 72215 42104 483 170
## Queens Staten Island
## 30883 9100

# Express each count as a percentage of the total, rounded to one decimal.
prop.pie1 <- round(100 * pie1 / sum(pie1), digits = 1)
prop.pie1
##
## Bronx Brooklyn Manhattan NA Outside NYC
## 24.2 35.3 20.6 0.2 0.1
## Queens Staten Island
## 15.1 4.5

# Slice labels are the percentages themselves, e.g. "24.2%".
newpie <- paste0(prop.pie1, "%")
pie(
  x = prop.pie1,
  labels = newpie,
  col = rainbow(7),
  border = "white",
  main = "Proportion of incidents per borough of occurrence"
)
# The legend maps colours to borough names; the order follows the table above.
legend(
  x = "topright",
  legend = c(
    "Bronx", "Brooklyn", "Manhattan", "NA", "Outside NYC", "Queens",
    "Staten Island"
  ),
  fill = rainbow(7),
  cex = 0.9
)
# Count the rows of `ccrb` for each encounter outcome.
pie2 <- table(ccrb$`Encounter Outcome`)
pie2
##
## Arrest No Arrest or Summons Other/NA
## 89139 82964 1050
## Summons
## 31244

# Percentage share of each encounter outcome, rounded to one decimal.
# This must be computed from pie2 (the outcome table), not from the borough
# table pie1; otherwise the chart shows borough shares under outcome labels.
prop.pie2 <- round(100 * pie2 / sum(pie2), 1)
prop.pie2
##
## Arrest No Arrest or Summons Other/NA
## 43.6 40.6 0.5
## Summons
## 15.3

# Slice labels are the percentages themselves, e.g. "43.6%".
newpie <- paste0(prop.pie2, "%")
pie(
  prop.pie2,
  labels = newpie,
  border = "white",
  col = rainbow(length(pie2)),
  main = "Percentage of Encounter Outcome"
)
# Legend labels come from the table itself so they always match the slices
# in both spelling and order.
legend(
  "topright",
  legend = names(pie2),
  cex = 0.9,
  fill = rainbow(length(pie2))
)
# Stacked bar chart: one bar per borough of occurrence, with each bar split
# (via fill) by encounter outcome.
bar3 <- ggplot(
  ccrb,
  aes(x = `Borough of Occurrence`, fill = `Encounter Outcome`)
)
bar3 +
  geom_bar() +
  labs(
    title = "Encounter outcome by borough",
    x = "Borough of Occurrence", # spelling matches the column name
    y = "Frequency"
  ) +
  theme_classic()
# Horizontal stacked bar chart: one bar per allegation (FADO) type, with each
# bar split (via fill) by whether the complaint was fully investigated.
bar4 <- ggplot(
  ccrb,
  aes(x = `Allegation FADO Type`, fill = `Is Full Investigation`)
)
# coord_flip() swaps the displayed axes, but labels are still assigned by
# aesthetic: `x` labels the allegation-type axis and `y` labels the count axis.
# Bar length counts ALL rows for the allegation type; the fill colour, not the
# bar length, distinguishes fully investigated complaints.
bar4 +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Number of complaints by allegation type",
    x = "Allegation Type",
    y = "Number of complaints"
  ) +
  theme_classic()
# Stacked bar chart: one bar per value of the video-evidence flag, with each
# bar split (via fill) by whether the complaint was fully investigated.
bar5 <- ggplot(
  data = ccrb,
  mapping = aes(
    x = `Complaint Has Video Evidence`,
    fill = `Is Full Investigation`
  )
)
bar5 +
  geom_bar() +
  labs(
    title = "Complaints that have video evidence & are fully investigated",
    x = "Complaint has video evidence",
    y = "Frequency"
  ) +
  theme_classic()
# Scatter plot of close year against received year, with points coloured by
# whether the complaint was fully investigated and a red linear-model
# (method = "lm") trend line drawn over them.
scatter <- ggplot(
  data = ccrb,
  mapping = aes(x = `Received Year`, y = `Close Year`)
) +
  geom_point(mapping = aes(colour = `Is Full Investigation`)) +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Relationship between when a complaint is received and when it is closed"
  )
scatter
## `geom_smooth()` using formula 'y ~ x'
# Count the rows of `ccrb` for each mode by which the complaint was filed.
pie3 <- table(ccrb[["Complaint Filed Mode"]])
pie3
##
## Call Processing System E-mail Fax
## 42447 799 356
## In-person Mail On-line website
## 9586 3424 14197
## Phone
## 133588

# Express each count as a percentage of the total, rounded to one decimal.
prop.pie3 <- round(100 * pie3 / sum(pie3), digits = 1)
prop.pie3
##
## Call Processing System E-mail Fax
## 20.8 0.4 0.2
## In-person Mail On-line website
## 4.7 1.7 6.9
## Phone
## 65.4

# Slice labels are the percentages themselves, e.g. "65.4%".
newpie <- paste0(prop.pie3, "%")
pie(
  x = prop.pie3,
  labels = newpie,
  col = rainbow(7),
  border = "white",
  main = "Percentage of Complaints filed by mode"
)
# The legend maps colours to filing modes; the order follows the table above.
legend(
  x = "bottom",
  legend = c(
    "Call Processing System", "Email", "Fax", "In-Person", "Mail",
    "On-line website", "Phone"
  ),
  fill = rainbow(7),
  cex = 0.9
)