The objective of this assignment is to conduct an exploratory data analysis of a data set that you are not familiar with. In this week's lecture we discussed a number of visualization approaches to exploring a data set; this assignment will apply those tools and techniques. An important distinction between class examples and applied data science work is the iterative and repetitive nature of exploring a data set. It takes time to understand what is in the data and what is interesting in the data.
For this week we will be exploring data from the NYC Data Transparency Initiative. They maintain a database of complaints that fall within the jurisdiction of the Civilian Complaint Review Board (CCRB), an independent municipal agency. Your objective is to identify interesting patterns and trends within the data that may be indicative of large-scale trends.
This link will allow you to download the data set in .xlsx format. The data file has two tabs: one with metadata, and the “Complaints_Allegations” tab with the actual data.
library(ggplot2)
library(ggthemes)
# Load the CCRB complaints/allegations export and print a per-column summary.
#
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, and summary() only reports per-level
# counts (as in the output shown below) when those columns are factors.
#
# NOTE(review): the path is machine-specific; point it at your local copy of
# the CSV before running.
ccrb <- read.csv(
  "C:/Users/xierong/Desktop/ccrb.csv",
  stringsAsFactors = TRUE
)
summary(ccrb)
## DateStamp UniqueComplaintId Close.Year Received.Year
## 11/29/2016:204397 Min. : 1 Min. :2006 Min. :1999
## 1st Qu.:17356 1st Qu.:2008 1st Qu.:2007
## Median :34794 Median :2010 Median :2009
## Mean :34778 Mean :2010 Mean :2010
## 3rd Qu.:52204 3rd Qu.:2013 3rd Qu.:2012
## Max. :69492 Max. :2016 Max. :2016
##
## Borough.of.Occurrence Is.Full.Investigation
## Bronx :49442 Mode :logical
## Brooklyn :72215 FALSE:107084
## Manhattan :42104 TRUE :97313
## Outside NYC : 170
## Queens :30883
## Staten Island: 9100
## NA's : 483
## Complaint.Has.Video.Evidence Complaint.Filed.Mode
## Mode :logical Call Processing System: 42447
## FALSE:195530 E-mail : 799
## TRUE :8867 Fax : 356
## In-person : 9586
## Mail : 3424
## On-line website : 14197
## Phone :133588
## Complaint.Filed.Place Complaint.Contains.Stop...Frisk.Allegations
## CCRB :130877 Mode :logical
## IAB : 69214 FALSE:119856
## Precinct : 3548 TRUE :84541
## Other City agency: 295
## Mayor's Office : 157
## Other : 110
## (Other) : 196
## Incident.Location Incident.Year Encounter.Outcome
## Street/highway :123274 Min. :1999 Arrest :89139
## Apartment/house : 34720 1st Qu.:2007 No Arrest or Summons:82964
## Residential building: 12421 Median :2009 Other/NA : 1050
## Police building : 8968 Mean :2010 Summons :31244
## Subway station/train: 6077 3rd Qu.:2012
## (Other) : 15581 Max. :2016
## NA's : 3356
## Reason.For.Initial.Contact
## PD suspected C/V of violation/crime - street:60107
## Other :39030
## PD suspected C/V of violation/crime - bldg :16067
## PD suspected C/V of violation/crime - auto :12953
## Moving violation : 8843
## (Other) :66542
## NA's : 855
## Allegation.FADO.Type
## Abuse of Authority:102173
## Discourtesy : 34452
## Force : 61761
## Offensive Language: 6008
## NA's : 3
##
##
## Allegation.Description
## Physical force :44116
## Word :31704
## Stop :12944
## Search (of person) :12250
## Refusal to provide name/shield number:10359
## (Other) :93021
## NA's : 3
# Histogram of records by the year the case was closed, in 2-year bins.
#
# Columns are referenced by bare name inside aes() so that ggplot2 resolves
# them against the `data` argument; writing `ccrb$Close.Year` inside aes()
# bypasses that lookup, which ggplot2 discourages and which breaks faceting.
#
# NOTE(review): the summary shows 204397 rows but a maximum UniqueComplaintId
# of 69492, so rows look like allegations rather than complaints -- confirm
# that "Number of Complaints" is the intended axis label.
ggplot(ccrb, aes(x = Close.Year)) +
  geom_histogram(fill = "red", binwidth = 2) +
  labs(
    x = "Closed Year",
    y = "Number of Complaints",
    title = "Cases closed year-wise"
  )
# Frequency polygons of records received per year (2-year bins), one line per
# borough of occurrence.
#
# The borough is mapped to `colour`, so the legend title must be set on the
# colour scale; the previous scale_fill_discrete() call targeted an aesthetic
# this plot does not use and therefore never renamed the legend.
# Columns are referenced by bare name so aes() resolves them against `ccrb`.
ggplot(ccrb, aes(x = Received.Year, colour = Borough.of.Occurrence)) +
  geom_freqpoly(binwidth = 2) +
  labs(
    x = "Received Year",
    y = "Number of Complaints",
    title = "Number of Complaints Received"
  ) +
  scale_colour_discrete(name = "Borough")
# Stacked bar chart of record counts per borough, filled by FADO allegation
# type.
#
# geom_bar() is the counting geom for a discrete x; the earlier
# geom_histogram(stat = "count") drew the same bars but emitted
# "Ignoring unknown parameters: binwidth, bins, pad".
# Columns are referenced by bare name so aes() resolves them against `ccrb`.
# The spelling of "Occurrence" in the title and x label is corrected.
ggplot(ccrb, aes(x = Borough.of.Occurrence, fill = Allegation.FADO.Type)) +
  geom_bar() +
  labs(
    title = "Frequency of Incident Occurrence by Borough and Type",
    x = "Borough of Occurrence",
    y = "Frequency of Occurrence"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Allegation Type")
# Stacked bar chart of record counts by whether a full investigation took
# place, filled by whether the complaint has video evidence.
#
# Columns are referenced by bare name so aes() resolves them against `ccrb`;
# `ccrb$...` inside aes() bypasses the data argument and is discouraged by
# ggplot2. stat = "count" is geom_bar()'s default, so it is not repeated.
ggplot(
  ccrb,
  aes(x = Is.Full.Investigation, fill = Complaint.Has.Video.Evidence)
) +
  geom_bar() +
  labs(
    title = "Investigation by Evidence",
    x = "Is Full Investigation",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Has Video Evidence")
# Stacked bar chart of record counts per incident year, filled by whether the
# complaint has video evidence.
#
# geom_bar() counts rows per distinct x value; the earlier
# geom_histogram(stat = "count") drew the same bars but emitted
# "Ignoring unknown parameters: binwidth, bins, pad".
# Columns are referenced by bare name so aes() resolves them against `ccrb`.
ggplot(ccrb, aes(x = Incident.Year, fill = Complaint.Has.Video.Evidence)) +
  geom_bar() +
  labs(
    title = "Number of Incident Occurred Each Year by Evidence",
    x = "Incident Year",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Has Video Evidence")
# Stacked bar chart of record counts per close year, filled by whether a full
# investigation took place.
#
# geom_bar() counts rows per distinct x value; the earlier
# geom_histogram(stat = "count") drew the same bars but emitted
# "Ignoring unknown parameters: binwidth, bins, pad".
# Columns are referenced by bare name so aes() resolves them against `ccrb`.
ggplot(ccrb, aes(x = Close.Year, fill = Is.Full.Investigation)) +
  geom_bar() +
  labs(
    title = "Number of Case Closed Each Year by Investigation",
    x = "Close Year",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Is Full Investigation")
# Stacked bar chart of record counts per place the complaint was filed,
# filled by encounter outcome. X-axis tick labels are rotated 90 degrees.
ggplot(ccrb, aes(x = Complaint.Filed.Place, fill = Encounter.Outcome)) +
  geom_bar() +
  labs(
    title = "Encounter Outcome by Location of Claimed Filed",
    x = "Location of Claim filed"
  ) +
  guides(fill = guide_legend(title = "Encounter Outcome")) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )
# Stacked bar chart of record counts per filing method, filled by encounter
# outcome. X-axis tick labels are rotated 90 degrees and centred.
ggplot(ccrb, aes(x = Complaint.Filed.Mode, fill = Encounter.Outcome)) +
  geom_bar() +
  labs(
    title = "Encounter Outcome by Filing Method",
    x = "Filing method"
  ) +
  guides(fill = guide_legend(title = "Encounter Outcome")) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)
  )
# Horizontal stacked bar chart of record counts per incident location, filled
# by borough of occurrence. coord_flip() puts the location names on the
# vertical axis.
#
# Columns are referenced by bare name so aes() resolves them against `ccrb`;
# `ccrb$...` inside aes() bypasses the data argument and is discouraged by
# ggplot2. The legend title spelling is corrected to "Borough".
ggplot(ccrb, aes(x = Incident.Location, fill = Borough.of.Occurrence)) +
  geom_bar() +
  labs(x = "Incident Location") +
  coord_flip() +
  scale_fill_discrete(name = "Borough")
# Scatter plot of close year against incident year with a fitted linear
# regression line (geom_smooth with method "lm").
#
# Columns are referenced by bare name so aes() resolves them against `ccrb`;
# `ccrb$...` inside aes() bypasses the data argument and is discouraged by
# ggplot2.
#
# NOTE(review): both axes are whole years, so many rows share the same point
# and overplot; consider geom_count() if point density matters.
ggplot(ccrb, aes(x = Incident.Year, y = Close.Year)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Incident Year", y = "Close Year")