Directions

The objective of this assignment is to conduct an exploratory data analysis of a data set that you are not familiar with. In this week's lecture we discussed a number of visualization approaches to exploring a data set; this assignment will apply those tools and techniques. An important distinction between class examples and applied data science work is the iterative and repetitive nature of exploring a data set. It takes time to understand what is in the data and what is interesting in the data.

For this week we will be exploring data from the NYC Data Transparency Initiative. They maintain a database of complaints that fall within the Civilian Complaint Review Board (CCRB), an independent municipal agency. Your objective is to identify interesting patterns and trends within the data that may be indicative of large-scale trends.

This link will allow you to download the data set in .xlsx format. The data file has two tabs: one with metadata, and the “Complaints_Allegations” tab with the actual data.

library(ggplot2)
library(ggthemes)
# Load the CCRB export into a data frame and print a per-column summary.
# NOTE(review): this absolute path is specific to one machine; consider a
# project-relative path so the script can run elsewhere.
ccrb <- read.csv("C:/Users/xierong/Desktop/ccrb.csv")
summary(ccrb)
##       DateStamp      UniqueComplaintId   Close.Year   Received.Year 
##  11/29/2016:204397   Min.   :    1     Min.   :2006   Min.   :1999  
##                      1st Qu.:17356     1st Qu.:2008   1st Qu.:2007  
##                      Median :34794     Median :2010   Median :2009  
##                      Mean   :34778     Mean   :2010   Mean   :2010  
##                      3rd Qu.:52204     3rd Qu.:2013   3rd Qu.:2012  
##                      Max.   :69492     Max.   :2016   Max.   :2016  
##                                                                     
##    Borough.of.Occurrence Is.Full.Investigation
##  Bronx        :49442     Mode :logical        
##  Brooklyn     :72215     FALSE:107084         
##  Manhattan    :42104     TRUE :97313          
##  Outside NYC  :  170                          
##  Queens       :30883                          
##  Staten Island: 9100                          
##  NA's         :  483                          
##  Complaint.Has.Video.Evidence             Complaint.Filed.Mode
##  Mode :logical                Call Processing System: 42447   
##  FALSE:195530                 E-mail                :   799   
##  TRUE :8867                   Fax                   :   356   
##                               In-person             :  9586   
##                               Mail                  :  3424   
##                               On-line website       : 14197   
##                               Phone                 :133588   
##        Complaint.Filed.Place Complaint.Contains.Stop...Frisk.Allegations
##  CCRB             :130877    Mode :logical                              
##  IAB              : 69214    FALSE:119856                               
##  Precinct         :  3548    TRUE :84541                                
##  Other City agency:   295                                               
##  Mayor's Office   :   157                                               
##  Other            :   110                                               
##  (Other)          :   196                                               
##             Incident.Location  Incident.Year             Encounter.Outcome
##  Street/highway      :123274   Min.   :1999   Arrest              :89139  
##  Apartment/house     : 34720   1st Qu.:2007   No Arrest or Summons:82964  
##  Residential building: 12421   Median :2009   Other/NA            : 1050  
##  Police building     :  8968   Mean   :2010   Summons             :31244  
##  Subway station/train:  6077   3rd Qu.:2012                               
##  (Other)             : 15581   Max.   :2016                               
##  NA's                :  3356                                              
##                                 Reason.For.Initial.Contact
##  PD suspected C/V of violation/crime - street:60107       
##  Other                                       :39030       
##  PD suspected C/V of violation/crime - bldg  :16067       
##  PD suspected C/V of violation/crime - auto  :12953       
##  Moving violation                            : 8843       
##  (Other)                                     :66542       
##  NA's                                        :  855       
##          Allegation.FADO.Type
##  Abuse of Authority:102173   
##  Discourtesy       : 34452   
##  Force             : 61761   
##  Offensive Language:  6008   
##  NA's              :     3   
##                              
##                              
##                            Allegation.Description
##  Physical force                       :44116     
##  Word                                 :31704     
##  Stop                                 :12944     
##  Search (of person)                   :12250     
##  Refusal to provide name/shield number:10359     
##  (Other)                              :93021     
##  NA's                                 :    3

1st Graph to show Number of Complaints closed each year

# Histogram of rows per closing year, one bar per calendar year.
# Columns are named bare inside aes() so ggplot2 looks them up in `ccrb`
# itself; `ccrb$...` bypasses the data argument and breaks with facets or
# a swapped data set.
# binwidth = 1 keeps each bar to a single year; a wider bin would pool
# several years into one bar.
# NOTE(review): the summary output reports 204397 rows while
# UniqueComplaintId only reaches 69492, so rows look like allegations
# rather than unique complaints — confirm before reading the y axis as a
# complaint count.
ggplot(ccrb, aes(x = Close.Year)) +
  geom_histogram(fill = "red", binwidth = 1) +
  labs(
    x = "Closed Year",
    y = "Number of Complaints",
    title = "Cases closed year-wise"
  )

2nd Graph to show Number of Complaints received each year

# Frequency polygon of rows per received year, one line per borough.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
# binwidth = 1 yields one vertex per calendar year.
# The borough is mapped to `colour`, so the legend title has to be set on
# the colour scale; a fill scale has no effect on this plot.
ggplot(ccrb, aes(x = Received.Year, colour = Borough.of.Occurrence)) +
  geom_freqpoly(binwidth = 1) +
  labs(
    x = "Received Year",
    y = "Number of Complaints",
    title = "Number of Complaints Received"
  ) +
  scale_colour_discrete(name = "Borough")

3rd Graph to show the frequency of Incident Occurrence by Borough and Type

# Stacked bar chart of row counts per borough, split by FADO allegation type.
# geom_bar() counts rows per x value by default, so it replaces
# geom_histogram(stat = "count") without the "Ignoring unknown parameters:
# binwidth, bins, pad" warning that combination emits.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
ggplot(ccrb, aes(x = Borough.of.Occurrence, fill = Allegation.FADO.Type)) +
  geom_bar() +
  labs(
    title = "Frequency of Incident Occurrence by Borough and Type",
    x = "Borough of Occurrence",
    y = "Frequency of Occurrence"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Allegation Type")

4th Graph to show Investigation by Evidence

# Stacked bar chart of row counts by whether the case was fully
# investigated, split by whether the complaint has video evidence.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`;
# geom_bar() already counts rows, so stat = "count" is left implicit.
ggplot(
  ccrb,
  aes(x = Is.Full.Investigation, fill = Complaint.Has.Video.Evidence)
) +
  geom_bar() +
  labs(
    title = "Investigation by Evidence",
    x = "Is Full Investigation",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Has Video Evidence")

5th Graph to show Number of Incidents That Occurred Each Year by Evidence

# Stacked bar chart of row counts per incident year, split by whether the
# complaint has video evidence.
# geom_bar() counts rows per x value by default, so it replaces
# geom_histogram(stat = "count") without the "Ignoring unknown parameters:
# binwidth, bins, pad" warning that combination emits.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
ggplot(ccrb, aes(x = Incident.Year, fill = Complaint.Has.Video.Evidence)) +
  geom_bar() +
  labs(
    title = "Number of Incident Occurred Each Year by Evidence",
    x = "Incident Year",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Has Video Evidence")

6th Graph to show Number of Cases Closed Each Year by Investigation

# Stacked bar chart of row counts per closing year, split by whether the
# case received a full investigation.
# geom_bar() counts rows per x value by default, so it replaces
# geom_histogram(stat = "count") without the "Ignoring unknown parameters:
# binwidth, bins, pad" warning that combination emits.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
ggplot(ccrb, aes(x = Close.Year, fill = Is.Full.Investigation)) +
  geom_bar() +
  labs(
    title = "Number of Case Closed Each Year by Investigation",
    x = "Close Year",
    y = "Number"
  ) +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name = "Is Full Investigation")

7th Graph to show Encounter Outcome by the location

# Stacked bar chart of row counts per place the complaint was filed,
# coloured by encounter outcome. The x-axis labels are rotated 90 degrees
# so the place names do not overlap.
ggplot(
  data = ccrb,
  mapping = aes(x = Complaint.Filed.Place, fill = Encounter.Outcome)
) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  labs(
    title = "Encounter Outcome by Location of Claimed Filed",
    x = "Location of Claim filed"
  ) +
  guides(fill = guide_legend(title = "Encounter Outcome"))

8th Graph to show the Outcome by the filing method

# Stacked bar chart of row counts per filing method, coloured by encounter
# outcome. The x-axis labels are rotated 90 degrees and centred on their
# ticks.
ggplot(
  data = ccrb,
  mapping = aes(x = Complaint.Filed.Mode, fill = Encounter.Outcome)
) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)
  ) +
  labs(
    title = "Encounter Outcome by Filing Method",
    x = "Filing method"
  ) +
  guides(fill = guide_legend(title = "Encounter Outcome"))

9th Graph to show the incident location by the different boroughs.

# Horizontal stacked bar chart of row counts per incident location, split
# by borough. coord_flip() puts the location names on the vertical axis so
# they stay readable.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
ggplot(ccrb, aes(x = Incident.Location, fill = Borough.of.Occurrence)) +
  geom_bar() +
  labs(x = "Incident Location") +
  coord_flip() +
  scale_fill_discrete(name = "Borough")

10th Graph to show the linear relationship between Incident Year and closing year.

# Scatter plot of closing year against incident year with a fitted
# linear-model trend line.
# Columns are named bare inside aes() so ggplot2 resolves them in `ccrb`.
# NOTE(review): both columns look like whole years in the summary output,
# so many rows presumably land on the same point; geom_count() or
# transparency may show the density better — confirm against the data.
ggplot(ccrb, aes(x = Incident.Year, y = Close.Year)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Incident Year", y = "Close Year")