Load the Cleansed Dataset

# Restore the objects saved in the cleaned stop-and-frisk file into the
# workspace. The code below reads a data frame named `dat`, presumably
# provided by this file.
load(file = "SQF_clean.rda")

Investigate Data

First, load the ggplot2 package.

library(ggplot2)
  1. Most people stopped by the NYPD were young.
# Histogram of the age of the person stopped, one bin per year from 10 to 90.
# `age` is mapped by bare column name so ggplot looks it up in `data`;
# `dat$age` inside aes() bypasses the data argument.
# Each bar is filled according to its own bin count (green = low, red = high).
# Non-finite ages are dropped by stat_bin with a warning.
ggplot(data = dat, aes(x = age)) +
  geom_histogram(breaks = seq(10, 90, by = 1), aes(fill = ..count..)) +
  scale_fill_gradient("Count", low = "green", high = "red") +
  ggtitle("Histogram of Age") +
  labs(x = "Age", y = "Count")
## Warning: Removed 212 rows containing non-finite values (stat_bin).

  1. Fewer people are stopped in December.
# Two-digit month ("01" .. "12") of each stop, taken from the stop date.
# Assumes `datestop` is a Date/POSIXct column -- TODO confirm.
month <- format(dat$datestop, "%m")

# Number of stops per calendar month.
ggplot(dat, aes(x = month)) +
  geom_bar() +
  labs(title = "Barplot of Month", x = "Month of Year", y = "Count")

  1. Fewer people are stopped on Sunday or Monday.
# Day of the week of each stop, as a factor whose levels are in calendar order.
# "%u" yields the weekday number (1 = Monday ... 7 = Sunday) independently of
# the session locale; mapping it to fixed labels keeps the bars in Monday-to-
# Sunday order instead of the alphabetical order that plain "%A" strings give.
weekday <- factor(
  format(dat$datestop, "%u"),
  levels = as.character(1:7),
  labels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday")
)

# Number of stops per day of the week.
ggplot(data = dat, aes(x = weekday)) +
  geom_bar() +
  ggtitle("Barplot of Weekday") +
  labs(x = "Weekday", y = "Count")

  1. Fewer people are stopped in the early morning.
# Number of stops for each value of `timestop` (labelled as hour of day on
# the axis; presumably the column already holds the hour -- verify).
ggplot(dat, aes(x = timestop)) +
  geom_bar() +
  labs(title = "Barplot of Hour", x = "Hour of Day", y = "Count")

  1. Black males are stopped more often than any other group.
# Number of stops per race, drawn as one panel per sex.
# The panels come from facet_grid(); nothing is mapped to `fill`, so the title
# says "faceted" rather than "filled". An explicit group = sex aesthetic is
# unnecessary because each facet already contains a single sex.
ggplot(data = dat, aes(x = race)) +
  facet_grid(~sex) +
  geom_bar() +
  ggtitle("Barplot of Race, faceted by Sex") +
  labs(x = "Race", y = "Count")

  1. Brooklyn and Queens record more stops than other areas.
# Number of stops per city (borough), with each bar split by race
# through the fill colour.
ggplot(dat, aes(x = city, fill = race)) +
  geom_bar() +
  labs(title = "Barplot of City, filled by Race", x = "City", y = "Count")

  1. Correlation between some result items and the force used. Intuition: if handcuffs are used, the people stopped are more likely to be searched and arrested.
library(corrplot)

# Outcome columns of a stop.
result <- c("arstmade", "sumissue", "frisked", "searched")
# Every column whose name contains "pf_" (presumably the physical-force flags).
force <- grep("pf_", names(dat), value = TRUE)

# Correlate all selected columns, then keep only the outcome-by-force block.
dd <- dat[, c(result, force)]
cc <- cor(dd)[result, force]

# Transposed so the "pf_" columns become rows and the outcomes become columns.
corrplot(t(cc), method = "ellipse")

  1. Correlation between some result items and the reason for the stop. Intuition: people showing furtive movements or a suspicious bulge are more likely to be frisked.
library(corrplot)

# Outcome columns of a stop.
result <- c("arstmade", "sumissue", "frisked", "searched")
# NOTE: despite its name, `force` here holds every column whose name contains
# "cs_" (presumably the reason-for-stop flags), not the force columns.
force <- grep("cs_", names(dat), value = TRUE)

# Correlate all selected columns, then keep only the outcome-by-"cs_" block.
dd <- dat[, c(result, force)]
cc <- cor(dd)[result, force]

# Transposed so the "cs_" columns become rows and the outcomes become columns.
corrplot(t(cc), method = "ellipse")

  1. Plot the “ASSAULT(9)” and “BURGLARY(14)” incidents on the map. More assaults happen in Manhattan and the Bronx; more burglaries happen in Brooklyn and Queens.
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.3
# Download a map for the location "New York City" at zoom level 11.
NYC <- get_map(location = "New York City", zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
# Base plot built from the downloaded map; point layers are added to it later.
map <- ggmap(NYC)
# Stop locations as recorded in the data (projected x/y coordinates).
coords <- dat[, c("xcoord", "ycoord")]
# EPSG Projection 2263 - NAD83 / New York Long Island (ftUS)
# WGS84 Bounds: -74.2700, 40.4700, -71.7500, 41.3100
# Projected Bounds: 909126.0155, 110626.2880, 1610215.3590, 424498.0529
library(proj4) ### needs PROJ.4 installed (http://trac.osgeo.org/proj/)
# Inverse projection: convert the projected x/y values back to longitude and
# latitude so they can be drawn on the ggmap base map.
c2 <- project(coords, inverse=TRUE, proj="+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs")
coords <- data.frame(lon = c2[[1]], lat = c2[[2]])

# Attach the crime code to each coordinate pair.
d2 <- coords
d2$detailCM <- dat$detailCM

# Keep only codes "9" (assault) and "14" (burglary).
# This must be %in%, not ==: `x == c("9", "14")` recycles the two values along
# the rows, so each row is compared with only one of the codes. About half of
# the matching rows are silently dropped and R warns that the longer object
# length is not a multiple of the shorter one. %in% tests every row against
# both codes and never returns NA.
d2 <- d2[d2$detailCM %in% c("9", "14"), ]

# Drop rows whose coordinates are missing.
d2 <- na.omit(d2)

# Overlay the selected incidents on the base map, coloured by crime code.
map +
  geom_point(aes(x = lon, y = lat, colour = detailCM), data = d2, alpha = .6)
## Warning: Removed 583 rows containing missing values (geom_point).