data <- read.csv("PDI_Use_of_Force.csv", header = TRUE)

Extra EDA

ggplot(data=data, aes(x=factor(data$OFFICER_GENDER), y=data$SUBJECT_RACE)) + geom_boxplot() + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))

ggplot(data=data, aes(y=factor(data$INCIDENT_DESCRIPTION), x=factor(data$DISTRICT))) + geom_boxplot() + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))

Question 2 Extra Analysis

data2 <- read.csv("Crime_Incidents.csv", header = TRUE)

data2$DISTRICT[data2$DISTRICT == "CENTRAL BUSINESS"] = "Central Business"
data2$DISTRICT[data2$DISTRICT == "OTHER"] = "Other"

data2$DISTRICT_ZIP[data2$DISTRICT == "Central Business"] = 45202

data2$DISTRICT_ZIP[data2$DISTRICT == "1"] = 45214

data2$DISTRICT_ZIP[data2$DISTRICT == "2"] = 45208

data2$DISTRICT_ZIP[data2$DISTRICT == "3"] = 45238

data2$DISTRICT_ZIP[data2$DISTRICT == "4"] = 45229

data2$DISTRICT_ZIP[data2$DISTRICT == "5"] = 45223

data2$DISTRICT_ZIP[data2$DISTRICT == "Other"] = 45219

crime_ag_zip <- aggregate(data.frame(count=data2$OFFENSE), list(zip=data2$DISTRICT_ZIP, district=data2$DISTRICT), FUN = length)
crime_ag_zip

##     zip         district count
## 1 45214                1 29598
## 2 45208                2 35773
## 3 45238                3 75981
## 4 45229                4 50931
## 5 45223                5 45480
## 6 45202 Central Business  6095
## 7 45219            Other    78

data(zip_codes)
zips<-zip_codes[zip_codes$city=="Cincinnati",]
zips<-zips[zips$zip %in% crime_ag_zip$zip,]

#url <- "http://www2.census.gov/geo/tiger/TIGER2017/ZCTA510/tl_2017_us_zcta510.zip"
downloaddir<-getwd()
#destname<-"tl_2017_us_zcta510.zip"
#download.file(url, destname)
#unzip(destname, exdir=downloaddir, junkpaths=TRUE)


filename<-list.files(downloaddir, pattern=".shp", full.names=FALSE)
filename<-gsub(".shp", "", filename)


dat<-readOGR(downloaddir, "tl_2017_us_zcta510")

## OGR data source with driver: ESRI Shapefile 
## Source: "E:/OneDrive/STAT 470/Police Data Case Study", layer: "tl_2017_us_zcta510"
## with 33144 features
## It has 9 fields
## Integer64 fields read as strings:  ALAND10 AWATER10

subdat<-dat[dat$GEOID10 %in% crime_ag_zip$zip,]

#subdat<-spTransform(subdat, CRS("+init=epsg:4326"))



subdat_data<-subdat@data[,c("GEOID10", "ALAND10", "AWATER10")]




subdat<-SpatialPolygonsDataFrame(subdat, data=subdat_data)

bins <- c(0, 25000, 50000, 75000, 100000)
pal <- colorBin("YlOrRd", domain = 1:100000, bins = bins)

labels <- sprintf(
  "<strong>%s</strong><br/>%g offences",
  crime_ag_zip$district, crime_ag_zip$count
) %>% lapply(htmltools::HTML)

labels1 <- sprintf(
  "<strong>%s</strong>",
  subdat$GEOID10
) %>% lapply(htmltools::HTML)

highlights <- highlightOptions(
    weight = 5,
    color = "#666",
    dashArray = "",
    fillOpacity = 0.7,
    bringToFront = TRUE)

labeloptions <- labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto")

cin <- leaflet(subdat) %>% setView(lat = 39.1031, lng = -84.5120, zoom = 12) %>% addProviderTiles(providers$CartoDB.Positron) %>% addPolygons(
  data=subdat, 
  fillColor = ~pal(crime_ag_zip$count),
  weight = 2,
  opacity = 1,
  color = "white",
  dashArray = "3",
  fillOpacity = 0.5,
  highlight = highlights,
  label = labels,
  labelOptions = labeloptions) %>%
  addLegend(pal = pal, values = ~1:100000, opacity = 0.7, title = 'No. of Offences',
  position = "bottomright")
  

cin

ggplot(data=crime_ag_zip, aes(x=crime_ag_zip$district, y=crime_ag_zip$count)) + geom_boxplot()

Question 3 Extra Analysis

data123 <- read.csv("E:/OneDrive/STAT 470/Police Data Case Study/Police_Calls_For_Service__with_Time_Dispatched___Transmit_Time_.csv", header = TRUE)

ggplot(data=data123, aes(x=data123$CREATE_TIME_TIMEONLY, y=factor(data123$INCIDENT_TYPE_ID))) + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))

#chisq.test(x=factor(data123$INCIDENT_TYPE_ID), y=data123$CREATE_TIME_TIMEONLY)
ggplot(data=data123, aes(x=data123$CREATE_TIME_TIMEONLY)) + geom_histogram(binwidth = 1, bins = 100) + labs(x="TIME OF DAY", y="Number of Offences", title="Histogram of Offences per Hour")

calls <- aggregate(data.frame(calls = data123$CREATE_TIME_INCIDENT), list(time = data123$CREATE_TIME_TIMEONLY), FUN = length)
calls$calls <- calls$calls/365

fit <- nls(calls ~ C + A*sin(2*pi*time*F), data=calls, start=list(A = 60, F = .08, C=60), trace=T)

## 190778.9 :  60.00  0.08 60.00
## 44101.26 :  -12.57128238   0.08038137 120.15355782
## 44025.7 :  -12.51370057   0.07886289 120.06720967
## 43729.19 :  -12.86148809   0.07598434 120.47555874
## 42153.3 :  -14.53329989   0.07008223 121.69285030
## 35143.25 :  -20.77751387   0.06050761 124.69671861
## 17744.14 :  -31.65156208   0.04740038 125.04874461
## 9748.549 :  -57.43044538   0.04555843 120.84071964
## 9616.175 :  -57.51449876   0.04614959 120.73075657
## 9614.363 :  -57.60950038   0.04621784 120.79771319
## 9614.336 :  -57.61304854   0.04622625 120.80322630
## 9614.336 :  -57.61337979   0.04622731 120.80387633
## 9614.336 :  -57.61341962   0.04622744 120.80395755

fitt <- fitted.values(fit)

summary(fit)

## 
## Formula: calls ~ C + A * sin(2 * pi * time * F)
## 
## Parameters:
##     Estimate Std. Error t value Pr(>|t|)    
## A -57.613420   6.469929  -8.905 1.42e-08 ***
## F   0.046227   0.001172  39.426  < 2e-16 ***
## C 120.803958   4.427618  27.284  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.4 on 21 degrees of freedom
## 
## Number of iterations to convergence: 12 
## Achieved convergence tolerance: 3.112e-06

plot(x=calls$time, y=calls$calls, xlab="Hour of the Day", ylab="Calls Made", main="Plot of Average Calls per Hour with Model Overlay", sub="Model: Calls = 120.804 - 57.613*sin(2*pi*Hour*0.046)")
lines(x=calls$time,predict(fit), col="RED")

Question 1 Extra Analysis

data1234 <- data123

plot(data1234$CREATE_TIME_TIMEONLY, data1234$ALARM_LEVEL)

chisq.test(y=factor(data1234$ALARM_LEVEL), x=data1234$CREATE_TIME_TIMEONLY)

## Warning in chisq.test(y = factor(data1234$ALARM_LEVEL), x =
## data1234$CREATE_TIME_TIMEONLY): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  data1234$CREATE_TIME_TIMEONLY and factor(data1234$ALARM_LEVEL)
## X-squared = 17441, df = 805, p-value < 2.2e-16

R CODE

data <- read.csv("PDI_Use_of_Force.csv", header = TRUE)
ggplot(data=data, aes(x=factor(data$OFFICER_GENDER), y=data$SUBJECT_RACE)) + geom_boxplot() + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))
ggplot(data=data, aes(y=factor(data$INCIDENT_DESCRIPTION), x=factor(data$DISTRICT))) + geom_boxplot() + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))
data2 <- read.csv("Crime_Incidents.csv", header = TRUE)
data2$DISTRICT[data2$DISTRICT == "CENTRAL BUSINESS"] = "Central Business"
data2$DISTRICT[data2$DISTRICT == "OTHER"] = "Other"

data2$DISTRICT_ZIP[data2$DISTRICT == "Central Business"] = 45202

data2$DISTRICT_ZIP[data2$DISTRICT == "1"] = 45214

data2$DISTRICT_ZIP[data2$DISTRICT == "2"] = 45208

data2$DISTRICT_ZIP[data2$DISTRICT == "3"] = 45238

data2$DISTRICT_ZIP[data2$DISTRICT == "4"] = 45229

data2$DISTRICT_ZIP[data2$DISTRICT == "5"] = 45223

data2$DISTRICT_ZIP[data2$DISTRICT == "Other"] = 45219
crime_ag_zip <- aggregate(data.frame(count=data2$OFFENSE), list(zip=data2$DISTRICT_ZIP, district=data2$DISTRICT), FUN = length)
crime_ag_zip
data(zip_codes)
zips<-zip_codes[zip_codes$city=="Cincinnati",]
zips<-zips[zips$zip %in% crime_ag_zip$zip,]
#url <- "http://www2.census.gov/geo/tiger/TIGER2017/ZCTA510/tl_2017_us_zcta510.zip"
downloaddir<-getwd()
#destname<-"tl_2017_us_zcta510.zip"
#download.file(url, destname)
#unzip(destname, exdir=downloaddir, junkpaths=TRUE)


filename<-list.files(downloaddir, pattern=".shp", full.names=FALSE)
filename<-gsub(".shp", "", filename)


dat<-readOGR(downloaddir, "tl_2017_us_zcta510") 
subdat<-dat[dat$GEOID10 %in% crime_ag_zip$zip,]

#subdat<-spTransform(subdat, CRS("+init=epsg:4326"))



subdat_data<-subdat@data[,c("GEOID10", "ALAND10", "AWATER10")]




subdat<-SpatialPolygonsDataFrame(subdat, data=subdat_data)


bins <- c(0, 25000, 50000, 75000, 100000)
pal <- colorBin("YlOrRd", domain = 1:100000, bins = bins)

labels <- sprintf(
  "<strong>%s</strong><br/>%g offences",
  crime_ag_zip$district, crime_ag_zip$count
) %>% lapply(htmltools::HTML)

labels1 <- sprintf(
  "<strong>%s</strong>",
  subdat$GEOID10
) %>% lapply(htmltools::HTML)

highlights <- highlightOptions(
    weight = 5,
    color = "#666",
    dashArray = "",
    fillOpacity = 0.7,
    bringToFront = TRUE)

labeloptions <- labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto")
cin <- leaflet(subdat) %>% setView(lat = 39.1031, lng = -84.5120, zoom = 12) %>% addProviderTiles(providers$CartoDB.Positron) %>% addPolygons(
  data=subdat, 
  fillColor = ~pal(crime_ag_zip$count),
  weight = 2,
  opacity = 1,
  color = "white",
  dashArray = "3",
  fillOpacity = 0.5,
  highlight = highlights,
  label = labels,
  labelOptions = labeloptions) %>%
  addLegend(pal = pal, values = ~1:100000, opacity = 0.7, title = 'No. of Offences',
  position = "bottomright")
  

cin


ggplot(data=crime_ag_zip, aes(x=crime_ag_zip$district, y=crime_ag_zip$count)) + geom_boxplot()
data123 <- read.csv("E:/OneDrive/STAT 470/Police Data Case Study/Police_Calls_For_Service__with_Time_Dispatched___Transmit_Time_.csv", header = TRUE)
ggplot(data=data123, aes(x=data123$CREATE_TIME_TIMEONLY, y=factor(data123$INCIDENT_TYPE_ID))) + geom_count(color="blue") + ggtitle("Boxplot with Overlayed Density Plot") + guides(size = guide_legend("Density"))
#chisq.test(x=factor(data123$INCIDENT_TYPE_ID), y=data123$CREATE_TIME_TIMEONLY)
ggplot(data=data123, aes(x=data123$CREATE_TIME_TIMEONLY)) + geom_histogram(binwidth = 1, bins = 100) + labs(x="TIME OF DAY", y="Number of Offences", title="Histogram of Offences per Hour")
calls <- aggregate(data.frame(calls = data123$CREATE_TIME_INCIDENT), list(time = data123$CREATE_TIME_TIMEONLY), FUN = length)
calls$calls <- calls$calls/365

fit <- nls(calls ~ C + A*sin(2*pi*time*F), data=calls, start=list(A = 60, F = .08, C=60), trace=T)

fitt <- fitted.values(fit)

summary(fit)

plot(x=calls$time, y=calls$calls, xlab="Hour of the Day", ylab="Calls Made", main="Plot of Average Calls per Hour with Model Overlay", sub="Model: Calls = 120.804 - 57.613*sin(2*pi*Hour*0.046)")
lines(x=calls$time,predict(fit), col="RED")
data1234 <- data123

plot(data1234$CREATE_TIME_TIMEONLY, data1234$ALARM_LEVEL)

chisq.test(y=factor(data1234$ALARM_LEVEL), x=data1234$CREATE_TIME_TIMEONLY)

Appendix

Chase Rosendale, Daniel Jang, Xuhui Jin, Zifan Chen

October 18, 2017

Extra EDA

Question 2 Extra Analysis

Question 3 Extra Analysis

Question 1 Extra Analysis

R CODE