Load the Cleansed Dataset
library(arules)
## Warning: package 'arules' was built under R version 3.2.3
## Loading required package: Matrix
##
## Attaching package: 'arules'
##
## The following objects are masked from 'package:base':
##
## %in%, abbreviate, write
load("SQF_clean.rda")
datestop <- dat[,"datestop"]
dat[,"datestop"] <- discretize(datestop,method="interval",categories=12)
timestop <- dat[,"timestop"]
dat[,"timestop"] <- discretize(timestop,method="interval",categories=24)
age <- dat[,"age"]
dat[,"age"] <- discretize(timestop,method="frequency",categories= 3)
perobs <- dat[,"perobs"]
dat[,"perobs"] <- discretize(perobs,method="frequency",categories= 3)
perstop <- dat[,"perstop"]
dat[,"perstop"] <- discretize(perstop,method="frequency",categories= 3)
dat$armed <- dat$pistol | dat$riflshot | dat$asltweap | dat$knifcuti |
dat$machgun | dat$othrweap | dat$contrabn
for (i in c("contrabn","pistol","riflshot","asltweap","knifcuti","machgun","othrweap")){
dat[,i]<-NULL
}
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
library(proj4)
NYC <- get_map("New York City", zoom=11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
map <- ggmap(NYC)
map

d1 <- dat[ dat$arstmade & dat$detailCM == "14", ]
sel <- complete.cases(d1[, c("xcoord","ycoord")])
d1 <- d1[sel,c("xcoord","ycoord")]
plot(d1)

c1 <- project(d1, inverse=TRUE, proj="+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs")
coords <- data.frame(lon=c1[[1]], lat=c1[[2]])
map + geom_point(aes(x=lon, y=lat), data=coords, alpha=.5)
## Warning: Removed 67 rows containing missing values (geom_point).

thresholds <- length(levels(dat$city)):length(levels(dat$pct))
WSS <- sapply(thresholds, FUN=function(k) {
kmeans(d1, centers=k, nstart=5)$tot.withinss
})
plot(thresholds, WSS, type="l")

nc <- thresholds[which.min(WSS)]
nc
## [1] 77
library(cluster)
library(fpc)
km <- kmeans(d1, centers=nc, nstart=10)
clusters <- factor(km$cluster)
map + geom_point(aes(x=lon, y=lat, colour=clusters), data=coords, alpha=.5) + guides("cluster", col=guide_legend(ncol=5))
## Warning: Removed 67 rows containing missing values (geom_point).

hc <- hclust(dist(d1),method="complete")
clusters <- factor(cutree(hc,nc))
map + geom_point(aes(x=lon, y=lat, colour=clusters), data=coords, alpha=.5) + guides("cluster",col=guide_legend(ncol=5))
## Warning: Removed 67 rows containing missing values (geom_point).

sapply(list(
km=km$cluster,
hc_compl=cutree(hc,nc)),
FUN=function(x)
cluster.stats(dist(d1), x))[c("within.cluster.ss","avg.silwidth"),]
## km hc_compl
## within.cluster.ss 2245476859 1844980327
## avg.silwidth 0.4930952 0.501612
truth = as.integer(dat$pct[dat$arstmade==TRUE & dat$detailCM=="14"][sel])
sapply(list(
km=km$cluster,
hc_compl=cutree(hc,nc)),
FUN=function(x)
cluster.stats(dist(d1), x, truth))[c("corrected.rand"),]
## Warning in cluster.stats(dist(d1), x, truth): alt.clustering renumbered
## because maximum != number of clusters
## Warning in cluster.stats(dist(d1), x, truth): alt.clustering renumbered
## because maximum != number of clusters
## $km
## [1] 0.4529588
##
## $hc_compl
## [1] 0.5032786