By Srihari Mohan and Jack Sather

CrimeScore Algorithm

Custom CrimeScore Algorithm as the Foundation for Relational Scoring

There must be a way of quantifying the magnitude of crimes in a community, but data is only given on the numbers of different types of crimes (like murder, rape, burglary, etc.) committed annually within a community. The custom CrimeScore algorithm measures magnitude of crime by assigning each type of crime a score on a CrimeScale. For example, murder is scored as the highest crime and is assigned a key of 100 on the CrimeScale, while rape is assigned a key of 90. The CrimeScore is calculated by multiplying the number of each type of crime in a community by its key and by then dividing by the population of the community.

I've decided to call the process of creating scoring algorithms to simplify analyzing large data sets Relational Scoring. Relational Scoring helps find relationships between data, which would otherwise be hard to look for manually, by assigning scores to variables in the dataset and seeing how they relate to one other. The CrimeScore algorithm is an example of relational scoring because it sees how different variables (like the number of murders) relates to a target variable, like magnitude of crime.

R code for Reading Data and Calculating CrimeScore

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.1.2

library(grid)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.1.2

#reading in the data
setInternet2(use = TRUE)
crimedataraw <-read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/00211/CommViolPredUnnormalizedData.txt",
               header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "",
               stringsAsFactors = default.stringsAsFactors())
crimedatacleaned <-read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/00211/CommViolPredUnnormalizedData.txt",
               header = FALSE, sep = ",", quote = "\"", dec = ".", fill = TRUE, comment.char = "",
               na.strings="?",strip.white=TRUE,stringsAsFactors = default.stringsAsFactors())

#x-axis RacialMatchCommPol - remove all NA rows
names(crimedatacleaned)[112] <- "RacialMatchCommPol"
crimedatacleaned <- crimedatacleaned[-which(is.na(crimedatacleaned$RacialMatchCommPol)),]

# Field 2 = state code
names(crimedatacleaned)[2] <- "statecode"

#Filter by region: input regional code to produce graphic for that region
userInput <-"West"

#Field 130 = murders, field 132 = rapes, Field 134 = robberies, Field 136 = assaults, Field 138 = burglaries
names(crimedatacleaned)[130] <- "murders"
names(crimedatacleaned)[132] <- "rapes"
names(crimedatacleaned)[134] <- "robberies"
names(crimedatacleaned)[136] <- "assaults"
names(crimedatacleaned)[138] <- "burglaries"
names(crimedatacleaned)[142] <- "autoTheft"
names(crimedatacleaned)[146] <- "violentcrimes"
names(crimedatacleaned)[112] <- "RacialMatchCommPol"
names(crimedatacleaned)[6] <- "population"
names(crimedatacleaned)[8] <- "racepctblack"
names(crimedatacleaned)[9] <- "racepctWhite"
names(crimedatacleaned)[10] <- "racepctAsian"
names(crimedatacleaned)[11] <- "racepctHisp"

# Region codes
crimedatacleaned$region <- NA
names(crimedatacleaned)[148] <- "region"
west <- c("AZ","CO","ID","NM","MT","UT","NV","WY","AK","CA","HI","OR","WA")
south <- c("DE","DC","FL","GA","MD","NC","SC","VA","WV","AL","KY","MS","TN","AR","LA","OK","TX")
midwest <- c("IN","IL","MI","OH","WI","IO","NE","KS","ND","MN","SD","MO")
northeast <- c("CT","ME","MA","NH","RI","VT","NJ","NY","PA")

crimedatacleaned[which(crimedatacleaned$statecode %in% west),148] <- "West"
crimedatacleaned[which(crimedatacleaned$statecode %in% south),148] <- "South"
crimedatacleaned[which(crimedatacleaned$statecode %in% midwest),148] <- "MidWest"
crimedatacleaned[which(crimedatacleaned$statecode %in% northeast),148] <- "NorthEast"

#CrimeScore algorithm
crimedatacleaned$crimeScore <- (((crimedatacleaned$murders * 100) + (crimedatacleaned$rapes * 90)+ (crimedatacleaned$assaults * 70)+
                                    (crimedatacleaned$robberies *60)+ (crimedatacleaned$robberies *50)+ (crimedatacleaned$violentcrimes *80)+
                                    (crimedatacleaned$burglaries * 40))/(crimedatacleaned$population))

#Creating bins for percent population of each race
crimedatacleaned$racepctblackbin<-cut(crimedatacleaned$racepctblack,breaks=c(0,10,15,20,100),
                                      labels=c("0-10 \n% of pop","10-15 \n% of pop","15-20 \n% of pop", "20-100\n% of pop"))
crimedatacleaned$racepctWhitebin<-cut(crimedatacleaned$racepctWhite,breaks=c(0,40,60,80,90,100),
                                      labels=c("0-40 \n% of pop","40-60 \n% of pop","60-80 \n% of pop", "80-90\n% of pop","90-100\n% of pop"))
crimedatacleaned$racepctHispbin<-cut(crimedatacleaned$racepctHisp,breaks=c(0,10,15,20,100),
                                     labels=c("0-10 \n% of pop","10-15 \n% of pop","15-20 \n% of pop", "20-100\n% of pop"))
crimedatacleaned$racepctAsianbin<-cut(crimedatacleaned$racepctAsian,breaks=c(0,2,5,10,100),
                                      labels=c("0-2 \n% of pop","2-5 \n% of pop","5-10 \n% of pop", "10-100\n% of pop"))

#subsetted dataframe for just region (userInput)
crimedatasubset <- crimedatacleaned[which(crimedatacleaned[148]==userInput),]

#create graphics plots
firstPart <- ggplot(crimedatasubset,aes(RacialMatchCommPol,crimeScore))+geom_point(,na.rm=TRUE)

finalPart <-  theme(axis.text.x=element_text(colour="slateblue4",size=12,face="bold"))+
  theme(axis.text.y=element_text(colour="slateblue4",size=12,face="bold"))+
  theme(axis.title.x=element_text(colour="slateblue4",size=16,face="bold"))+
  theme(axis.title.y=element_text(colour="slateblue4",size=16,face="bold"))+
  theme(plot.title=element_text(colour="slateblue4", face="bold", size=20))+
  theme(axis.text.x = element_text(angle=90,vjust=0.5, hjust=1,face="bold"))+
  theme(axis.ticks = element_line(colour = "slateblue4"))+
  theme(strip.text = element_text(size=12,face="bold"))+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
  theme(strip.text = element_text(size=12,face="bold"))

p1<- firstPart + facet_grid(racepctblackbin ~ .,)+geom_jitter(na.rm=TRUE)+
  xlab("Racial match between the community \n and the police force")+ylab("CrimeScore")+
  ggtitle("Black community")+theme_bw()+xlim(0, 100)+finalPart

p2<-firstPart+facet_grid(racepctWhitebin ~ .)+geom_jitter(na.rm=TRUE)+
  xlab("Racial match between the community \n and the police force")+ylab("CrimeScore")+
  ggtitle("White community")+theme_bw() +xlim(0, 100)+finalPart

p3 <- firstPart+facet_grid(racepctHispbin ~ .)+geom_jitter(na.rm=TRUE)+
  xlab("Racial match between the community \n and the police force")+ylab("CrimeScore")+
  ggtitle("Hispanic community")+theme_bw() +xlim(0, 100)+finalPart

p4 <- firstPart +facet_grid(racepctAsianbin ~ .)+geom_jitter(na.rm=TRUE)+
  xlab("Racial match between the community\n and the police force")+ylab("CrimeScore")+
  ggtitle("Asian community")+theme_bw()+xlim(0, 100)+finalPart

CrimeScore and Racial Match Plots

grid.arrange(p1, p2, p3,p4, nrow=2,ncol=2, main = textGrob(paste("Crime Rate in ",userInput), vjust = 1, gp = gpar(fontface = "bold", cex = 1.5,col="slateblue4")))

plot of chunk unnamed-chunk-2

plot of chunk unnamed-chunk-4