Kaggle Challenge to classify 39 different crimes that occurred in SF in 12 years. https://www.kaggle.com/c/sf-crime
Load Required Packages
suppressWarnings( library(caret) )
suppressWarnings( library(ggplot2) )
suppressWarnings( library(lubridate) ) # to extract date, month etc
suppressWarnings( library(ggmap) ) # to get zipcode from longitude and latitude
suppressWarnings( library(dplyr) )
load data
# load data
file_location <- file.path("/Users","shruti","Desktop","WorkMiscellaneous","MachineLearning","SanFranciscoCrime/")
train_data <- read.csv(paste(file_location,"train.csv",sep=""))
test_data <- read.csv(paste(file_location,"test.csv",sep=""))
# data exploration
dim(train_data)
## [1] 878049 9
head(train_data)
## Dates Category Descript
## 1 2015-05-13 23:53:00 WARRANTS WARRANT ARREST
## 2 2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST
## 3 2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST
## 4 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO
## 5 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO
## 6 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO
## DayOfWeek PdDistrict Resolution Address X
## 1 Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.4259
## 2 Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.4259
## 3 Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.4244
## 4 Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.4270
## 5 Wednesday PARK NONE 100 Block of BRODERICK ST -122.4387
## 6 Wednesday INGLESIDE NONE 0 Block of TEDDY AV -122.4033
## Y
## 1 37.77460
## 2 37.77460
## 3 37.80041
## 4 37.80087
## 5 37.77154
## 6 37.71343
summary(train_data)
## Dates Category
## 2011-01-01 00:01:00: 185 LARCENY/THEFT :174900
## 2006-01-01 00:01:00: 136 OTHER OFFENSES:126182
## 2012-01-01 00:01:00: 94 NON-CRIMINAL : 92304
## 2006-01-01 12:00:00: 63 ASSAULT : 76876
## 2007-06-01 00:01:00: 61 DRUG/NARCOTIC : 53971
## 2006-06-01 00:01:00: 58 VEHICLE THEFT : 53781
## (Other) :877452 (Other) :300035
## Descript DayOfWeek
## GRAND THEFT FROM LOCKED AUTO : 60022 Friday :133734
## LOST PROPERTY : 31729 Monday :121584
## BATTERY : 27441 Saturday :126810
## STOLEN AUTOMOBILE : 26897 Sunday :116707
## DRIVERS LICENSE, SUSPENDED OR REVOKED: 26839 Thursday :125038
## WARRANT ARREST : 23754 Tuesday :124965
## (Other) :681367 Wednesday:129211
## PdDistrict Resolution
## SOUTHERN :157182 NONE :526790
## MISSION :119908 ARREST, BOOKED :206403
## NORTHERN :105296 ARREST, CITED : 77004
## BAYVIEW : 89431 LOCATED : 17101
## CENTRAL : 85460 PSYCHOPATHIC CASE: 14534
## TENDERLOIN: 81809 UNFOUNDED : 9585
## (Other) :238963 (Other) : 26632
## Address X Y
## 800 Block of BRYANT ST : 26533 Min. :-122.5 Min. :37.71
## 800 Block of MARKET ST : 6581 1st Qu.:-122.4 1st Qu.:37.75
## 2000 Block of MISSION ST: 5097 Median :-122.4 Median :37.78
## 1000 Block of POTRERO AV: 4063 Mean :-122.4 Mean :37.77
## 900 Block of MARKET ST : 3251 3rd Qu.:-122.4 3rd Qu.:37.78
## 0 Block of TURK ST : 3228 Max. :-120.5 Max. :90.00
## (Other) :829296
levels(train_data$Category)
## [1] "ARSON" "ASSAULT"
## [3] "BAD CHECKS" "BRIBERY"
## [5] "BURGLARY" "DISORDERLY CONDUCT"
## [7] "DRIVING UNDER THE INFLUENCE" "DRUG/NARCOTIC"
## [9] "DRUNKENNESS" "EMBEZZLEMENT"
## [11] "EXTORTION" "FAMILY OFFENSES"
## [13] "FORGERY/COUNTERFEITING" "FRAUD"
## [15] "GAMBLING" "KIDNAPPING"
## [17] "LARCENY/THEFT" "LIQUOR LAWS"
## [19] "LOITERING" "MISSING PERSON"
## [21] "NON-CRIMINAL" "OTHER OFFENSES"
## [23] "PORNOGRAPHY/OBSCENE MAT" "PROSTITUTION"
## [25] "RECOVERED VEHICLE" "ROBBERY"
## [27] "RUNAWAY" "SECONDARY CODES"
## [29] "SEX OFFENSES FORCIBLE" "SEX OFFENSES NON FORCIBLE"
## [31] "STOLEN PROPERTY" "SUICIDE"
## [33] "SUSPICIOUS OCC" "TREA"
## [35] "TRESPASS" "VANDALISM"
## [37] "VEHICLE THEFT" "WARRANTS"
## [39] "WEAPON LAWS"
table(train_data$Category)
##
## ARSON ASSAULT
## 1513 76876
## BAD CHECKS BRIBERY
## 406 289
## BURGLARY DISORDERLY CONDUCT
## 36755 4320
## DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC
## 2268 53971
## DRUNKENNESS EMBEZZLEMENT
## 4280 1166
## EXTORTION FAMILY OFFENSES
## 256 491
## FORGERY/COUNTERFEITING FRAUD
## 10609 16679
## GAMBLING KIDNAPPING
## 146 2341
## LARCENY/THEFT LIQUOR LAWS
## 174900 1903
## LOITERING MISSING PERSON
## 1225 25989
## NON-CRIMINAL OTHER OFFENSES
## 92304 126182
## PORNOGRAPHY/OBSCENE MAT PROSTITUTION
## 22 7484
## RECOVERED VEHICLE ROBBERY
## 3138 23000
## RUNAWAY SECONDARY CODES
## 1946 9985
## SEX OFFENSES FORCIBLE SEX OFFENSES NON FORCIBLE
## 4388 148
## STOLEN PROPERTY SUICIDE
## 4540 508
## SUSPICIOUS OCC TREA
## 31414 6
## TRESPASS VANDALISM
## 7326 44725
## VEHICLE THEFT WARRANTS
## 53781 42214
## WEAPON LAWS
## 8555
# missing data
sum(!complete.cases(train_data))
## [1] 0
dim(test_data)
## [1] 884262 7
head(test_data)
## Id Dates DayOfWeek PdDistrict Address
## 1 0 2015-05-10 23:59:00 Sunday BAYVIEW 2000 Block of THOMAS AV
## 2 1 2015-05-10 23:51:00 Sunday BAYVIEW 3RD ST / REVERE AV
## 3 2 2015-05-10 23:50:00 Sunday NORTHERN 2000 Block of GOUGH ST
## 4 3 2015-05-10 23:45:00 Sunday INGLESIDE 4700 Block of MISSION ST
## 5 4 2015-05-10 23:45:00 Sunday INGLESIDE 4700 Block of MISSION ST
## 6 5 2015-05-10 23:40:00 Sunday TARAVAL BROAD ST / CAPITOL AV
## X Y
## 1 -122.3996 37.73505
## 2 -122.3915 37.73243
## 3 -122.4260 37.79221
## 4 -122.4374 37.72141
## 5 -122.4374 37.72141
## 6 -122.4590 37.71317
Data Munging
# since test data does not contain "Descript", "Resolution", not using it for training the model
# function for Feature Extraction
FeatureExtraction <- function(dataset,zip){
year_of_crime <- as.factor(year(dataset$Dates))
month_of_crime <- as.factor(month(dataset$Dates))
date_of_crime <- as.factor(day(dataset$Dates))
hour_of_crime <- as.factor(hour(dataset$Dates))
new_dataframe <- data.frame(cbind(hour_of_crime,date_of_crime,month_of_crime,year_of_crime,droplevels(dataset[,c("DayOfWeek","PdDistrict","X","Y")])))
return(new_dataframe)
}
# Feature engineering: get zip code from latitude and longitude
# install.packages("zipcode")
library(zipcode)
data(zipcode)
sf_zipcodes <- subset(zipcode,city=="San Francisco",)
sf_zipcodes$latitude <- as.numeric(format(sf_zipcodes$latitude,digits=8))
sf_zipcodes$longitude <- as.numeric(format(sf_zipcodes$longitude,digits=8))
dim(sf_zipcodes)
## [1] 73 5
head(sf_zipcodes)
## zip city state latitude longitude
## 41315 94101 San Francisco CA 37.78483 -122.7278
## 41316 94102 San Francisco CA 37.77933 -122.4192
## 41317 94103 San Francisco CA 37.77233 -122.4109
## 41318 94104 San Francisco CA 37.79173 -122.4019
## 41319 94105 San Francisco CA 37.78923 -122.3957
## 41320 94106 San Francisco CA 37.78483 -122.7278
# function to convert latitude and longitude to zipcode
closest_zipcode <- function(latitude_longitude_matrix)
# latitude_longitude_matrix : input matrix containing latitude and longitude from training or test set
{
# find euclidean distance between user provided latitude and longitude and
# all latitudes, longitudes from sf_zipcodes and select the row from latter
# which has minimum distance. then extract its zip code
return(sf_zipcodes[which.min
(apply (sf_zipcodes[,c("latitude","longitude")],1,function(i)
{
dist( rbind(i,latitude_longitude_matrix) ) #euclidean distance
} )
),"zip"])
}
# Sanity Check: to validate some of the zipcodes with google api
train_data[1,]
## Dates Category Descript DayOfWeek PdDistrict
## 1 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN
## Resolution Address X Y
## 1 ARREST, BOOKED OAK ST / LAGUNA ST -122.4259 37.7746
closest_zipcode(c(train_data$Y[1],train_data$X[1]))
## [1] "94102"
revgeocode(c(train_data$X[1], train_data$Y[1]))
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?latlng=37.7745985956747,-122.425891675136&sensor=false
## [1] "391 Oak St, San Francisco, CA 94102, USA"
# zip_codes_training <- apply(train_data[,c("Y","X")],1,closest_zipcode)
# save(zip_codes_training,file=paste(file_location,"zip_codes_training.rda",sep=""))
load(paste(file_location,"zip_codes_training.rda",sep=""))
# zip_codes_test <- apply(test_data[,c("Y","X")],1,closest_zipcode)
# save(zip_codes_test,file=paste(file_location,"zip_codes_test.rda",sep=""))
# load(paste(file_location,"zip_codes_test.rda",sep=""))
# combining new features
training_features <- cbind( FeatureExtraction(train_data),zip=zip_codes_training, Category=train_data$Category )
#test_features <- FeatureExtraction(test_data,zip=zip_codes_test)
# save(training_features,file=paste(file_location,"training_features.rda",sep=""))
# save(test_features,file=paste(file_location,"test_features.rda",sep=""))
# load(paste(file_location,"training_features.rda",sep=""))
head(training_features)
## hour_of_crime date_of_crime month_of_crime year_of_crime DayOfWeek
## 1 23 13 5 2015 Wednesday
## 2 23 13 5 2015 Wednesday
## 3 23 13 5 2015 Wednesday
## 4 23 13 5 2015 Wednesday
## 5 23 13 5 2015 Wednesday
## 6 23 13 5 2015 Wednesday
## PdDistrict X Y zip Category
## 1 NORTHERN -122.4259 37.77460 94102 WARRANTS
## 2 NORTHERN -122.4259 37.77460 94102 OTHER OFFENSES
## 3 NORTHERN -122.4244 37.80041 94109 OTHER OFFENSES
## 4 NORTHERN -122.4270 37.80087 94109 LARCENY/THEFT
## 5 PARK -122.4387 37.77154 94117 LARCENY/THEFT
## 6 INGLESIDE -122.4033 37.71343 94134 LARCENY/THEFT
split data
set.seed(123)
training_index <- createDataPartition(training_features$Category,p=0.6,list=F)
training_set <- droplevels(training_features[training_index,])
test_set <- droplevels(training_features[-training_index,])
dim(training_set); dim(test_set)
## [1] 526842 10
## [1] 351207 10
exploratory data analysis
# Function to group data frame by specific columns
group_by_col <- function(dataset,...){
dataset %>%
group_by_(.dots = ...) %>%
summarise(count = n())
}
# Function to plot crime count with variation in different features
plot_by_col <- function(dataset,xcolname,ycolname){
par(mar=c(6,4,4,2),mgp=c(3,0.5,0),cex.axis=0.8,cex.main=0.8)
x_levels <- levels(dataset[[xcolname]])
x_level_count <- seq_along(x_levels)
boxplot(dataset[[ycolname]] ~ dataset[[xcolname]],
col=x_level_count,
ylim = c( min(dataset[[ycolname]]), max(dataset[[ycolname]]) ),
xaxt="n",
#xlab=xcolname,
ylab=c("Number of crimes"),
main = c(paste("Variation in crime with",xcolname))
)
axis(side=1, at=x_level_count, labels=x_levels, las=2)
}
# top most crimes
Category_hist <- training_set %>%
group_by(Category) %>%
summarise(count = n()) %>%
transform(Category = reorder(Category,-count))
Category_hist <- arrange(Category_hist, desc(count))
head(Category_hist)
## Category count
## 1 LARCENY/THEFT 104940
## 2 OTHER OFFENSES 75710
## 3 NON-CRIMINAL 55383
## 4 ASSAULT 46126
## 5 DRUG/NARCOTIC 32383
## 6 VEHICLE THEFT 32269
top10_crimes <- Category_hist[1:10,1]
top5_crimes <- Category_hist[1:5,1]
# To Do: check the following
# Category_hist <- group_by_col(training_set,"Category")
# Category_hist2 <- Category_hist %>%
# transform(Category = reorder(Category,-count))
# arrange(Category_hist, desc(count))
# top10_crimes <- Category_hist[1:10,1]
# top5_crimes <- Category_hist[1:5,1]
# Frequency/historgram of different crimes.
#ggplot(Category_hist2) +
ggplot(Category_hist) +
geom_bar(aes(x=Category, y=count,
color = Category, fill = Category),
stat="identity") +
coord_flip() +
theme(legend.position="None") +
ggtitle("Number of crimes in each category") +
ylab("Number of crimes") +
xlab("Category of crime")
# Variation in crime with day of week
data_day <- group_by_col(training_set,"DayOfWeek","year_of_crime","month_of_crime")
head(data_day)
## Source: local data frame [6 x 4]
## Groups: DayOfWeek, year_of_crime [1]
##
## DayOfWeek year_of_crime month_of_crime count
## (fctr) (fctr) (fctr) (int)
## 1 Friday 2003 1 525
## 2 Friday 2003 2 537
## 3 Friday 2003 3 552
## 4 Friday 2003 4 551
## 5 Friday 2003 5 765
## 6 Friday 2003 6 492
plot_by_col(data_day,"DayOfWeek","count")
# the graph indicates that maximum crimes occur on Friday and minimum on sunday
# Variation in crime with year
data_year <- group_by_col(training_set,"year_of_crime","month_of_crime")
#head(data_year)
plot_by_col(data_year,"year_of_crime","count")
# Variation in crime with month
data_month <- group_by_col(training_set,"month_of_crime","year_of_crime")
plot_by_col(data_month,"month_of_crime","count")
# the graph indicates that maximum crimes occur in October and least in Dec
# Variation in crime with hour of day
data_hour <- group_by_col(training_set,"hour_of_crime","year_of_crime","month_of_crime")
plot_by_col(data_hour,"hour_of_crime","count")
## Variation in crime with zipcode
data_zip <- group_by_col(training_set,"zip","year_of_crime","month_of_crime")
plot_by_col(data_zip,"zip","count")
# Function to group data frame by specific columns and Category of crime
group_category_by_col <- function(dataset,...){
x <- dataset %>%
subset(Category %in% top10_crimes) %>%
#subset(Category %in% top5_crimes) %>%
group_by_(.dots = ...) %>%
summarise(count = n())
# x$Category <- factor(x$Category,levels = top10_crimes)
# return(x)
}
# Function to plot crime count in top 10 crimes with variation in different features
plot_category_by_col <- function(dataset,xcolname,ycolname="count"){
ggplot(data=dataset, aes_string(x=xcolname, y=ycolname, fill="Category")) +
geom_boxplot() +
facet_wrap(~Category,ncol = 5)+
#facet_wrap wraps a 1d sequence of panels into 2d
theme(legend.position="None",
axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle( c(paste("Variations in crime by",xcolname)) )+
xlab(xcolname)+
ylab("Number of crime incidents")
}
# Group top 10 crimes by hour of crime
data_hour_category <- group_category_by_col(training_set,"hour_of_crime","Category","year_of_crime","month_of_crime")
data_hour_category$Category <- factor(data_hour_category$Category,levels = top10_crimes)
head(data_hour_category)
## Source: local data frame [6 x 5]
## Groups: hour_of_crime, Category, year_of_crime [1]
##
## hour_of_crime Category year_of_crime month_of_crime count
## (fctr) (fctr) (fctr) (fctr) (int)
## 1 0 ASSAULT 2003 1 12
## 2 0 ASSAULT 2003 2 25
## 3 0 ASSAULT 2003 3 19
## 4 0 ASSAULT 2003 4 12
## 5 0 ASSAULT 2003 5 13
## 6 0 ASSAULT 2003 6 17
# Variations in top 10 crimes by hour of crime
plot_category_by_col(data_hour_category,"hour_of_crime")
# Group top 10 crimes by year
data_year_category <- group_category_by_col(training_set,"year_of_crime","Category","month_of_crime")
data_year_category$Category <- factor(data_year_category$Category,levels = top10_crimes)
head(data_year_category)
## Source: local data frame [6 x 4]
## Groups: year_of_crime, Category [1]
##
## year_of_crime Category month_of_crime count
## (fctr) (fctr) (fctr) (int)
## 1 2003 ASSAULT 1 306
## 2 2003 ASSAULT 2 303
## 3 2003 ASSAULT 3 358
## 4 2003 ASSAULT 4 316
## 5 2003 ASSAULT 5 360
## 6 2003 ASSAULT 6 316
# Variations in top 10 crimes with different features
plot_category_by_col(data_year_category,"year_of_crime")
# Group top 10 crimes by day of week
data_day_category <- group_category_by_col(training_set,"DayOfWeek","Category","year_of_crime","month_of_crime")
data_day_category$Category <- factor(data_day_category$Category,levels = top10_crimes)
head(data_day_category)
## Source: local data frame [6 x 5]
## Groups: DayOfWeek, Category, year_of_crime [1]
##
## DayOfWeek Category year_of_crime month_of_crime count
## (fctr) (fctr) (fctr) (fctr) (int)
## 1 Friday ASSAULT 2003 1 47
## 2 Friday ASSAULT 2003 2 48
## 3 Friday ASSAULT 2003 3 50
## 4 Friday ASSAULT 2003 4 36
## 5 Friday ASSAULT 2003 5 66
## 6 Friday ASSAULT 2003 6 39
# Variations in top 10 crimes by day of week
plot_category_by_col(data_day_category,"DayOfWeek")
# Variations in top 10 crimes by zip code
data_zip_category <- group_category_by_col(training_set,"zip","Category","year_of_crime","month_of_crime")
data_zip_category$Category <- factor(data_zip_category$Category,levels = top10_crimes)
plot_category_by_col(data_zip_category,"zip")
# Variations in top 3 crimes by zip code
top3_crimes <- Category_hist[1:3,1]
data_zip_top3category <- training_set %>%
subset(Category %in% top3_crimes) %>%
group_by(zip,Category,year_of_crime,month_of_crime) %>%
summarise(count = n())
data_zip_top3category$Category <- factor(data_zip_top3category$Category,levels = top3_crimes)
plot_category_by_col(data_zip_top3category,"zip")
# Function to group data frame by category, then by a specific column and normalize all values for that column for each category.
normalized_group_by_col <- function(dataset,...){
dataset %>%
group_by_(.dots = ...) %>%
summarise(count = n()) %>%
# for each category of crime, calculate mean and standard deviation and normalize all values for that category.
mutate(normalized_count = (count-mean(count))/sd(count))
}
# function to generate normalized plots to check for trends.
# NOT WORKING
normalized_plot <- function(dataset,xcolname,ycolname="normalized_count"){
ggplot(data = subset(dataset, Category %in% top10_crimes),
aes(x=as.numeric(xcolname), y=normalized_count, fill=normalized_count)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks=seq_along(levels(xcolname)),labels=levels(xcolname))
xlab(xcolname)+
ylab("Normalized crime count")}
# pattern by month
norm_data_month <- normalized_group_by_col(training_set,"Category","month_of_crime")
ggplot(data = subset(norm_data_month, Category %in% top10_crimes),
aes(x=as.numeric(month_of_crime), y=normalized_count,color = Category)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks = 1:12, labels=c("Jan","Feb","Mar",
"Apr","May","Jun",
"Jul","Aug","Sep",
"Oct","Nov","Dec")) +
xlab("Months")+
ylab("Normalized crime count")
#theme(legend.position="None")
# pattern by hour
norm_data_hour <- normalized_group_by_col(training_set,"Category","hour_of_crime")
ggplot(data = subset(norm_data_hour, Category %in% top10_crimes),
aes(x=as.numeric(hour_of_crime), y=normalized_count,color = Category)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks = 1:24, labels=c(0:23)) +
xlab("Hour")+
ylab("Normalized crime count")
# pattern by week
norm_data_week <- normalized_group_by_col(training_set,"Category","DayOfWeek")
ggplot(data = subset(norm_data_week, Category %in% top10_crimes),
aes(x=as.numeric(DayOfWeek), y=normalized_count,color = Category)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks = 1:7, labels=c(levels(norm_data_week$DayOfWeek))) +
xlab("Day Of Week")+
ylab("Normalized crime count")
# "Assault" crime seems to be high on sunday. Also certain crimes cluster and follow similar trend over days of the week
# pattern by year
norm_data_year <- normalized_group_by_col(training_set,"Category","year_of_crime")
ggplot(data = subset(norm_data_year, Category %in% top10_crimes),
aes(x=as.numeric(year_of_crime), y=normalized_count,color = Category)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks = seq_along(levels(norm_data_year$year_of_crime)), labels=c(levels(norm_data_year$year_of_crime))) +
xlab("year")+
ylab("Normalized crime count")
# pattern by zip
norm_data_zip <- normalized_group_by_col(training_set,"Category","zip")
ggplot(data = subset(norm_data_zip, Category %in% top10_crimes),
aes(x=as.numeric(zip), y=normalized_count,color = Category)) +
geom_line()+
geom_point()+
scale_x_continuous(breaks = seq_along(levels(norm_data_zip$zip)), labels=c(levels(norm_data_zip$zip))) +
xlab("zip code")+
ylab("Normalized crime count")