Setup
tmp.library.list <- c("haven", "zoo", "fUnitRoots", "tseries", "urca", "lmtest", "forecast", "data.table", "readxl","reshape", "quantmod", "ggplot2", "reshape2", "plyr","scales", "RColorBrewer")
for (i in 1:length(tmp.library.list)) {
if (!tmp.library.list[i] %in% rownames(installed.packages())) {
install.packages(tmp.library.list[i])
}
library(tmp.library.list[i], character.only = TRUE)
}
rm(tmp.library.list)
Data Import
incidents.2017 <- paste0(RawData,"/SFIncidents2017.csv")
incidents.2018 <- paste0(RawData,"/SFIncidents2018.csv")
sf.population <- paste0(RawData,"/SFpopulation_by_pddistrict.csv")
incidents2017 <- data.table(read.csv(incidents.2017, sep = ',', stringsAsFactors = F))
incidents2018 <- data.table(read.csv(incidents.2018, sep = ',', stringsAsFactors = F))
sfpopulation <- data.table(read.csv(sf.population, sep = ',', stringsAsFactors = F))
str(incidents2018)
## Classes 'data.table' and 'data.frame': 46668 obs. of 13 variables:
## $ IncidntNum: int 189014013 186095971 186084021 186072593 186063279 186042336 186036248 186034026 186030579 186029102 ...
## $ Category : chr "NON-CRIMINAL" "OTHER OFFENSES" "FRAUD" "NON-CRIMINAL" ...
## $ Descript : chr "STAY AWAY OR COURT ORDER, NON-DV RELATED" "OBSCENE PHONE CALLS(S)" "CREDIT CARD, THEFT BY USE OF" "LOST PROPERTY" ...
## $ DayOfWeek : chr "Monday" "Monday" "Monday" "Monday" ...
## $ Date : chr "01/01/2018" "01/01/2018" "01/01/2018" "01/01/2018" ...
## $ Time : chr "22:37" "08:00" "00:01" "17:00" ...
## $ PdDistrict: chr "TARAVAL" "TENDERLOIN" "NORTHERN" "MISSION" ...
## $ Resolution: chr "ARREST, BOOKED" "NONE" "NONE" "NONE" ...
## $ Address : chr "3500 Block of 19TH AV" "500 Block of EDDY ST" "1200 Block of EDDY ST" "2100 Block of MARKET ST" ...
## $ X : num -122 -122 -122 -122 -122 ...
## $ Y : num 37.7 37.8 37.8 37.8 37.8 ...
## $ Location : chr "(37.723975823482135, -122.47482947577244)" "(37.78335703909351, -122.41671073384802)" "(37.78189594886033, -122.42822330317604)" "(37.766652147476215, -122.4297889869883)" ...
## $ PdId : num 1.89e+13 1.86e+13 1.86e+13 1.86e+13 1.86e+13 ...
## - attr(*, ".internal.selfref")=<externalptr>
# check if variables from both sets are the same
names(incidents2017) <- tolower(names(incidents2017))
names(incidents2018) <- tolower(names(incidents2018))
names(incidents2017) == names(incidents2018)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# convert date to date format
incidents2017[, date := as.Date(date, format = "%m/%d/%Y")]
incidents2018[, date := as.Date(date, format = "%m/%d/%Y")]
# combine 2 data sets since we want both 2017 & 2018 data)
incidents <- rbind(incidents2017, incidents2018)
Cleaning
table(incidents$category)
##
## ARSON ASSAULT
## 424 18254
## BAD CHECKS BRIBERY
## 33 84
## BURGLARY DISORDERLY CONDUCT
## 7955 494
## DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC
## 391 4790
## DRUNKENNESS EMBEZZLEMENT
## 423 223
## EXTORTION FAMILY OFFENSES
## 75 57
## FORGERY/COUNTERFEITING FRAUD
## 704 3360
## GAMBLING KIDNAPPING
## 23 280
## LARCENY/THEFT LIQUOR LAWS
## 61458 91
## LOITERING MISSING PERSON
## 48 5751
## NON-CRIMINAL OTHER OFFENSES
## 22615 23884
## PORNOGRAPHY/OBSCENE MAT PROSTITUTION
## 9 575
## RECOVERED VEHICLE ROBBERY
## 718 4422
## RUNAWAY SECONDARY CODES
## 335 2681
## SEX OFFENSES, FORCIBLE SEX OFFENSES, NON FORCIBLE
## 1368 55
## STOLEN PROPERTY SUICIDE
## 1083 102
## SUSPICIOUS OCC TREA
## 7833 1
## TRESPASS VANDALISM
## 2127 12561
## VEHICLE THEFT WARRANTS
## 7207 6638
## WEAPON LAWS
## 2309
# Take criminal cases only
non.criminal <- c("FAMILY OFFENSES","NON-CRIMINAL","MISSING PERSON"
,"BAD CHECKS","RUNAWAY","RECOVERED VEHICLE", "TREA")
incidents.crime <- incidents[-grep(paste(non.criminal, collapse = "|")
,category),]
Descriptive Analysis
Which district has the most criminal incidents per capita?
str(incidents.crime)
## Classes 'data.table' and 'data.frame': 171931 obs. of 13 variables:
## $ incidntnum: int 186043061 186043061 186006116 180080782 180002015 180002015 180001675 179000044 176111991 176025386 ...
## $ category : chr "LARCENY/THEFT" "LARCENY/THEFT" "LARCENY/THEFT" "SEX OFFENSES, FORCIBLE" ...
## $ descript : chr "GRAND THEFT OF PROPERTY" "PETTY THEFT OF PROPERTY" "GRAND THEFT OF PROPERTY" "FORCIBLE RAPE, BODILY FORCE" ...
## $ dayofweek : chr "Sunday" "Sunday" "Sunday" "Sunday" ...
## $ date : Date, format: "2017-01-01" "2017-01-01" ...
## $ time : chr "00:01" "00:01" "09:00" "00:01" ...
## $ pddistrict: chr "NORTHERN" "NORTHERN" "NORTHERN" "INGLESIDE" ...
## $ resolution: chr "NONE" "NONE" "NONE" "NONE" ...
## $ address : chr "0 Block of HERMANN ST" "0 Block of HERMANN ST" "HAIGHT ST / STEINER ST" "100 Block of DUNCAN ST" ...
## $ x : num -122 -122 -122 -122 -122 ...
## $ y : num 37.8 37.8 37.8 37.7 37.8 ...
## $ location : chr "(37.77070812317366, -122.42596301730948)" "(37.77070812317366, -122.42596301730948)" "(37.77189574196664, -122.43209555515361)" "(37.74636244735492, -122.4234838326707)" ...
## $ pdid : num 1.86e+13 1.86e+13 1.86e+13 1.80e+13 1.80e+13 ...
## - attr(*, ".internal.selfref")=<externalptr>
incidents.bydistrict <- setDT(data.frame(table(incidents.crime$pddistrict)))
setnames(incidents.bydistrict, c("Var1", "Freq"), c("pddistrict", "frequency"))
## merge with population data
incidents.bydistrict[sfpopulation, on = "pddistrict", population := i.population]
incidents.bydistrict[, crime.per.capita := frequency/population]
ggplot(data=incidents.bydistrict,
aes(reorder(pddistrict, crime.per.capita),crime.per.capita)) +
geom_bar(stat="identity") +
coord_flip() +
labs(y="Numbers of Crimes per Capita", x="Police Department District"
, title="Crimes per Capita in San Francisco (by District)") +
theme_bw()

How many of the incidents have been solved? In which districts? Which categories?
incidents.crime$condition <- ifelse(grepl("UNFOUNDED", incidents.crime$resolution
,ignore.case = T), "unsolved"
,ifelse(grepl("NONE", incidents.crime$resolution
, ignore.case = T ), "unsolved", "solved"))
# 37.34%
table(incidents.crime$condition)
##
## solved unsolved
## 46746 125185
How many incidents/crimes have been solved?
crime.condition <- setDT(data.frame(table(incidents.crime$condition
, incidents.crime$pddistrict)))
setnames(crime.condition, c("Var1", "Var2", "Freq")
, c("condition", "pddistrict", "frequency"))
crime.condition[sfpopulation, on = "pddistrict", population := i.population]
crime.condition[, crime.per.capita := frequency/population]
crime.condition$condition <- factor(crime.condition$condition
, levels=c("unsolved","solved"))
ggplot(data=crime.condition,
aes(reorder(pddistrict, crime.per.capita),crime.per.capita, fill = condition)) +
geom_bar(stat="identity") +
coord_flip() +
labs(y="Numbers of Crimes per Capita", x="Police Department District"
, title="Crimes per Capita in San Francisco (by District)") +
theme_bw() +
scale_fill_brewer(palette = "Paired")

Which categories?
table(incidents.crime$category)
##
## ARSON ASSAULT
## 424 18254
## BRIBERY BURGLARY
## 84 7955
## DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE
## 494 391
## DRUG/NARCOTIC DRUNKENNESS
## 4790 423
## EMBEZZLEMENT EXTORTION
## 223 75
## FORGERY/COUNTERFEITING FRAUD
## 704 3360
## GAMBLING KIDNAPPING
## 23 280
## LARCENY/THEFT LIQUOR LAWS
## 61458 91
## LOITERING OTHER OFFENSES
## 48 23884
## PORNOGRAPHY/OBSCENE MAT PROSTITUTION
## 9 575
## ROBBERY SECONDARY CODES
## 4422 2681
## SEX OFFENSES, FORCIBLE SEX OFFENSES, NON FORCIBLE
## 1368 55
## STOLEN PROPERTY SUICIDE
## 1083 102
## SUSPICIOUS OCC TRESPASS
## 7833 2127
## VANDALISM VEHICLE THEFT
## 12561 7207
## WARRANTS WEAPON LAWS
## 6638 2309
crime.category <- setDT(data.frame(table(incidents.crime$condition
, incidents.crime$category)))
setnames(crime.category, c("Var1", "Var2", "Freq")
, c("condition", "category", "frequency"))
crime.category$condition <- factor(crime.category$condition
, levels=c("unsolved","solved"))
# There are many categories. Here we plot only the largest frequency categories
crime.category[, sum.freq := sum(frequency), by = category]
crime.category.high <- crime.category[sum.freq > median(crime.category$sum.freq)]
ggplot(data=crime.category.high,
aes(reorder(category, frequency),frequency, fill = condition)) +
geom_bar(stat="identity") +
coord_flip() +
labs(y="Numbers of Crimes", x="Category Name"
, title="Crimes by Category in San Francisco") +
theme_bw() +
scale_fill_brewer(palette = "Paired")

crime.category.high[, percentage := (frequency/sum.freq)*100]
ggplot(data=crime.category.high,
aes(reorder(category, percentage),percentage, fill = condition)) +
geom_bar(stat="identity") +
coord_flip() +
labs(y="Proportion of Crimes", x="Category Name"
, title="Crimes by Condition in San Francisco") +
theme_bw() +
scale_fill_brewer(palette = "Paired")

Do some criminal categories have significant differences in freq among districts?
# Here I pick some particular crimes to investigate
crime.district <- setDT(data.frame(table(incidents.crime$condition
, incidents.crime$category
, incidents.crime$pddistrict)))
setnames(crime.district, c("Var1", "Var2", "Var3", "Freq")
, c("condition","category", "pddistrict", "frequency"))
# Take "unsolved" criminal cases only
crimes <- c("VEHICLE THEFT","LARCENY/THEFT","VANDALISM"
,"ASSAULT", "SUSPICIOUS OCC")
crime.district <- crime.district[grep(paste(crimes, collapse = "|")
,category),]
crime.district[ , sum.freq := sum(frequency), by = pddistrict]
crime.district[, percentage := (frequency/sum.freq)*100]
ggplot(crime.district,aes(x=pddistrict,y=frequency,fill=category))+
geom_bar(stat="identity",position="dodge")+
scale_fill_brewer(palette = "RdYlBu") +
labs(y="Count", x="District"
, title="Particular Crimes by District in San Francisco") +
theme_bw() +
theme(text = element_text(size=15),
axis.text.x = element_text(angle=90, hjust=1))

What day of the week, month of the year has the most criminal incidents?
incidents.crime[, frequency := 1]
crime.summary <- incidents.crime[ ,sum(frequency), by= list(date, dayofweek)]
crime.summary <- crime.summary[date < "2018-05-01"]
setnames(crime.summary, "V1", "crime")
ggplot(crime.summary , aes(date, crime)) +
geom_line() +
scale_y_continuous() +
scale_x_date(date_breaks = "1 month", date_labels = "%Y/%m") +
labs(y="Count", x="Time"
, title="Crime in San Francisco (time series)") +
theme_bw() +
theme(text = element_text(size=15),
axis.text.x = element_text(angle=70, hjust=1))

# average crime (monthly) blox plot in 2017
monthOrder <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
crime.summary$month <- factor(format(crime.summary$date, "%b"), levels = monthOrder)
crime.summary.2017 <- crime.summary[date < "2018-01-01"]
crime.summary.2017 <- crime.summary[, mean.crime := mean(crime), by = month]
ggplot(crime.summary.2017, aes(month, mean.crime, group = 1)) +
geom_line()+
geom_point() +
ggtitle("Average Crime in San Francisco by Month in 2017") +
theme_bw()

crime.summary$dayofweek <- factor(crime.summary$dayofweek, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday", "Sunday"))
# day of week
ggplot(crime.summary, aes(dayofweek, crime)) +
geom_boxplot() + stat_boxplot(geom ='errorbar') +
ggtitle("Crime in San Francisco by Weekday")
