Setup

tmp.library.list <- c("haven", "zoo", "fUnitRoots", "tseries", "urca", "lmtest", "forecast", "data.table", "readxl","reshape", "quantmod", "ggplot2", "reshape2", "plyr","scales", "RColorBrewer")
for (i in 1:length(tmp.library.list)) {
  if (!tmp.library.list[i] %in% rownames(installed.packages())) {
    install.packages(tmp.library.list[i])
  }
  library(tmp.library.list[i], character.only = TRUE)
}
rm(tmp.library.list)

Data Import

incidents.2017 <- paste0(RawData,"/SFIncidents2017.csv")
incidents.2018 <- paste0(RawData,"/SFIncidents2018.csv")
sf.population <- paste0(RawData,"/SFpopulation_by_pddistrict.csv")

incidents2017 <- data.table(read.csv(incidents.2017, sep = ',', stringsAsFactors = F))
incidents2018 <- data.table(read.csv(incidents.2018, sep = ',', stringsAsFactors = F))
sfpopulation <- data.table(read.csv(sf.population, sep = ',', stringsAsFactors = F))

str(incidents2018)
## Classes 'data.table' and 'data.frame':   46668 obs. of  13 variables:
##  $ IncidntNum: int  189014013 186095971 186084021 186072593 186063279 186042336 186036248 186034026 186030579 186029102 ...
##  $ Category  : chr  "NON-CRIMINAL" "OTHER OFFENSES" "FRAUD" "NON-CRIMINAL" ...
##  $ Descript  : chr  "STAY AWAY OR COURT ORDER, NON-DV RELATED" "OBSCENE PHONE CALLS(S)" "CREDIT CARD, THEFT BY USE OF" "LOST PROPERTY" ...
##  $ DayOfWeek : chr  "Monday" "Monday" "Monday" "Monday" ...
##  $ Date      : chr  "01/01/2018" "01/01/2018" "01/01/2018" "01/01/2018" ...
##  $ Time      : chr  "22:37" "08:00" "00:01" "17:00" ...
##  $ PdDistrict: chr  "TARAVAL" "TENDERLOIN" "NORTHERN" "MISSION" ...
##  $ Resolution: chr  "ARREST, BOOKED" "NONE" "NONE" "NONE" ...
##  $ Address   : chr  "3500 Block of 19TH AV" "500 Block of EDDY ST" "1200 Block of EDDY ST" "2100 Block of MARKET ST" ...
##  $ X         : num  -122 -122 -122 -122 -122 ...
##  $ Y         : num  37.7 37.8 37.8 37.8 37.8 ...
##  $ Location  : chr  "(37.723975823482135, -122.47482947577244)" "(37.78335703909351, -122.41671073384802)" "(37.78189594886033, -122.42822330317604)" "(37.766652147476215, -122.4297889869883)" ...
##  $ PdId      : num  1.89e+13 1.86e+13 1.86e+13 1.86e+13 1.86e+13 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# check if variables from both sets are the same
names(incidents2017) <- tolower(names(incidents2017))
names(incidents2018) <- tolower(names(incidents2018))
names(incidents2017) == names(incidents2018)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# convert date to date format
incidents2017[, date := as.Date(date, format = "%m/%d/%Y")]
incidents2018[, date := as.Date(date, format = "%m/%d/%Y")]

# combine 2 data sets since we want both 2017 & 2018 data)
incidents <- rbind(incidents2017, incidents2018)

Cleaning

table(incidents$category)
## 
##                       ARSON                     ASSAULT 
##                         424                       18254 
##                  BAD CHECKS                     BRIBERY 
##                          33                          84 
##                    BURGLARY          DISORDERLY CONDUCT 
##                        7955                         494 
## DRIVING UNDER THE INFLUENCE               DRUG/NARCOTIC 
##                         391                        4790 
##                 DRUNKENNESS                EMBEZZLEMENT 
##                         423                         223 
##                   EXTORTION             FAMILY OFFENSES 
##                          75                          57 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                         704                        3360 
##                    GAMBLING                  KIDNAPPING 
##                          23                         280 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                       61458                          91 
##                   LOITERING              MISSING PERSON 
##                          48                        5751 
##                NON-CRIMINAL              OTHER OFFENSES 
##                       22615                       23884 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                           9                         575 
##           RECOVERED VEHICLE                     ROBBERY 
##                         718                        4422 
##                     RUNAWAY             SECONDARY CODES 
##                         335                        2681 
##      SEX OFFENSES, FORCIBLE  SEX OFFENSES, NON FORCIBLE 
##                        1368                          55 
##             STOLEN PROPERTY                     SUICIDE 
##                        1083                         102 
##              SUSPICIOUS OCC                        TREA 
##                        7833                           1 
##                    TRESPASS                   VANDALISM 
##                        2127                       12561 
##               VEHICLE THEFT                    WARRANTS 
##                        7207                        6638 
##                 WEAPON LAWS 
##                        2309
# Take criminal cases only
non.criminal <- c("FAMILY OFFENSES","NON-CRIMINAL","MISSING PERSON"
                  ,"BAD CHECKS","RUNAWAY","RECOVERED VEHICLE", "TREA")

incidents.crime <- incidents[-grep(paste(non.criminal, collapse = "|")
                                   ,category),]

Descriptive Analysis

Which district has the most criminal incidents per capita?

str(incidents.crime)
## Classes 'data.table' and 'data.frame':   171931 obs. of  13 variables:
##  $ incidntnum: int  186043061 186043061 186006116 180080782 180002015 180002015 180001675 179000044 176111991 176025386 ...
##  $ category  : chr  "LARCENY/THEFT" "LARCENY/THEFT" "LARCENY/THEFT" "SEX OFFENSES, FORCIBLE" ...
##  $ descript  : chr  "GRAND THEFT OF PROPERTY" "PETTY THEFT OF PROPERTY" "GRAND THEFT OF PROPERTY" "FORCIBLE RAPE, BODILY FORCE" ...
##  $ dayofweek : chr  "Sunday" "Sunday" "Sunday" "Sunday" ...
##  $ date      : Date, format: "2017-01-01" "2017-01-01" ...
##  $ time      : chr  "00:01" "00:01" "09:00" "00:01" ...
##  $ pddistrict: chr  "NORTHERN" "NORTHERN" "NORTHERN" "INGLESIDE" ...
##  $ resolution: chr  "NONE" "NONE" "NONE" "NONE" ...
##  $ address   : chr  "0 Block of HERMANN ST" "0 Block of HERMANN ST" "HAIGHT ST / STEINER ST" "100 Block of DUNCAN ST" ...
##  $ x         : num  -122 -122 -122 -122 -122 ...
##  $ y         : num  37.8 37.8 37.8 37.7 37.8 ...
##  $ location  : chr  "(37.77070812317366, -122.42596301730948)" "(37.77070812317366, -122.42596301730948)" "(37.77189574196664, -122.43209555515361)" "(37.74636244735492, -122.4234838326707)" ...
##  $ pdid      : num  1.86e+13 1.86e+13 1.86e+13 1.80e+13 1.80e+13 ...
##  - attr(*, ".internal.selfref")=<externalptr>
incidents.bydistrict <- setDT(data.frame(table(incidents.crime$pddistrict)))

setnames(incidents.bydistrict, c("Var1", "Freq"), c("pddistrict", "frequency"))

## merge with population data
incidents.bydistrict[sfpopulation, on = "pddistrict", population := i.population]

incidents.bydistrict[, crime.per.capita := frequency/population]

ggplot(data=incidents.bydistrict, 
       aes(reorder(pddistrict, crime.per.capita),crime.per.capita)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes per Capita", x="Police Department District"
       , title="Crimes per Capita in San Francisco (by District)")  + 
  theme_bw()

How many of the incidents have been solved? In which districts? Which categories?

incidents.crime$condition <- ifelse(grepl("UNFOUNDED", incidents.crime$resolution
                                          ,ignore.case = T), "unsolved"
                            ,ifelse(grepl("NONE", incidents.crime$resolution
                                          , ignore.case = T ), "unsolved", "solved"))
# 37.34%
table(incidents.crime$condition) 
## 
##   solved unsolved 
##    46746   125185

How many incidents/crimes have been solved?

crime.condition <- setDT(data.frame(table(incidents.crime$condition
                                          , incidents.crime$pddistrict)))

setnames(crime.condition, c("Var1", "Var2", "Freq")
         , c("condition", "pddistrict", "frequency"))

crime.condition[sfpopulation, on = "pddistrict", population := i.population]

crime.condition[, crime.per.capita := frequency/population]

crime.condition$condition <- factor(crime.condition$condition
                                    , levels=c("unsolved","solved"))

ggplot(data=crime.condition, 
       aes(reorder(pddistrict, crime.per.capita),crime.per.capita, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes per Capita", x="Police Department District"
       , title="Crimes per Capita in San Francisco (by District)") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") 

Which categories?

table(incidents.crime$category)
## 
##                       ARSON                     ASSAULT 
##                         424                       18254 
##                     BRIBERY                    BURGLARY 
##                          84                        7955 
##          DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE 
##                         494                         391 
##               DRUG/NARCOTIC                 DRUNKENNESS 
##                        4790                         423 
##                EMBEZZLEMENT                   EXTORTION 
##                         223                          75 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                         704                        3360 
##                    GAMBLING                  KIDNAPPING 
##                          23                         280 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                       61458                          91 
##                   LOITERING              OTHER OFFENSES 
##                          48                       23884 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                           9                         575 
##                     ROBBERY             SECONDARY CODES 
##                        4422                        2681 
##      SEX OFFENSES, FORCIBLE  SEX OFFENSES, NON FORCIBLE 
##                        1368                          55 
##             STOLEN PROPERTY                     SUICIDE 
##                        1083                         102 
##              SUSPICIOUS OCC                    TRESPASS 
##                        7833                        2127 
##                   VANDALISM               VEHICLE THEFT 
##                       12561                        7207 
##                    WARRANTS                 WEAPON LAWS 
##                        6638                        2309
crime.category <- setDT(data.frame(table(incidents.crime$condition
                                          , incidents.crime$category)))
setnames(crime.category, c("Var1", "Var2", "Freq")
         , c("condition", "category", "frequency"))

crime.category$condition <- factor(crime.category$condition
                                    , levels=c("unsolved","solved"))

# There are many categories. Here we plot only the largest frequency categories
crime.category[, sum.freq := sum(frequency), by = category]

crime.category.high <- crime.category[sum.freq > median(crime.category$sum.freq)]

ggplot(data=crime.category.high, 
       aes(reorder(category, frequency),frequency, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes", x="Category Name"
       , title="Crimes by Category in San Francisco") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired")

crime.category.high[, percentage := (frequency/sum.freq)*100]

ggplot(data=crime.category.high, 
       aes(reorder(category, percentage),percentage, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Proportion of Crimes", x="Category Name"
       , title="Crimes by Condition in San Francisco") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired") 

Do some criminal categories have significant differences in freq among districts?

# Here I pick some particular crimes to investigate
crime.district <- setDT(data.frame(table(incidents.crime$condition
                                         , incidents.crime$category
                                         , incidents.crime$pddistrict)))
setnames(crime.district, c("Var1", "Var2", "Var3", "Freq")
         , c("condition","category", "pddistrict", "frequency"))

# Take "unsolved" criminal cases only
crimes <- c("VEHICLE THEFT","LARCENY/THEFT","VANDALISM"
            ,"ASSAULT", "SUSPICIOUS OCC")

crime.district <- crime.district[grep(paste(crimes, collapse = "|")
                                      ,category),]

crime.district[ , sum.freq := sum(frequency), by = pddistrict]
crime.district[, percentage := (frequency/sum.freq)*100]


ggplot(crime.district,aes(x=pddistrict,y=frequency,fill=category))+
  geom_bar(stat="identity",position="dodge")+
  scale_fill_brewer(palette = "RdYlBu")  +
  labs(y="Count", x="District"
       , title="Particular Crimes by District in San Francisco") +
  theme_bw() +
  theme(text = element_text(size=15),
        axis.text.x = element_text(angle=90, hjust=1)) 

What day of the week, month of the year has the most criminal incidents?

incidents.crime[, frequency := 1]

crime.summary <- incidents.crime[ ,sum(frequency), by= list(date, dayofweek)]

crime.summary <- crime.summary[date < "2018-05-01"]

setnames(crime.summary, "V1", "crime")

ggplot(crime.summary , aes(date, crime)) + 
  geom_line() + 
  scale_y_continuous() +
  scale_x_date(date_breaks = "1 month", date_labels = "%Y/%m") +
  labs(y="Count", x="Time"
       , title="Crime in San Francisco (time series)") +
  theme_bw() +
  theme(text = element_text(size=15),
        axis.text.x = element_text(angle=70, hjust=1)) 

# average crime (monthly) blox plot in 2017

monthOrder <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
crime.summary$month <- factor(format(crime.summary$date, "%b"), levels = monthOrder)


crime.summary.2017 <- crime.summary[date < "2018-01-01"]
crime.summary.2017 <- crime.summary[, mean.crime := mean(crime), by = month]

ggplot(crime.summary.2017, aes(month, mean.crime, group = 1)) +
  geom_line()+
  geom_point() +
  ggtitle("Average Crime in San Francisco by Month in 2017") +
  theme_bw()

crime.summary$dayofweek <- factor(crime.summary$dayofweek, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday", "Sunday"))

# day of week

ggplot(crime.summary, aes(dayofweek, crime)) +
  geom_boxplot() + stat_boxplot(geom ='errorbar') + 
  ggtitle("Crime in San Francisco by Weekday")