Setup

tmp.library.list <- c("haven", "zoo", "fUnitRoots", "tseries", "urca", "lmtest", "forecast", "data.table", "readxl","reshape", "quantmod", "ggplot2", "reshape2", "plyr","scales", "RColorBrewer")
for (i in 1:length(tmp.library.list)) {
  if (!tmp.library.list[i] %in% rownames(installed.packages())) {
    install.packages(tmp.library.list[i])
  }
  library(tmp.library.list[i], character.only = TRUE)
}
rm(tmp.library.list)

Data Import

incidents.2017 <- paste0(RawData,"/SFIncidents2017.csv")
incidents.2018 <- paste0(RawData,"/SFIncidents2018.csv")
sf.population <- paste0(RawData,"/SFpopulation_by_pddistrict.csv")

incidents2017 <- data.table(read.csv(incidents.2017, sep = ',', stringsAsFactors = F))
incidents2018 <- data.table(read.csv(incidents.2018, sep = ',', stringsAsFactors = F))
sfpopulation <- data.table(read.csv(sf.population, sep = ',', stringsAsFactors = F))

str(incidents2018)

## Classes 'data.table' and 'data.frame':   46668 obs. of  13 variables:
##  $ IncidntNum: int  189014013 186095971 186084021 186072593 186063279 186042336 186036248 186034026 186030579 186029102 ...
##  $ Category  : chr  "NON-CRIMINAL" "OTHER OFFENSES" "FRAUD" "NON-CRIMINAL" ...
##  $ Descript  : chr  "STAY AWAY OR COURT ORDER, NON-DV RELATED" "OBSCENE PHONE CALLS(S)" "CREDIT CARD, THEFT BY USE OF" "LOST PROPERTY" ...
##  $ DayOfWeek : chr  "Monday" "Monday" "Monday" "Monday" ...
##  $ Date      : chr  "01/01/2018" "01/01/2018" "01/01/2018" "01/01/2018" ...
##  $ Time      : chr  "22:37" "08:00" "00:01" "17:00" ...
##  $ PdDistrict: chr  "TARAVAL" "TENDERLOIN" "NORTHERN" "MISSION" ...
##  $ Resolution: chr  "ARREST, BOOKED" "NONE" "NONE" "NONE" ...
##  $ Address   : chr  "3500 Block of 19TH AV" "500 Block of EDDY ST" "1200 Block of EDDY ST" "2100 Block of MARKET ST" ...
##  $ X         : num  -122 -122 -122 -122 -122 ...
##  $ Y         : num  37.7 37.8 37.8 37.8 37.8 ...
##  $ Location  : chr  "(37.723975823482135, -122.47482947577244)" "(37.78335703909351, -122.41671073384802)" "(37.78189594886033, -122.42822330317604)" "(37.766652147476215, -122.4297889869883)" ...
##  $ PdId      : num  1.89e+13 1.86e+13 1.86e+13 1.86e+13 1.86e+13 ...
##  - attr(*, ".internal.selfref")=<externalptr>

# check if variables from both sets are the same
names(incidents2017) <- tolower(names(incidents2017))
names(incidents2018) <- tolower(names(incidents2018))
names(incidents2017) == names(incidents2018)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

# convert date to date format
incidents2017[, date := as.Date(date, format = "%m/%d/%Y")]
incidents2018[, date := as.Date(date, format = "%m/%d/%Y")]

# combine 2 data sets since we want both 2017 & 2018 data)
incidents <- rbind(incidents2017, incidents2018)

Cleaning

table(incidents$category)

## 
##                       ARSON                     ASSAULT 
##                         424                       18254 
##                  BAD CHECKS                     BRIBERY 
##                          33                          84 
##                    BURGLARY          DISORDERLY CONDUCT 
##                        7955                         494 
## DRIVING UNDER THE INFLUENCE               DRUG/NARCOTIC 
##                         391                        4790 
##                 DRUNKENNESS                EMBEZZLEMENT 
##                         423                         223 
##                   EXTORTION             FAMILY OFFENSES 
##                          75                          57 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                         704                        3360 
##                    GAMBLING                  KIDNAPPING 
##                          23                         280 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                       61458                          91 
##                   LOITERING              MISSING PERSON 
##                          48                        5751 
##                NON-CRIMINAL              OTHER OFFENSES 
##                       22615                       23884 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                           9                         575 
##           RECOVERED VEHICLE                     ROBBERY 
##                         718                        4422 
##                     RUNAWAY             SECONDARY CODES 
##                         335                        2681 
##      SEX OFFENSES, FORCIBLE  SEX OFFENSES, NON FORCIBLE 
##                        1368                          55 
##             STOLEN PROPERTY                     SUICIDE 
##                        1083                         102 
##              SUSPICIOUS OCC                        TREA 
##                        7833                           1 
##                    TRESPASS                   VANDALISM 
##                        2127                       12561 
##               VEHICLE THEFT                    WARRANTS 
##                        7207                        6638 
##                 WEAPON LAWS 
##                        2309

# Take criminal cases only
non.criminal <- c("FAMILY OFFENSES","NON-CRIMINAL","MISSING PERSON"
                  ,"BAD CHECKS","RUNAWAY","RECOVERED VEHICLE", "TREA")

incidents.crime <- incidents[-grep(paste(non.criminal, collapse = "|")
                                   ,category),]

Descriptive Analysis

Which district has the most criminal incidents per capita?

str(incidents.crime)

## Classes 'data.table' and 'data.frame':   171931 obs. of  13 variables:
##  $ incidntnum: int  186043061 186043061 186006116 180080782 180002015 180002015 180001675 179000044 176111991 176025386 ...
##  $ category  : chr  "LARCENY/THEFT" "LARCENY/THEFT" "LARCENY/THEFT" "SEX OFFENSES, FORCIBLE" ...
##  $ descript  : chr  "GRAND THEFT OF PROPERTY" "PETTY THEFT OF PROPERTY" "GRAND THEFT OF PROPERTY" "FORCIBLE RAPE, BODILY FORCE" ...
##  $ dayofweek : chr  "Sunday" "Sunday" "Sunday" "Sunday" ...
##  $ date      : Date, format: "2017-01-01" "2017-01-01" ...
##  $ time      : chr  "00:01" "00:01" "09:00" "00:01" ...
##  $ pddistrict: chr  "NORTHERN" "NORTHERN" "NORTHERN" "INGLESIDE" ...
##  $ resolution: chr  "NONE" "NONE" "NONE" "NONE" ...
##  $ address   : chr  "0 Block of HERMANN ST" "0 Block of HERMANN ST" "HAIGHT ST / STEINER ST" "100 Block of DUNCAN ST" ...
##  $ x         : num  -122 -122 -122 -122 -122 ...
##  $ y         : num  37.8 37.8 37.8 37.7 37.8 ...
##  $ location  : chr  "(37.77070812317366, -122.42596301730948)" "(37.77070812317366, -122.42596301730948)" "(37.77189574196664, -122.43209555515361)" "(37.74636244735492, -122.4234838326707)" ...
##  $ pdid      : num  1.86e+13 1.86e+13 1.86e+13 1.80e+13 1.80e+13 ...
##  - attr(*, ".internal.selfref")=<externalptr>

incidents.bydistrict <- setDT(data.frame(table(incidents.crime$pddistrict)))

setnames(incidents.bydistrict, c("Var1", "Freq"), c("pddistrict", "frequency"))

## merge with population data
incidents.bydistrict[sfpopulation, on = "pddistrict", population := i.population]

incidents.bydistrict[, crime.per.capita := frequency/population]

ggplot(data=incidents.bydistrict, 
       aes(reorder(pddistrict, crime.per.capita),crime.per.capita)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes per Capita", x="Police Department District"
       , title="Crimes per Capita in San Francisco (by District)")  + 
  theme_bw()

How many of the incidents have been solved? In which districts? Which categories?

incidents.crime$condition <- ifelse(grepl("UNFOUNDED", incidents.crime$resolution
                                          ,ignore.case = T), "unsolved"
                            ,ifelse(grepl("NONE", incidents.crime$resolution
                                          , ignore.case = T ), "unsolved", "solved"))
# 37.34%
table(incidents.crime$condition)

## 
##   solved unsolved 
##    46746   125185

How many incidents/crimes have been solved?

crime.condition <- setDT(data.frame(table(incidents.crime$condition
                                          , incidents.crime$pddistrict)))

setnames(crime.condition, c("Var1", "Var2", "Freq")
         , c("condition", "pddistrict", "frequency"))

crime.condition[sfpopulation, on = "pddistrict", population := i.population]

crime.condition[, crime.per.capita := frequency/population]

crime.condition$condition <- factor(crime.condition$condition
                                    , levels=c("unsolved","solved"))

ggplot(data=crime.condition, 
       aes(reorder(pddistrict, crime.per.capita),crime.per.capita, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes per Capita", x="Police Department District"
       , title="Crimes per Capita in San Francisco (by District)") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired")

Which categories?

table(incidents.crime$category)

## 
##                       ARSON                     ASSAULT 
##                         424                       18254 
##                     BRIBERY                    BURGLARY 
##                          84                        7955 
##          DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE 
##                         494                         391 
##               DRUG/NARCOTIC                 DRUNKENNESS 
##                        4790                         423 
##                EMBEZZLEMENT                   EXTORTION 
##                         223                          75 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                         704                        3360 
##                    GAMBLING                  KIDNAPPING 
##                          23                         280 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                       61458                          91 
##                   LOITERING              OTHER OFFENSES 
##                          48                       23884 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                           9                         575 
##                     ROBBERY             SECONDARY CODES 
##                        4422                        2681 
##      SEX OFFENSES, FORCIBLE  SEX OFFENSES, NON FORCIBLE 
##                        1368                          55 
##             STOLEN PROPERTY                     SUICIDE 
##                        1083                         102 
##              SUSPICIOUS OCC                    TRESPASS 
##                        7833                        2127 
##                   VANDALISM               VEHICLE THEFT 
##                       12561                        7207 
##                    WARRANTS                 WEAPON LAWS 
##                        6638                        2309

crime.category <- setDT(data.frame(table(incidents.crime$condition
                                          , incidents.crime$category)))
setnames(crime.category, c("Var1", "Var2", "Freq")
         , c("condition", "category", "frequency"))

crime.category$condition <- factor(crime.category$condition
                                    , levels=c("unsolved","solved"))

# There are many categories. Here we plot only the largest frequency categories
crime.category[, sum.freq := sum(frequency), by = category]

crime.category.high <- crime.category[sum.freq > median(crime.category$sum.freq)]

ggplot(data=crime.category.high, 
       aes(reorder(category, frequency),frequency, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Numbers of Crimes", x="Category Name"
       , title="Crimes by Category in San Francisco") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired")

crime.category.high[, percentage := (frequency/sum.freq)*100]

ggplot(data=crime.category.high, 
       aes(reorder(category, percentage),percentage, fill = condition)) + 
  geom_bar(stat="identity") +
  coord_flip() +
  labs(y="Proportion of Crimes", x="Category Name"
       , title="Crimes by Condition in San Francisco") +
  theme_bw() +
  scale_fill_brewer(palette = "Paired")

Do some criminal categories have significant differences in freq among districts?

# Here I pick some particular crimes to investigate
crime.district <- setDT(data.frame(table(incidents.crime$condition
                                         , incidents.crime$category
                                         , incidents.crime$pddistrict)))
setnames(crime.district, c("Var1", "Var2", "Var3", "Freq")
         , c("condition","category", "pddistrict", "frequency"))

# Take "unsolved" criminal cases only
crimes <- c("VEHICLE THEFT","LARCENY/THEFT","VANDALISM"
            ,"ASSAULT", "SUSPICIOUS OCC")

crime.district <- crime.district[grep(paste(crimes, collapse = "|")
                                      ,category),]

crime.district[ , sum.freq := sum(frequency), by = pddistrict]
crime.district[, percentage := (frequency/sum.freq)*100]


ggplot(crime.district,aes(x=pddistrict,y=frequency,fill=category))+
  geom_bar(stat="identity",position="dodge")+
  scale_fill_brewer(palette = "RdYlBu")  +
  labs(y="Count", x="District"
       , title="Particular Crimes by District in San Francisco") +
  theme_bw() +
  theme(text = element_text(size=15),
        axis.text.x = element_text(angle=90, hjust=1))

What day of the week, month of the year has the most criminal incidents?

incidents.crime[, frequency := 1]

crime.summary <- incidents.crime[ ,sum(frequency), by= list(date, dayofweek)]

crime.summary <- crime.summary[date < "2018-05-01"]

setnames(crime.summary, "V1", "crime")

ggplot(crime.summary , aes(date, crime)) + 
  geom_line() + 
  scale_y_continuous() +
  scale_x_date(date_breaks = "1 month", date_labels = "%Y/%m") +
  labs(y="Count", x="Time"
       , title="Crime in San Francisco (time series)") +
  theme_bw() +
  theme(text = element_text(size=15),
        axis.text.x = element_text(angle=70, hjust=1))

# average crime (monthly) blox plot in 2017

monthOrder <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
crime.summary$month <- factor(format(crime.summary$date, "%b"), levels = monthOrder)


crime.summary.2017 <- crime.summary[date < "2018-01-01"]
crime.summary.2017 <- crime.summary[, mean.crime := mean(crime), by = month]

ggplot(crime.summary.2017, aes(month, mean.crime, group = 1)) +
  geom_line()+
  geom_point() +
  ggtitle("Average Crime in San Francisco by Month in 2017") +
  theme_bw()

crime.summary$dayofweek <- factor(crime.summary$dayofweek, levels=c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday", "Sunday"))

# day of week

ggplot(crime.summary, aes(dayofweek, crime)) +
  geom_boxplot() + stat_boxplot(geom ='errorbar') + 
  ggtitle("Crime in San Francisco by Weekday")

San Francisco Crime

Nam Nguyen

7/13/2018

Setup

Data Import

Cleaning

Descriptive Analysis

Which district has the most criminal incidents per capita?

How many of the incidents have been solved? In which districts? Which categories?

How many incidents/crimes have been solved?

Which categories?

Do some criminal categories have significant differences in freq among districts?

What day of the week, month of the year has the most criminal incidents?