Visualization of Criminal Incidient Data in San Fransisco

–Overview–

In this project, we will analyze criminal incident data from San Fransisco to visualize and compare patterns. The objective is to use data visulization skills to help people better understand the dataset. The following questions need to be answered through analysis.

How do incidents vary by time of day?

Which incidents are most common in the evening?

During what periods of the day are robberies most common?

How do incidents vary by neighborhood?

Which incidents are most common in the city center?

In what areas or neighborhoods are robberies or thefts most common?

How do incidents vary month to month in the Summer 2014 dataset?

Which incident types tend to correlate with each other on a day-by-day basis?

—Analysis—

How do incidents vary by time of day?

library(ggplot2)
san <- read.csv("/Users/z001ldk/datasci_course_materials/assignment6/sanfrancisco_incidents_summer_2014.csv")
hr <- as.POSIXlt(san$Time,format="%H:%M")
san$Hour <- hr$hour

ggplot(data=san, aes(san$Hour)) + 
  geom_histogram(breaks=seq(0, 24, by = 1), 
                 alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Number of Incidents vs Time") +
  labs(x="Hours", y="Number of Incidents")

Which incidents are most common in the evening?

san1 <- san[san$Hour >=18,]
san2 <- san[san$Hour <= 6,]
san1 <- rbind(san1,san2)

ggplot(data=san, aes(factor(san$Category))) + 
  geom_bar( alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Number of Incidents vs Incident Category") +
  labs(x="Incident Category", y="Number of Incidents")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

During what periods of the day are robberies most common?

san1 <- san[san$Category == 'ROBBERY',]

ggplot(data=san1, aes(san1$Hour)) + 
  geom_histogram(breaks=seq(0, 24, by = 1), 
                 alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Robbery Incidents vs Time") +
  labs(x="Hours", y="Number of Robbery Incidents")

How do incidents vary by neighborhood?

ggplot(data=san, aes(factor(san$PdDistrict))) + 
  geom_histogram( alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Distribution of Incidents in the neighborhood") +
  labs(x="Zones", y="Number of Incidents")

Which incidents are most common in the city center?

san1 <- san[san$PdDistrict == 'CENTRAL',]

ggplot(data=san1, aes(factor(san1$Category))) + 
  geom_histogram( alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Number of Incidents in City Centre vs Incident Category") +
  labs(x="Incident Category", y="Number of Incidents")+
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

In what areas or neighborhoods are robberies or thefts most common?

san1 <- san[san$Category== 'ROBBERY' | san$Category =='LARCENY/THEFT',]

ggplot(data=san1, aes(factor(san1$PdDistrict))) + 
  geom_histogram( alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Number of Robbery or Theft Incidents vs District") +
  labs(x="District", y="Number of Robbery or Theft Incidents")

How do incidents vary month to month in the Summer 2014 dataset?

library(lubridate)

san$Date <- as.Date(san$Date, "%m/%d/%Y")
san$Month <- month(san$Date)

ggplot(data=san, aes(factor(san$Month))) + 
  geom_histogram( 
    breaks=seq(0, 8, by = 1),alpha = 0.8,
                  col="red",
                 aes(fill=..count..)) + 
  scale_fill_gradient("Count", low = "green", high = "red")+
  labs(title="Incidents vs Month") +
  labs(x="Month", y="Number of Incidents")

where do incidents occur?

library(ggmap)
library(knitr)
san$Latitude <- san$X
san$Longitude <- san$Y
san1 <- subset(san,Category == 'LARCENY/THEFT' | Category == 'ASSAULT' | Category == 'NON-CRIMINAL' | Category == 'VEHICLE THEFT' | Category =='WARRANTS' | Category == 'DRUG/NARCOTIC')
san1$cat <- factor(san1$Category)
qmplot(x=Latitude,y=Longitude,data=san1,color=cat)
## Using zoom = 13...
## Map from URL : http://tile.stamen.com/toner-lite/13/1307/3165.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1308/3165.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1309/3165.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1310/3165.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1311/3165.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1307/3166.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1308/3166.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1309/3166.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1310/3166.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1311/3166.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1307/3167.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1308/3167.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1309/3167.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1310/3167.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1311/3167.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1307/3168.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1308/3168.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1309/3168.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1310/3168.png
## Map from URL : http://tile.stamen.com/toner-lite/13/1311/3168.png

kable(t(table(san1$Category)))
ARSON ASSAULT BRIBERY BURGLARY DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC DRUNKENNESS EMBEZZLEMENT EXTORTION FAMILY OFFENSES FORGERY/COUNTERFEITING FRAUD GAMBLING KIDNAPPING LARCENY/THEFT LIQUOR LAWS LOITERING MISSING PERSON NON-CRIMINAL OTHER OFFENSES PORNOGRAPHY/OBSCENE MAT PROSTITUTION ROBBERY RUNAWAY SECONDARY CODES STOLEN PROPERTY SUICIDE SUSPICIOUS OCC TRESPASS VANDALISM VEHICLE THEFT WARRANTS WEAPON LAWS
0 2882 0 0 0 0 1345 0 0 0 0 0 0 0 0 9466 0 0 0 3023 0 0 0 0 0 0 0 0 0 0 0 1966 1782 0

Incidents by Weekdays

library(knitr)
qmplot(x=Latitude,y=Longitude,data=san1,color=DayOfWeek)
## Using zoom = 13...

kable(t(table(san1$DayOfWeek)))
Friday Monday Saturday Sunday Thursday Tuesday Wednesday
3113 2801 3154 3094 2760 2717 2825
san_DayCate <- aggregate(IncidntNum ~ DayOfWeek + Category, data = san, FUN = length)
san_DayCatesum <- aggregate(IncidntNum ~ DayOfWeek, data = san_DayCate, FUN = sum)
san_DayCatesum$Category <- "x"
ggplot(san_DayCate, aes(x=DayOfWeek, y=IncidntNum, fill=Category)) +
        geom_bar(binwidth=.8, alpha=0.8,stat = "identity") +
        geom_text(aes(y = IncidntNum + 42, label=IncidntNum), data = san_DayCatesum,stat = "identity")

Incidents by Hour

dfSanCrime_HourCate <- aggregate(IncidntNum ~ Hour + Category, data = san, FUN = length)
dfSanCrime_HourCate_hoursum <- aggregate(IncidntNum ~ Hour, data = dfSanCrime_HourCate, FUN = sum)
dfSanCrime_HourCate_hoursum$Category <- "x"

ggplot(dfSanCrime_HourCate, aes(x=Hour, y=IncidntNum, fill=Category)) +
        geom_bar(binwidth=1, alpha=.8, stat="identity") +
        geom_text(aes(y = IncidntNum + 34, label=IncidntNum), data = dfSanCrime_HourCate_hoursum,
                  stat = "identity")