Loading Libraries and Reading Data
library(lubridate)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(mapproj)
## Loading required package: maps
##
## # ATTENTION: maps v3.0 has an updated 'world' map. #
## # Many country borders and names have changed since 1990. #
## # Type '?world' or 'news(package="maps")'. See README_v3. #
library(maps)
library(ggmap)
# Load the jsonlite library. The fromJSON function will convert the JSON file into a data frame. This data set includes only 1,000 observations and is the one used to illustrate the use of dplyr for data manipulation and processing.
library(jsonlite)
# Download the small sample of incident reports. flatten = TRUE asks fromJSON
# to flatten nested records into ordinary columns (spelled out as TRUE because
# T is an ordinary variable that can be reassigned).
json_file <- "https://data.sfgov.org/resource/cuks-n6tp.json"
crimes <- fromJSON(json_file, flatten = TRUE)
# The bigger data set (a little more than 300,000 rows) used to create the San
# Francisco district map. It is read through its full path instead of calling
# setwd(), so the session's working directory is left untouched.
SF_Crimes <- read.csv(file.path("/home/hduser/DataWrangling", "SFCrimes.csv"))
# Convert the x and y coordinate columns to numeric
crimes$x <- as.numeric(crimes$x)
crimes$y <- as.numeric(crimes$y)
Data Manipulation
# Wrap the data frame in a tibble. A tibble is a special kind of data.frame
# that is easier to look at and to work with, and it is straightforward to
# derive one from a data.frame. as_tibble() is used because tbl_df() is
# deprecated in current dplyr releases.
crimes_tbl <- as_tibble(crimes)
# Inspect the columns, their types and their first few values
glimpse(crimes_tbl)
## Observations: 1,000
## Variables: 14
## $ address (chr) "2000 Block of IRVING ST", "2000 Block of...
## $ category (chr) "WARRANTS", "NON-CRIMINAL", "NON-CRIMINAL...
## $ date (chr) "2015-03-23T00:00:00.000", "2015-03-23T00...
## $ dayofweek (chr) "Monday", "Monday", "Friday", "Thursday",...
## $ descript (chr) "WARRANT ARREST", "CASE CLOSURE", "SEARCH...
## $ incidntnum (chr) "150256446", "150256446", "150469649", "1...
## $ pddistrict (chr) "TARAVAL", "TARAVAL", "BAYVIEW", "BAYVIEW...
## $ pdid (chr) "15025644663010", "15025644675030", "1504...
## $ resolution (chr) "ARREST, BOOKED", "ARREST, BOOKED", "NONE...
## $ time (chr) "12:10", "12:10", "18:54", "21:44", "21:0...
## $ x (dbl) -122.4798, -122.4798, -122.3877, -122.388...
## $ y (dbl) 37.76339, 37.76339, 37.73123, 37.76305, 3...
## $ location.type (chr) "Point", "Point", "Point", "Point", "Poin...
## $ location.coordinates (list) -122.47981, 37.76339, -122.47981, 37.763...
# Weekend incidents (Saturday or Sunday), ordered by date. The date column is
# still a character vector at this point, so the ordering is textual.
weekendCrimes <- crimes_tbl %>%
  filter(dayofweek %in% c("Saturday", "Sunday")) %>%
  arrange(date)
# Weekday incidents (Monday through Friday)
weekdaysCrimes <- crimes_tbl %>%
  filter(
    dayofweek %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
  )
# Keep only the columns describing what happened, when and where
crimes_subset <- select(crimes_tbl, category, dayofweek, time, address)
# Count the incidents in each category and keep only the categories that
# occurred more than 15 times (the sample is described as covering
# February 1, 2015 to June 18, 2015). Sort the result in descending order of
# the count.
category_grp <- crimes_tbl %>%
  group_by(category) %>%
  summarise(category_count = n()) %>%
  filter(category_count > 15) %>%
  arrange(desc(category_count))
# Plain data.frame copy of the summary
category_df <- category_grp %>%
  data.frame()
# Group by category and district and count the crimes in each
# (category, district) pair. Keep only the pairs that occur more than 2 times,
# then sort in descending order of the count. ungroup() drops the grouping
# that summarize() leaves behind, so the filter and the sort treat the table
# as a whole and no grouping leaks into later steps.
category_dist <- crimes_tbl %>%
  group_by(category, pddistrict) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  filter(n > 2) %>%
  arrange(desc(n))
# Convert to regular data frame
cat_dist_df <- data.frame(category_dist)
# Sort in descending order of the number of crimes in each district and
# category
cate_dist <- cat_dist_df %>%
  arrange(desc(n))
# Choose the top 10 crimes
crimes_top10 <- head(cate_dist, 10)
crimes_top10
## category pddistrict n
## 1 LARCENY/THEFT SOUTHERN 63
## 2 LARCENY/THEFT CENTRAL 38
## 3 LARCENY/THEFT NORTHERN 29
## 4 OTHER OFFENSES INGLESIDE 25
## 5 ASSAULT SOUTHERN 21
## 6 NON-CRIMINAL SOUTHERN 21
## 7 OTHER OFFENSES MISSION 21
## 8 OTHER OFFENSES BAYVIEW 20
## 9 OTHER OFFENSES NORTHERN 17
## 10 NON-CRIMINAL MISSION 16
# Number of incidents for every (district, day of week) pair, largest first
crimes_day <- crimes_tbl %>%
  group_by(pddistrict, dayofweek) %>%
  summarize(crimes_count = n()) %>%
  arrange(desc(crimes_count))
# Plain data.frame copy of the per-district, per-day counts
crimes_day_df <- crimes_day %>%
  data.frame()
# Keep only the district/day pairs with 5 or more crimes, ordered by
# crimes_count
crimes_day_df1 <- crimes_day_df %>%
  filter(crimes_count >= 5) %>%
  arrange(desc(crimes_count))
# For each day of the week, average the per-district counts over all
# districts (rounded to a whole number), sort from the highest average down
# and store the result as a regular data frame
crimes_day_avg <- crimes_day %>%
  group_by(dayofweek) %>%
  summarize(avg_crimes = round(mean(crimes_count))) %>%
  arrange(desc(avg_crimes)) %>%
  data.frame()
# The date column is a character vector; turn it into a Date vector
crimes_tbl <- mutate(crimes_tbl, date = as.Date(date))
# Incidents per date and district, keeping only the date/district pairs with
# more than 5 incidents
crimes_daily <- crimes_tbl %>%
  group_by(date, pddistrict) %>%
  summarize(daily_count = n()) %>%
  filter(daily_count > 5)
# Plain data.frame copy of the daily totals
crimes_daily_df <- crimes_daily %>%
  data.frame()
San Francisco Crime Sites
# Plot every incident in the large data set on a map, coloured by police
# district. I() passes size and alpha as fixed values, not mapped aesthetics.
qmplot(
  X, Y,
  data = SF_Crimes,
  color = PdDistrict,
  size = I(4),
  alpha = I(0.8)
) +
  ggtitle("San Francisco Crime Sites")
## Using zoom = 13...
## Map from URL : http://tile.stamen.com/terrain/13/1307/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3168.jpg
## Warning: Removed 1 rows containing missing values (geom_point).

Crimes Grouped by Category
# Bar chart of the most frequent crime categories, with bars ordered from the
# largest count to the smallest.
p <- ggplot(
  category_df,
  aes(
    x = reorder(factor(category), -category_count),
    y = category_count,
    fill = category
  )
)
# All theme settings live in a single theme() call. The original first blanked
# axis.text.x and then overrode it with rotated text further down the chain;
# only the setting that actually took effect (rotated labels) is kept.
p +
  geom_bar(stat = "identity") +
  ggtitle("Top Crimes Grouped by Category in the City of San Francisco\n February 1, 2015 to June 18, 2015") +
  ylab("Number of Crimes in Each Category") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none"
  )

Crimes Grouped by Category in Each District
# Stacked bar chart of the ten largest (category, district) counts: one bar
# per category, split by district through the fill colour.
plt <- ggplot(crimes_top10, aes(x = category, y = n, fill = pddistrict))
# The x axis shows crime categories, so it is labelled "Category"; the
# districts appear only in the fill legend, which is titled "District".
plt +
  geom_bar(stat = "identity") +
  labs(
    x = "Category",
    y = "Number of Crimes",
    title = "Number of Crimes grouped by Category in Each District"
  ) +
  scale_fill_discrete(name = "District")

Total Number of Crimes Grouped by Day of Week
# Base plot of the crime counts per day of the week, filled by district. The
# explicit factor levels put the days in calendar order on the x axis.
pdays <- ggplot(
  crimes_day,
  aes(
    x = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = crimes_count,
    fill = pddistrict
  )
)
# Stacked version: one bar per day, split by district
pdays +
  geom_bar(stat = "identity") +
  labs(
    x = "",
    y = "Number of Crimes",
    title = "Total Number of crimes each day of the week in Each District"
  )

# Side-by-side version, to check which district has the most crimes each day
pdays +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "",
    y = "Number of Crimes",
    title = "Total Number of crimes each day of the week in Each District"
  )

Daily Average Number of Crimes in all Districts
# Base plot of the average number of crimes for each day of the week, with
# the days placed in calendar order on the x axis.
avg <- ggplot(
  crimes_day_avg,
  aes(
    x = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = avg_crimes
  )
)
# Blue bars on the grey theme; the x-axis title is a single blank
avg +
  geom_bar(stat = "identity", fill = "blue") +
  theme_grey() +
  labs(
    x = " ",
    y = "Daily Average Number of Crimes",
    title = "Daily total average Number of Crimes in all the Districts"
  )

'To change the background color, one could use theme(panel.background = element_rect(fill = "green", colour = "red")) for example. To change the color of the plot (but not the color of the panel), use theme(plot.background = element_rect(fill = "green", colour = "red")) instead.'
## [1] "To change the background color, one could use theme(panel.background = element_rect(fill = \"green\", colour = \"red\")) for example. To change the color of the plot (but not the color of the panel), use theme(plot.background = element_rect(fill = \"green\", colour = \"red\")) instead."
Total Number of Crimes grouped by Date
# Stacked bars of the daily crime totals, one bar per date, split by district.
# The counts are already aggregated, so they are drawn with
# geom_bar(stat = "identity") like the other charts in this file, rather than
# with geom_histogram(), which is meant for binning raw observations.
ggplot(
  data = crimes_daily_df,
  aes(x = date, y = daily_count, fill = pddistrict)
) +
  geom_bar(stat = "identity") +
  scale_fill_discrete(name = "District") +
  labs(
    x = "Date",
    y = "Number of Crimes",
    title = "Number of Crimes grouped by Date in each District"
  )
