Data Manipulation and Visualization with dplyr and ggplot2

Loading Libraries and Reading Data

library(lubridate)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(mapproj)

## Loading required package: maps
## 
##  # ATTENTION: maps v3.0 has an updated 'world' map.        #
##  # Many country borders and names have changed since 1990. #
##  # Type '?world' or 'news(package="maps")'. See README_v3. #

library(maps)
library(ggmap)
# Load the jsonlite library. fromJSON() converts the JSON feed into a data
# frame. This data set holds only 1000 observations and is the one used to
# illustrate dplyr data manipulation and processing.
library(jsonlite)
json_file <- "https://data.sfgov.org/resource/cuks-n6tp.json"
# flatten = TRUE turns nested JSON objects into ordinary columns
# (TRUE is spelled out because T is an ordinary, reassignable variable).
crimes <- fromJSON(json_file, flatten = TRUE)
# The bigger data set (a little more than 300000 rows) used to create the
# San Francisco district map. The path is built explicitly instead of calling
# setwd(), so the session's working directory is left untouched.
data_dir <- "/home/hduser/DataWrangling"
SF_Crimes <- read.csv(file.path(data_dir, "SFCrimes.csv"))
# Convert the x and y coordinate columns to numeric
crimes$x <- as.numeric(crimes$x)
crimes$y <- as.numeric(crimes$y)

Data Manipulation

# Wrap the data frame as a tibble. A tibble is a special kind of data.frame
# that prints more compactly and is easier to work with. as_tibble() is used
# because tbl_df() is deprecated in current dplyr releases.
crimes_tbl <- as_tibble(crimes)
# Column-by-column overview of the tibble (name, type, first values)
glimpse(crimes_tbl)

## Observations: 1,000
## Variables: 14
## $ address              (chr) "2000 Block of IRVING ST", "2000 Block of...
## $ category             (chr) "WARRANTS", "NON-CRIMINAL", "NON-CRIMINAL...
## $ date                 (chr) "2015-03-23T00:00:00.000", "2015-03-23T00...
## $ dayofweek            (chr) "Monday", "Monday", "Friday", "Thursday",...
## $ descript             (chr) "WARRANT ARREST", "CASE CLOSURE", "SEARCH...
## $ incidntnum           (chr) "150256446", "150256446", "150469649", "1...
## $ pddistrict           (chr) "TARAVAL", "TARAVAL", "BAYVIEW", "BAYVIEW...
## $ pdid                 (chr) "15025644663010", "15025644675030", "1504...
## $ resolution           (chr) "ARREST, BOOKED", "ARREST, BOOKED", "NONE...
## $ time                 (chr) "12:10", "12:10", "18:54", "21:44", "21:0...
## $ x                    (dbl) -122.4798, -122.4798, -122.3877, -122.388...
## $ y                    (dbl) 37.76339, 37.76339, 37.73123, 37.76305, 3...
## $ location.type        (chr) "Point", "Point", "Point", "Point", "Poin...
## $ location.coordinates (list) -122.47981, 37.76339, -122.47981, 37.763...

# Weekend incidents: keep the Saturday and Sunday rows, ordered by the date
# column.
weekendCrimes <- crimes_tbl %>%
  filter(dayofweek %in% c("Saturday", "Sunday")) %>%
  arrange(date)
# Weekday incidents: keep the rows whose day of week is Monday through Friday.
weekdaysCrimes <- crimes_tbl %>%
  filter(
    dayofweek %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
  )

# Keep only four columns: category, day of week, time and address
crimes_subset <- select(crimes_tbl, category, dayofweek, time, address)

# Count the incidents in each category, keep the categories that occur more
# than 15 times, and sort them from the most to the least frequent. (The bar
# chart built from this table labels the period as February 1, 2015 to
# June 18, 2015.)
category_grp <- crimes_tbl %>%
  group_by(category) %>%
  summarise(category_count = n()) %>%
  filter(category_count > 15) %>%
  arrange(desc(category_count))
# Plain data.frame copy of the summary
category_df <- data.frame(category_grp)

# Count the incidents for every (category, district) pair, keep the pairs
# that occur more than 2 times, and sort from the most to the least frequent.
# NOTE(review): an earlier version of this comment said "more than 5 times";
# the code's threshold is 2 -- confirm which one is intended.
# ungroup() drops the grouping that summarise() leaves behind, so the sort is
# global regardless of how arrange() treats grouped data; a second sort after
# the data.frame conversion is therefore unnecessary.
category_dist <- crimes_tbl %>%
  group_by(category, pddistrict) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(n > 2) %>%
  arrange(desc(n))
# Convert to a regular data frame
cat_dist_df <- data.frame(category_dist)
# Already sorted in descending order of n; the name is kept so that existing
# references to cate_dist keep working.
cate_dist <- cat_dist_df
# Choose the top 10 (category, district) pairs
crimes_top10 <- head(cate_dist, 10)
crimes_top10

##          category pddistrict  n
## 1   LARCENY/THEFT   SOUTHERN 63
## 2   LARCENY/THEFT    CENTRAL 38
## 3   LARCENY/THEFT   NORTHERN 29
## 4  OTHER OFFENSES  INGLESIDE 25
## 5         ASSAULT   SOUTHERN 21
## 6    NON-CRIMINAL   SOUTHERN 21
## 7  OTHER OFFENSES    MISSION 21
## 8  OTHER OFFENSES    BAYVIEW 20
## 9  OTHER OFFENSES   NORTHERN 17
## 10   NON-CRIMINAL    MISSION 16

# Number of incidents for every (district, day of week) pair, largest first
crimes_day <- crimes_tbl %>%
  group_by(pddistrict, dayofweek) %>%
  summarise(crimes_count = n()) %>%
  arrange(desc(crimes_count))
# Plain data.frame copy of the counts
crimes_day_df <- data.frame(crimes_day)
# Keep the (district, day) pairs with 5 or more incidents, sorted by count
crimes_day_df1 <- crimes_day_df %>%
  filter(crimes_count > 4) %>%
  arrange(desc(crimes_count))
# For each day of the week, average the per-district counts (rounded to a
# whole number), sort from the highest average down, and store the result as
# a plain data.frame.
crimes_day_avg <- crimes_day %>%
  group_by(dayofweek) %>%
  summarise(avg_crimes = round(mean(crimes_count))) %>%
  arrange(desc(avg_crimes)) %>%
  data.frame()

# The date column is a character vector; replace it with a Date vector
crimes_tbl[["date"]] <- as.Date(crimes_tbl[["date"]])
# Incidents per (date, district) pair, keeping only the pairs with more than
# 5 incidents
crimes_daily <- crimes_tbl %>%
  group_by(date, pddistrict) %>%
  summarise(daily_count = n()) %>%
  filter(daily_count > 5)
# Plain data.frame copy used for plotting
crimes_daily_df <- data.frame(crimes_daily)

San Francisco Crime Sites

# Plot every incident of the large data set on a map, one point per row,
# coloured by police district.
# NOTE(review): the rendered log shows tiles fetched from tile.stamen.com;
# confirm that tile source is still reachable with the installed ggmap.
qmplot(
  X, Y,
  data = SF_Crimes,
  size = I(4),
  alpha = I(0.8),
  color = PdDistrict
) +
  ggtitle("San Francisco Crime Sites")

## Using zoom = 13...
## Map from URL : http://tile.stamen.com/terrain/13/1307/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3164.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3165.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3166.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3167.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1307/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1308/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1309/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1310/3168.jpg
## Map from URL : http://tile.stamen.com/terrain/13/1311/3168.jpg

## Warning: Removed 1 rows containing missing values (geom_point).

Crimes Grouped by Category

# Bar chart of the most frequent crime categories. reorder() with the negated
# count puts the bars in descending order of category_count.
p <- ggplot(
  category_df,
  aes(
    x = reorder(factor(category), -category_count),
    y = category_count,
    fill = category
  )
)
p +
  geom_bar(stat = "identity") +
  ggtitle("Top Crimes Grouped by Category in the City of San Francisco\n February 1, 2015 to June 18, 2015") +
  ylab("Number of Crimes in Each Category") +
  # Single theme() call. The original set axis.text.x to element_blank() and
  # then overrode it with the rotated element_text() below, so only the
  # rotated labels ever took effect; the dead setting is removed.
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "none"
  )

Crimes Grouped by Category in Each District

# Stacked bar chart of the ten most frequent (category, district) pairs:
# one bar per category, filled by police district.
plt <- ggplot(crimes_top10, aes(x = category, y = n, fill = pddistrict))
plt +
  geom_bar(stat = "identity") +
  # The x axis maps `category`, so it is labelled "Category" (it was
  # mislabelled "District"); the legend title typo "Disttrict" is fixed too.
  labs(
    x = "Category",
    y = "Number of Crimes",
    title = "Number of Crimes grouped by Category in Each District"
  ) +
  scale_fill_discrete(name = "District")

Total Number of Crimes Grouped by Day of Week

# Base plot: incidents per day of the week, filled by district. The explicit
# factor levels put the days in calendar order (Monday first) on the x axis.
pdays <- ggplot(
  crimes_day,
  aes(
    x = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = crimes_count,
    fill = pddistrict
  )
)
# Stacked bars: total height is the number of crimes on that day
pdays +
  geom_bar(stat = "identity") +
  labs(
    x = "",
    y = "Number of Crimes",
    title = "Total Number of crimes each day of the week in Each District"
  )

# Side-by-side bars: shows which district has the most crimes each day
pdays +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "",
    y = "Number of Crimes",
    title = "Total Number of crimes each day of the week in Each District"
  )

Daily Average Number of Crimes in all Districts

# Bar chart of the rounded average crime count per day of the week; the
# explicit factor levels put the days in calendar order (Monday first).
avg <- ggplot(
  crimes_day_avg,
  aes(
    x = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    ),
    y = avg_crimes
  )
)
avg +
  geom_bar(stat = "identity", fill = "blue") +
  theme_grey() +
  labs(
    x = " ",
    y = "Daily Average Number of Crimes",
    title = "Daily total average Number of Crimes in all the Districts"
  )

# To change the background color of the panel, one could use, for example:
#   theme(panel.background = element_rect(fill = "green", colour = "red"))
# To change the color of the plot (but not the color of the panel), use this
# instead, for example:
#   theme(plot.background = element_rect(fill = "green", colour = "red"))

Total Number of Crimes grouped by Date

# Stacked bar chart of the daily crime counts, one bar per date, filled by
# district. daily_count is already aggregated, so the bars are drawn with
# geom_bar(stat = "identity") rather than geom_histogram(), which is meant
# for binning raw observations.
ggplot(
  data = crimes_daily_df,
  aes(x = date, y = daily_count, fill = pddistrict)
) +
  geom_bar(stat = "identity") +
  scale_fill_discrete(name = "District") +
  labs(
    x = "Date",
    y = "Number of Crimes",
    title = "Number of Crimes grouped by Date in each District"
  )

Data Manipulation and Visualization with dplyr and ggplot2

Tarek Dib

December 31, 2015

Loading Libraries and Reading Data

Data Manipulation

San Francisco Crime Sites

Crimes Grouped by Category

Crimes Grouped by Category in Each District

Total Number of Crimes Grouped by Day of Week

Daily Average Number of Crimes in all Districts

Total Number of Crimes grouped by Date