crime = read.csv('/Users/ruiqianli/Desktop/EBAC_DABP/Day_2/data/crime\ case/crime.csv')
head(crime,4)
## ID Case.Number Date Block IUCR
## 1 10000092 HY189866 18/03/2015 19:44 047XX W OHIO ST 041A
## 2 10000094 HY190059 18/03/2015 23:00 066XX S MARSHFIELD AVE 4625
## 3 10000095 HY190052 18/03/2015 22:45 044XX S LAKE PARK AVE 486
## 4 10000096 HY190054 18/03/2015 22:30 051XX S MICHIGAN AVE 460
## Primary.Type Description Location.Description Arrest Domestic
## 1 BATTERY AGGRAVATED: HANDGUN STREET FALSE FALSE
## 2 OTHER OFFENSE PAROLE VIOLATION STREET TRUE FALSE
## 3 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT FALSE TRUE
## 4 BATTERY SIMPLE APARTMENT FALSE FALSE
## Beat District Ward Community.Area FBI.Code X.Coordinate Y.Coordinate Year
## 1 1111 11 28 25 04B 1144606 1903566 2015
## 2 725 7 15 67 26 1166468 1860715 2015
## 3 222 2 4 39 08B 1185075 1875622 2015
## 4 225 2 3 40 08B 1178033 1870804 2015
## Updated.On Latitude Longitude Location
## 1 02/10/2018 15:50 41.89140 -87.74438 (41.891398861, -87.744384567)
## 2 02/10/2018 15:50 41.77337 -87.66532 (41.773371528, -87.665319468)
## 3 02/10/2018 15:50 41.81386 -87.59664 (41.81386068, -87.596642837)
## 4 02/10/2018 15:50 41.80080 -87.62262 (41.800802415, -87.622619343)
#Task1 Select the specified column in Figure 1
crime.figure1=subset(crime,select=c("ID","Date","Primary.Type","Location.Description","Arrest","Domestic","District","Year"))
head(crime.figure1)
## ID Date Primary.Type Location.Description Arrest Domestic
## 1 10000092 18/03/2015 19:44 BATTERY STREET FALSE FALSE
## 2 10000094 18/03/2015 23:00 OTHER OFFENSE STREET TRUE FALSE
## 3 10000095 18/03/2015 22:45 BATTERY APARTMENT FALSE TRUE
## 4 10000096 18/03/2015 22:30 BATTERY APARTMENT FALSE FALSE
## 5 10000097 18/03/2015 21:00 ROBERY SIDEWALK FALSE FALSE
## 6 10000098 18/03/2015 22:00 BATTERY APARTMENT FALSE FALSE
## District Year
## 1 11 2015
## 2 7 2015
## 3 2 2015
## 4 2 2015
## 5 11 2015
## 6 2 2015
Transform several column names
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
crime.figure2=rename(crime.figure1, Location=Location.Description, CrimeType=Primary.Type)
head(crime.figure2,4)
## ID Date CrimeType Location Arrest Domestic District
## 1 10000092 18/03/2015 19:44 BATTERY STREET FALSE FALSE 11
## 2 10000094 18/03/2015 23:00 OTHER OFFENSE STREET TRUE FALSE 7
## 3 10000095 18/03/2015 22:45 BATTERY APARTMENT FALSE TRUE 2
## 4 10000096 18/03/2015 22:30 BATTERY APARTMENT FALSE FALSE 2
## Year
## 1 2015
## 2 2015
## 3 2015
## 4 2015
Create more columns
pacman::p_load(tidyverse, lubridate)
crime.figure2$Date = ymd_hms(crime.figure2$Date)
crime.figure2$Date2 = date(crime.figure2$Date)
crime.figure2$time = hms(format(crime.figure2$Date, format='%T') )
crime.figure2$weekday = weekdays((crime.figure2$Date2))
crime.figure2$week = week((crime.figure2$Date2))
crime.figure2$weekend = as.factor(ifelse(crime.figure2$weekday %in% c("Saturday", "Sunday"), "we
ekend", "weekday"))
head(crime.figure2)
## ID Date CrimeType Location Arrest Domestic District
## 1 10000092 2018-03-20 15:19:44 BATTERY STREET FALSE FALSE 11
## 2 10000094 2018-03-20 15:23:00 OTHER OFFENSE STREET TRUE FALSE 7
## 3 10000095 2018-03-20 15:22:45 BATTERY APARTMENT FALSE TRUE 2
## 4 10000096 2018-03-20 15:22:30 BATTERY APARTMENT FALSE FALSE 2
## 5 10000097 2018-03-20 15:21:00 ROBERY SIDEWALK FALSE FALSE 11
## 6 10000098 2018-03-20 15:22:00 BATTERY APARTMENT FALSE FALSE 2
## Year Date2 time weekday week weekend
## 1 2015 2018-03-20 15H 19M 44S Tuesday 12 weekday
## 2 2015 2018-03-20 15H 23M 0S Tuesday 12 weekday
## 3 2015 2018-03-20 15H 22M 45S Tuesday 12 weekday
## 4 2015 2018-03-20 15H 22M 30S Tuesday 12 weekday
## 5 2015 2018-03-20 15H 21M 0S Tuesday 12 weekday
## 6 2015 2018-03-20 15H 22M 0S Tuesday 12 weekday
Create breaks and labels for the breaks
breaks = hour(hm("00:00", "6:00", "12:00", "18:00", "23:59"))
breaks
## [1] 0 6 12 18 23
labels = c("Night", "Morning", "Afternoon", "Evening")
crime.figure2$Time_of_day = cut(x=hour(crime.figure2$time), breaks = breaks, labels = labels, include.lowest=TRUE)
head(crime.figure2,4)
## ID Date CrimeType Location Arrest Domestic District
## 1 10000092 2018-03-20 15:19:44 BATTERY STREET FALSE FALSE 11
## 2 10000094 2018-03-20 15:23:00 OTHER OFFENSE STREET TRUE FALSE 7
## 3 10000095 2018-03-20 15:22:45 BATTERY APARTMENT FALSE TRUE 2
## 4 10000096 2018-03-20 15:22:30 BATTERY APARTMENT FALSE FALSE 2
## Year Date2 time weekday week weekend Time_of_day
## 1 2015 2018-03-20 15H 19M 44S Tuesday 12 weekday Afternoon
## 2 2015 2018-03-20 15H 23M 0S Tuesday 12 weekday Afternoon
## 3 2015 2018-03-20 15H 22M 45S Tuesday 12 weekday Afternoon
## 4 2015 2018-03-20 15H 22M 30S Tuesday 12 weekday Afternoon
Delete a column and change the order of a column
crime.figure2=crime.figure2[,-2]
head(crime.figure2,4)
## ID CrimeType Location Arrest Domestic District Year Date2
## 1 10000092 BATTERY STREET FALSE FALSE 11 2015 2018-03-20
## 2 10000094 OTHER OFFENSE STREET TRUE FALSE 7 2015 2018-03-20
## 3 10000095 BATTERY APARTMENT FALSE TRUE 2 2015 2018-03-20
## 4 10000096 BATTERY APARTMENT FALSE FALSE 2 2015 2018-03-20
## time weekday week weekend Time_of_day
## 1 15H 19M 44S Tuesday 12 weekday Afternoon
## 2 15H 23M 0S Tuesday 12 weekday Afternoon
## 3 15H 22M 45S Tuesday 12 weekday Afternoon
## 4 15H 22M 30S Tuesday 12 weekday Afternoon
crime.figure2.final = crime.figure2[,c('ID','Date2','Time_of_day','weekday','weekend','week','CrimeType','Location','District','Arrest','Domestic')]
crime.figure2.final$CrimeType[crime.figure2.final$CrimeType=='OTHER OFFENSE']='oth'
head(crime.figure2.final,4)
## ID Date2 Time_of_day weekday weekend week CrimeType Location
## 1 10000092 2018-03-20 Afternoon Tuesday weekday 12 BATTERY STREET
## 2 10000094 2018-03-20 Afternoon Tuesday weekday 12 oth STREET
## 3 10000095 2018-03-20 Afternoon Tuesday weekday 12 BATTERY APARTMENT
## 4 10000096 2018-03-20 Afternoon Tuesday weekday 12 BATTERY APARTMENT
## District Arrest Domestic
## 1 11 FALSE FALSE
## 2 7 TRUE FALSE
## 3 2 FALSE TRUE
## 4 2 FALSE FALSE
#Task2 To study the crime pattern over time the data needs to be prepared in a time series format. Take the data from task-1 and further prepare it to in a format that has crime incident counts aggregated yearly by week for every district as shown in fig-3
df_tseries = crime.figure2 %>%
group_by(Year,week, District)%>%
summarise(count = n())
## `summarise()` has grouped output by 'Year', 'week'. You can override using the `.groups` argument.
head(df_tseries,4)
## # A tibble: 4 x 4
## # Groups: Year, week [1]
## Year week District count
## <int> <dbl> <int> <int>
## 1 2011 3 1 869
## 2 2011 3 2 1194
## 3 2011 3 3 1400
## 4 2011 3 4 1598