In this post, the Toronto crime data set (167525 records and 29 variables) is explored. The package tidyverse is used to wrangle the data.
Toronto’s crime data is available on the Toronto Police Service Public Safety Data Portal.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(dplyr)
library(DataExplorer)
library(ggplot2)
getwd()
## [1] "C:/Users/user/Documents/Data Science/Analytics projects/7-Theme/Crime analysis/Toronto police/Toronto Police data-2014-2018"
data <- read_csv("MCI_2014_to_2018.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## event_unique_id = col_character(),
## occurrencedate = col_datetime(format = ""),
## reporteddate = col_datetime(format = ""),
## premisetype = col_character(),
## offence = col_character(),
## reportedmonth = col_character(),
## reporteddayofweek = col_character(),
## occurrencemonth = col_character(),
## occurrencedayofweek = col_character(),
## MCI = col_character(),
## Division = col_character(),
## Neighbourhood = col_character()
## )
## i Use `spec()` for the full column specifications.
str(data)
## tibble [167,525 x 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
## $ Y : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
## $ Index_ : num [1:167525] 2349 2350 2351 2352 2354 ...
## $ event_unique_id : chr [1:167525] "GO-20149004286" "GO-20142411379" "GO-20142411379" "GO-20142412127" ...
## $ occurrencedate : POSIXct[1:167525], format: "2014-06-20 10:55:00" "2014-07-02 00:20:00" ...
## $ reporteddate : POSIXct[1:167525], format: "2014-06-20 13:20:00" "2014-07-02 02:58:00" ...
## $ premisetype : chr [1:167525] "Apartment" "Outside" "Outside" "House" ...
## $ ucr_code : num [1:167525] 2130 1457 1610 2120 1430 ...
## $ ucr_ext : num [1:167525] 210 100 100 200 100 200 100 100 220 220 ...
## $ offence : chr [1:167525] "Theft Over" "Pointing A Firearm" "Robbery With Weapon" "B&E" ...
## $ reportedyear : num [1:167525] 2014 2014 2014 2014 2014 ...
## $ reportedmonth : chr [1:167525] "June" "July" "July" "July" ...
## $ reportedday : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
## $ reporteddayofyear : num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
## $ reporteddayofweek : chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
## $ reportedhour : num [1:167525] 13 2 2 5 20 17 23 11 22 16 ...
## $ occurrenceyear : num [1:167525] 2014 2014 2014 2014 2014 ...
## $ occurrencemonth : chr [1:167525] "June" "July" "July" "July" ...
## $ occurrenceday : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
## $ occurrencedayofyear: num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
## $ occurrencedayofweek: chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
## $ occurrencehour : num [1:167525] 10 0 0 1 20 0 23 1 15 1 ...
## $ MCI : chr [1:167525] "Theft Over" "Assault" "Robbery" "Break and Enter" ...
## $ Division : chr [1:167525] "D52" "D32" "D32" "D42" ...
## $ Hood_ID : num [1:167525] 76 36 36 132 130 120 31 76 132 32 ...
## $ Neighbourhood : chr [1:167525] "Bay Street Corridor (76)" "Newtonbrook West (36)" "Newtonbrook West (36)" "Malvern (132)" ...
## $ Lat : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
## $ Long : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
## $ ObjectId : num [1:167525] 2001 2002 2003 2004 2005 ...
## - attr(*, "spec")=
## .. cols(
## .. X = col_double(),
## .. Y = col_double(),
## .. Index_ = col_double(),
## .. event_unique_id = col_character(),
## .. occurrencedate = col_datetime(format = ""),
## .. reporteddate = col_datetime(format = ""),
## .. premisetype = col_character(),
## .. ucr_code = col_double(),
## .. ucr_ext = col_double(),
## .. offence = col_character(),
## .. reportedyear = col_double(),
## .. reportedmonth = col_character(),
## .. reportedday = col_double(),
## .. reporteddayofyear = col_double(),
## .. reporteddayofweek = col_character(),
## .. reportedhour = col_double(),
## .. occurrenceyear = col_double(),
## .. occurrencemonth = col_character(),
## .. occurrenceday = col_double(),
## .. occurrencedayofyear = col_double(),
## .. occurrencedayofweek = col_character(),
## .. occurrencehour = col_double(),
## .. MCI = col_character(),
## .. Division = col_character(),
## .. Hood_ID = col_double(),
## .. Neighbourhood = col_character(),
## .. Lat = col_double(),
## .. Long = col_double(),
## .. ObjectId = col_double()
## .. )
introduce(data)
## # A tibble: 1 x 9
## rows columns discrete_columns continuous_colu~ all_missing_col~
## <int> <int> <int> <int> <int>
## 1 167525 29 12 17 0
## # ... with 4 more variables: total_missing_values <int>, complete_rows <int>,
## # total_observations <int>, memory_usage <dbl>
The data contains 167525 rows and 29 columns.
There are also 245 missing values in total.
The data contain 29 variables.
The variables such as reportedyear, reportedmonth, reportedday, reporteddayofweek were extracted from the native variable reporteddate.
The variables such as occurenceyear, occurencemonth, occurencedday, occurencedayofweek were extracted from the native variable occurenceddate.
data %>%
distinct(event_unique_id) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 145817
There are 145817 unique ids while there are 167525 rows in the data set. Therefore, there are maybe 21708 cases (167525-145817 = 21708) that were considered as multiple crime types, such as assault and theft.
data %>%
group_by(event_unique_id) %>%
summarise(countid = n()) %>%
arrange(desc(countid)) %>%
head(10)
## # A tibble: 10 x 2
## event_unique_id countid
## <chr> <int>
## 1 GO-20151785704 24
## 2 GO-2015840772 16
## 3 GO-20152031940 12
## 4 GO-201768934 11
## 5 GO-20142573299 10
## 6 GO-20161194129 10
## 7 GO-2016666713 10
## 8 GO-201669300 10
## 9 GO-20172020713 10
## 10 GO-20181906437 10
The crime with an id GO-20151785704 appears 24 times. I wonder if one occurrence can be recorded as many type of crimes.
data %>%
filter(event_unique_id == "GO-20151785704") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 24 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 2 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 3 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 4 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 5 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 6 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 7 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 8 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 9 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 10 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## # ... with 14 more rows
In this case, these row with id GO-20151785704 are duplicated. The 24 rows are identical. We can remove these duplicates but what if a crime are reported as multiple types?
data %>%
filter(event_unique_id == "GO-20161246113") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 8 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 2 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 3 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 4 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Assault Assau~
## 5 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 6 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 7 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 8 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
In this case, the crime with an id GO-20161246113 was considered as both assault and robbery. Another example of multiple offenses was found for the crime with id GO-20142139964
data %>%
filter(event_unique_id == "GO-20142139964") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 7 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault With~ Assau~
## 2 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Assau~
## 3 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 4 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 5 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 6 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 7 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
Here, the crime with the id GO-20142139964 is classified as Assault With Weapon, Assault Bodily Harm, and simply Assault. However, the row where the offense is classified as Assault Bodily Harm is duplicated.
nrow(distinct(data))
## [1] 167525
The above code output shows that there are 167525 distinct rows despite the fact that we found there are many duplicate rows. To find the reason, we will examine all the columns of the row id GO-20142139964.
data %>%
filter(event_unique_id == "GO-20142139964")
## # A tibble: 7 x 29
## X Y Index_ event_unique_id occurrencedate reporteddate
## <dbl> <dbl> <dbl> <chr> <dttm> <dttm>
## 1 -79.3 43.7 97670 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 2 -79.3 43.7 97671 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 3 -79.3 43.7 97672 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 4 -79.3 43.7 97673 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 5 -79.3 43.7 97674 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 6 -79.3 43.7 97675 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 7 -79.3 43.7 97700 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## # ... with 23 more variables: premisetype <chr>, ucr_code <dbl>, ucr_ext <dbl>,
## # offence <chr>, reportedyear <dbl>, reportedmonth <chr>, reportedday <dbl>,
## # reporteddayofyear <dbl>, reporteddayofweek <chr>, reportedhour <dbl>,
## # occurrenceyear <dbl>, occurrencemonth <chr>, occurrenceday <dbl>,
## # occurrencedayofyear <dbl>, occurrencedayofweek <chr>, occurrencehour <dbl>,
## # MCI <chr>, Division <chr>, Hood_ID <dbl>, Neighbourhood <chr>, Lat <dbl>,
## # Long <dbl>, ObjectId <dbl>
Here, all the values in the column index_ are unique. Therefore, the search will not find any duplicate. Therefore, we should remove the column Index_ before removing duplicates. It should be noted that the two column X and Y are exactly Lat and Lon
In addition, we can also remove the following columns: ucr_code, ucr_ext, Division, ObjectId.
data1 <- data %>%
select (-c(X, Y, Index_, ucr_code,
ucr_ext, Division, ObjectId))
To remove all the duplicates, we use the function distinct() in tidyverse
data2 <- data1 %>%
distinct()
nrow(distinct(data2))
## [1] 155552
In consequence, the data2 contains only 155552 records.
We can check the row GO-20142139964 again
data2 %>%
filter(event_unique_id == "GO-20142139964") %>%
select(occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)
## # A tibble: 3 x 5
## occurrenceyear occurrencemonth occurrenceday offence MCI
## <dbl> <chr> <dbl> <chr> <chr>
## 1 2014 May 24 Assault With Weapon Assault
## 2 2014 May 24 Assault Assault
## 3 2014 May 24 Assault Bodily Harm Assault
So three offenses were committed for GO-20142139964 : Assault With Weapon, Assault, Assault Bodily Harm.
We can check another crimw id.
data2 %>%
filter(event_unique_id == "GO-20161246113") %>%
select(event_unique_id, occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)
## # A tibble: 2 x 6
## event_unique_id occurrenceyear occurrencemonth occurrenceday offence MCI
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 2 GO-20161246113 2016 July 16 Assault Assa~
The crime with id GO-20161246113 was found to have two offenses: Robbery with Weapon and Assault.
The MCIs for these two offenses are Robbery and Assault, respectively.
plot_missing(data2)
There are 0.03 percent of rows that contain missing values (0.03% *155552 = 46.6656 = 47).
These 47 missing values are found in the five following columns: occurrencedayofweek, occurrencedayofyear, occurrenceday, occurrencemonth, occurrenceyear.
Another way to find the number of missing values for each column.
sapply(data2[,c("occurrencedayofweek", "occurrencedayofyear", "occurrenceday", "occurrencemonth", "occurrenceyear")], function(x) sum(is.na(x)))
## occurrencedayofweek occurrencedayofyear occurrenceday occurrencemonth
## 47 47 47 47
## occurrenceyear
## 47
We can see that these five columns have the same number of missing values. We can guess that these 47 rows are the same.
We can examine further rows with missing values, as follow.
head(data2[which(is.na(data2$occurrenceyear)) ,
c("occurrencedate","reporteddate",
"occurrenceyear","occurrencemonth","occurrenceday",
"occurrencedayofweek","occurrencedayofyear","occurrencehour")],50)
## # A tibble: 47 x 8
## occurrencedate reporteddate occurrenceyear occurrencemonth
## <dttm> <dttm> <dbl> <chr>
## 1 1996-01-31 00:00:00 2014-01-06 13:00:00 NA <NA>
## 2 1998-01-01 00:01:00 2014-04-26 15:20:00 NA <NA>
## 3 1998-01-01 00:01:00 2014-04-26 15:20:00 NA <NA>
## 4 1980-04-24 13:26:00 2015-02-03 12:22:00 NA <NA>
## 5 1999-03-24 00:01:00 2015-03-24 14:40:00 NA <NA>
## 6 1995-01-01 12:00:00 2014-03-06 16:58:00 NA <NA>
## 7 1995-01-01 12:00:00 2014-03-06 16:58:00 NA <NA>
## 8 1999-01-01 00:01:00 2016-05-09 16:10:00 NA <NA>
## 9 1998-06-01 12:00:00 2014-01-01 12:08:00 NA <NA>
## 10 1989-01-01 01:52:00 2014-03-04 01:52:00 NA <NA>
## # ... with 37 more rows, and 4 more variables: occurrenceday <dbl>,
## # occurrencedayofweek <chr>, occurrencedayofyear <dbl>, occurrencehour <dbl>
As we can see here that all these crimes happened before 2014. The values of year, month, day, day of week and day of year were not extracted from occurencedate.
We can remove all of these records.
If we need to keep these rows for further analysis, we have to fill these missing value with appropriate values that are extracted from the column occurrencedate and the column reporteddate.
One way to replace these missing values:
data2b <- data2 %>% dplyr::mutate( occurrenceyear = lubridate::year(occurrencedate), occurrencemonth = lubridate::month(occurrencedate), occurrenceday = lubridate::day(occurrencedate), reportedmonth = lubridate::month(reporteddate))
For this project, we will simply remove rows containing missing values.
Remove all the records with missing values
data2 <- data2 %>%
drop_na()
We can always check if the data2 contain any missing value
which(is.na(data2$occurrenceyear))
## integer(0)
In summary, after dealing with duplicates and missing values, we obtain a data frame which 19 variables and 155505 records.
occur_year <- data2 %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(desc(counts)) %>%
head(10)
occur_year
## # A tibble: 10 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2018 32902
## 2 2017 31828
## 3 2016 30374
## 4 2015 30055
## 5 2014 29428
## 6 2013 481
## 7 2012 138
## 8 2011 81
## 9 2010 58
## 10 2009 38
There are crimes that happened before 2014.
To find the earliest year reported in the data
data2 %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(occurrenceyear)
## # A tibble: 19 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2000 14
## 2 2001 11
## 3 2002 7
## 4 2003 8
## 5 2004 10
## 6 2005 14
## 7 2006 7
## 8 2007 21
## 9 2008 30
## 10 2009 38
## 11 2010 58
## 12 2011 81
## 13 2012 138
## 14 2013 481
## 15 2014 29428
## 16 2015 30055
## 17 2016 30374
## 18 2017 31828
## 19 2018 32902
We have already remove 49 records where there are missing values. These records related to all the crimes happen before 2000. Therefore, at this stage, we only see the records where the occurrenceyear is from 2000.
data2 %>%
filter(occurrenceyear == 2002) %>%
select(premisetype, offence ,occurrenceyear, reportedyear)
## # A tibble: 7 x 4
## premisetype offence occurrenceyear reportedyear
## <chr> <chr> <dbl> <dbl>
## 1 House Assault 2002 2014
## 2 Apartment Assault 2002 2014
## 3 House Assault 2002 2016
## 4 Commercial B&E 2002 2017
## 5 Apartment B&E 2002 2016
## 6 House Assault 2002 2017
## 7 Apartment Assault 2002 2016
Apparently, crimes happened in 2002 but were reported way too long after.
Before doing further analysis, it is better that we remove these records.
The resulting data contain only the crimes that occurred from 2014.
data3 <- data2 %>%
filter(occurrenceyear >= 2014)
The data3 contains 154587 rows while the data2 contains 155505 rows. Therefore, 918 rows are removed. In other words, there were 918 crimes which happened between 2000 and 2013 but were reported only in 2014 or later. We can check if there are still any records where the occurrenceyear is before 2014.
data3 %>%
filter(occurrenceyear < 2014) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
NEIGHBOURHOOD
First, we should see how many neighborhood in Toronto?
n_distinct(data3$Hood_ID)
## [1] 140
n_distinct(data3$Neighbourhood)
## [1] 141
Since there are only 140 neighborhoods in Toronto, There is mistake in the column Neighborhood.
Let find out this mistake by examining a few lines of the column Neighborhood.
head(data3$Neighbourhood)
## [1] "Bay Street Corridor (76)" "Newtonbrook West (36)"
## [3] "Newtonbrook West (36)" "Malvern (132)"
## [5] "Milliken (130)" "Clairlea-Birchmount (120)"
The Neighborhood column contains the name of neighborhoods as well as the id number of these neighborhoods. The mistake could get from the name or the id.
First, we can check if there is any wrong number in the parenthesis. As Toronto has 140 neighborhoods, one possible mistake can be the number 141 that was assigned to one neighborhood.
sum(str_detect(data3$Neighbourhood, "141"))
## [1] 0
So, there is no number 141 in this column. However, we can not continue this method by increasing the number to 142, 143, 144… One way to tackle this problem is extracting the hoodid from the column Neighborhood, name the new column as Hood_id2, and compare with Hood_ID, row by row. We will see if there is a mismatch.
data4 <- data3 %>%
mutate(Hood_ID2 = regmatches(data3$Neighbourhood, regexpr("[0-9]+",data3$Neighbourhood)))
class(data4$Hood_ID)
## [1] "numeric"
class(data4$Hood_ID2)
## [1] "character"
We need to convert Hood_ID2 into numeric
data4$Hood_ID2 <- as.numeric(data4$Hood_ID2)
Checking
class(data4$Hood_ID)
## [1] "numeric"
class(data4$Hood_ID2)
## [1] "numeric"
Now, we can find the number of rows that two columns Hood_ID and Hood_ID2 have different/same values
data4 %>%
filter(Hood_ID != Hood_ID2) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
There are no mismatch. We can conclude that there is no mistake related to the id of neighborhood. Rather, mistakes come from the name of the neighborhood.
We can go back to the data4 as we do not need the Hood_ID2 column. If there is any mistake related to the name, we suspect that the number of the neighborhood will be minor. We can count neighborhood in the data.
data4 %>%
group_by(Neighbourhood) %>%
summarise(counts = n()) %>%
arrange((counts)) %>%
head(20)
## # A tibble: 20 x 2
## Neighbourhood counts
## <chr> <int>
## 1 Eringate-Centennial_West Deane (11) 23
## 2 Lambton Baby Point (114) 170
## 3 Yonge-St.Clair (97) 224
## 4 Centennial Scarborough (133) 281
## 5 Markland Wood (12) 284
## 6 Broadview North (57) 285
## 7 Yonge-Eglinton (100) 294
## 8 Caledonia-Fairbank (109) 297
## 9 Forest Hill South (101) 348
## 10 Princess-Rosethorn (10) 352
## 11 Humewood-Cedarvale (106) 360
## 12 Guildwood (140) 364
## 13 Pleasant View (46) 386
## 14 Mount Pleasant East (99) 395
## 15 Blake-Jones (69) 401
## 16 Bayview Woods-Steeles (49) 403
## 17 Bridle Path-Sunnybrook-York Mills (41) 404
## 18 Woodbine-Lumsden (60) 406
## 19 Kingsway South (15) 419
## 20 Maple Leaf (29) 420
We can see that the neighborhood Eringate-Centennial_West Deane (11) appears 23 times which is maybe not usual.
The mistake come from the neighborhood named Eringate-Centennial-West Deane and Eringate-Centennial_West Deane (11), it should be Eringate-Centennial-West Deane
data4 %>%
group_by(Neighbourhood) %>%
summarise(counts = n()) %>%
filter(Neighbourhood %in%
c("Eringate-Centennial_West Deane (11)","Eringate-Centennial-West Deane (11)"))
## # A tibble: 2 x 2
## Neighbourhood counts
## <chr> <int>
## 1 Eringate-Centennial-West Deane (11) 618
## 2 Eringate-Centennial_West Deane (11) 23
Now, we need to replace Eringate-Centennial_West Deane (11) by Eringate-Centennial-West Deane (11)
data4$Neighbourhood <- str_replace(data4$Neighbourhood, "Eringate-Centennial_West Deane (11)", "Eringate-Centennial-West Deane (11)")
We can save the final data
write.csv(data4,'Torontocrimes.csv')
Torontocrimes <- read_csv("Torontocrimes.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## event_unique_id = col_character(),
## occurrencedate = col_datetime(format = ""),
## reporteddate = col_datetime(format = ""),
## premisetype = col_character(),
## offence = col_character(),
## reportedmonth = col_character(),
## reporteddayofweek = col_character(),
## occurrencemonth = col_character(),
## occurrencedayofweek = col_character(),
## MCI = col_character(),
## Neighbourhood = col_character()
## )
## i Use `spec()` for the full column specifications.
premisetype1 <- Torontocrimes %>%
group_by(premisetype) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
premisetype1
## # A tibble: 5 x 2
## premisetype counts
## <chr> <int>
## 1 Outside 40073
## 2 Apartment 38081
## 3 Commercial 29952
## 4 House 29575
## 5 Other 16906
There are five types of premises, including outside, apartment, commercial, house, and other.
The number of crimes that happened in apartments is higher than in houses.
ggplot(premisetype1,
aes(x= reorder(premisetype, -counts), y = counts)) +
geom_bar(stat = "identity",
width = 0.5,
fill = "darkblue") +
labs(title = "The five premise types",
caption="Source: CMI dataset",
y = "Frequency",
x = "Types of premises") +
geom_text(aes(label=counts),
position = position_dodge(width = 0.9), vjust=-0.25) +
theme(plot.title = element_text(color="black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size=18, face="bold"),
axis.title.y = element_text(size=18, face="bold"))
B. TYPES OF OFFENCES
offencetype <- Torontocrimes %>%
group_by(offence) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
head(offencetype,10)
## # A tibble: 10 x 2
## offence counts
## <chr> <int>
## 1 Assault 57324
## 2 B&E 29066
## 3 Theft Of Motor Vehicle 16834
## 4 Assault With Weapon 12844
## 5 Robbery - Mugging 5077
## 6 B&E W'Intent 4608
## 7 Assault Bodily Harm 3948
## 8 Theft Over 3038
## 9 Robbery - Other 2607
## 10 Robbery With Weapon 2392
There are 49 types of offenses, Assault appears to be the most popular type of offense.
Let visualize the first 15 top offences.
top_offence <- head(offencetype,15)
ggplot(top_offence,
aes(x= reorder(offence,-counts), y = counts)) +
geom_bar(stat="identity",
width = 0.75,
fill = "darkblue") +
labs(title="The top 15 offence types",
caption="Source: CMI dataset",
y = "Frequency",
x = NULL) +
geom_text(aes(label = counts), size = 5, position = position_dodge(width = 0.2), hjust = -0.05) +
theme(plot.title = element_text(color="black", size=18, face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size=18, face="bold"),
axis.title.y = element_text(size=18, face="bold")) +
coord_flip()
occur_year <- Torontocrimes %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(occurrenceyear) %>%
head(10)
occur_year
## # A tibble: 5 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2014 29428
## 2 2015 30055
## 3 2016 30374
## 4 2017 31828
## 5 2018 32902
ggplot(occur_year,
aes(x = occurrenceyear, y = counts)) +
geom_bar(stat = "identity",
width = 0.5,
fill = "cornflowerblue") +
labs(title = "Increase in Crimes from 2014 to 2018",
caption = "Source: CMI dataset",
x = "Year",
y = "Frequency") +
theme_set(theme_classic()) +
geom_text(aes(label = counts), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
The crimes (all kinds of crime) was found to increase from 29428 in 2014 to 32909 in 2018.
occur_month <- Torontocrimes %>%
group_by(occurrencemonth) %>%
summarise(counts = n())
occur_month
## # A tibble: 12 x 2
## occurrencemonth counts
## * <chr> <int>
## 1 April 12372
## 2 August 13474
## 3 December 12576
## 4 February 10936
## 5 January 12158
## 6 July 13524
## 7 June 13364
## 8 March 12121
## 9 May 13499
## 10 November 13481
## 11 October 13784
## 12 September 13298
The result is sorted in alphabetical order. We need to convert the following variables into factor.
Torontocrimes$occurrencemonth <- factor(Torontocrimes$occurrencemonth,
levels = c("January","February","March",
"April","May","June",
"July","August", "September",
"October","November","December"),ordered = TRUE)
Torontocrimes$occurrencedayofweek <- factor(Torontocrimes$occurrencedayofweek,
levels = c("Monday","Tuesday",
"Wednesday","Thursday",
"Friday","Saturday","Sunday"),ordered = TRUE)
Torontocrimes$reportedmonth <- factor(Torontocrimes$reportedmonth,
levels = c("January","February","March",
"April","May","June",
"July","August","September",
"October","November","December"),ordered = TRUE)
Torontocrimes$reporteddayofweek <- factor(Torontocrimes$reporteddayofweek,
levels = c("Monday","Tuesday",
"Wednesday","Thursday",
"Friday","Saturday", "Sunday"),ordered = TRUE)
occurrencemonth_tbl <- Torontocrimes %>%
group_by(occurrencemonth) %>%
summarise(counts= n())
occurrencemonth_tbl
## # A tibble: 12 x 2
## occurrencemonth counts
## * <ord> <int>
## 1 January 12158
## 2 February 10936
## 3 March 12121
## 4 April 12372
## 5 May 13499
## 6 June 13364
## 7 July 13524
## 8 August 13474
## 9 September 13298
## 10 October 13784
## 11 November 13481
## 12 December 12576
Now, months is sorted in chronological order. February appears to be the month where less crimes were committed.
g_occurrenceMONTH <- ggplot(occurrencemonth_tbl,
aes(x = occurrencemonth, y = counts))
g_occurrenceMONTH + geom_bar(stat = "identity",
width = 0.5,
fill = "cornflowerblue") +
labs(title = "Number of crimes by month",
caption = "Source: CMI dataset",
x = "Year",
y = "Frequency") +
theme_set(theme_classic()) +
geom_text(aes(label = counts), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
OCCRRENCEDAYOFMONTH
occurrencedayofmonth_tbl <- Torontocrimes %>%
group_by(occurrenceday) %>%
count()
g_occurrenceday <- ggplot(occurrencedayofmonth_tbl, aes(x = occurrenceday, y = n))
g_occurrenceday +
geom_line(color = "orange", size = 0.5) +
geom_point(color = "blue", size = 5)+
labs(title = "Number of crimes by day of month",
caption = "Source: CMI dataset",
x = "Day of month",
y = "Frequency") +
theme_set(theme_classic()) +
scale_x_continuous(name="Day of month", breaks=seq(1, 31, 3)) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
We should remember that there are 7 months that have 31 days, and the month of February only has 28 days.
OCCURRENCEDAYOFWEEK
occurrencedayofweek_tbl <- Torontocrimes %>%
group_by(occurrencedayofweek) %>%
summarise(counts = n())
occurrencedayofweek_tbl
## # A tibble: 7 x 2
## occurrencedayofweek counts
## * <ord> <int>
## 1 Monday 21363
## 2 Tuesday 21082
## 3 Wednesday 21585
## 4 Thursday 21781
## 5 Friday 23512
## 6 Saturday 23075
## 7 Sunday 22189
ggplot(occurrencedayofweek_tbl,
aes(x = occurrencedayofweek, y = counts)) +
geom_bar(stat = "identity",
width = 0.75,
fill = "cornflowerblue") +
labs(title = "Number of crimes by day of week",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
geom_text(aes(label = counts),
position = position_dodge(width = 0.9),
vjust = -0.25) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
The number of crimes occurred on Friday are the highest.
OCCURRENCEHOUR
occurrencehour_tbl <- Torontocrimes %>%
group_by(occurrencehour) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
ggplot(occurrencehour_tbl,
aes(x = occurrencehour, y = counts)) +
geom_line(color = "purple") +
geom_point (color = "blue", size = 5) +
labs(title = "Number of crimes by time of the day",
caption = "Source: CMI dataset",
x = "Time of the day",
y = "Frequency") +
theme_set(theme_classic()) +
scale_x_continuous(name = "Day of month", breaks = seq(0, 23, 1)) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
The highest numbers of crime were found to at 12 am and 12 pm. The number of crimes appears to be small in the early morning. The number of crime increased gradually during the day and reached 10000 at 12 am.
MAJOR CRIME INDICATORS (MCI)
MCI_tbl <- Torontocrimes %>%
group_by(MCI) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
MCI_tbl
## # A tibble: 5 x 2
## MCI counts
## <chr> <int>
## 1 Assault 82267
## 2 Break and Enter 34793
## 3 Auto Theft 16834
## 4 Robbery 15382
## 5 Theft Over 5311
All the reported crimes are classified in five types of MIC, naming Assault, Break and Enter, Robbery, and Theft Over.
MCI_data <- data.frame(
group = MCI_tbl$MCI,
value = MCI_tbl$counts)
MCI_data
## group value
## 1 Assault 82267
## 2 Break and Enter 34793
## 3 Auto Theft 16834
## 4 Robbery 15382
## 5 Theft Over 5311
ggplot(data = MCI_data,
aes(x = reorder(group,-value),
y = value)) +
geom_col(aes(fill = group) ,
show.legend = FALSE) +
ggtitle(paste("The number of crime by categories")) +
coord_flip() +
geom_label(aes(label = paste(value,
floor((value/sum(value))*100),
sep = "\n"),
fill = group),
show.legend = FALSE,
size = 5,
label.padding = unit(0.25, "lines")) +
expand_limits(y = -150) +
scale_fill_brewer(palette = "Set1",
direction = -1) +
labs(title = "Five categoies of crimes",
caption = "Source: CMI dataset",
y = "Frequency",
x = NULL) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
From 2014 to 2018, Assault is the most popular crime. Indeed, 53% of crimes are classified as Assault.
There are only 3% of total crimes that are Theft Over. The proportion of Robbery and Auto Theft are similar, 9% and 10 %, respectively. The break and Enter crime is twice as many as Auto Theft crime.
FURTHER UNDERSTANDING THE DATA.
Now, we would like to know, between 2014 and 2018, are there any crimes reported later than a year, a month, or a week. We can also check if the reportedyear and the occurenceyear is difference.
YEAR
later_year <- Torontocrimes %>%
filter(occurrenceyear < reportedyear) %>%
count()
later_year
## # A tibble: 1 x 1
## n
## <int>
## 1 2041
There are 2141 cases reported in the next year after the crime happened.
It should be noted that these cases could happen in December in 2014 but reported in January in 2015.
We can also detect if there are any mistake on year, such as reportedyear is earlier than occurrenceyear
Torontocrimes %>%
filter(reportedyear < occurrenceyear) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
There is no mistake about the year.
MONTH We want to know if there cases reported at least a month later. In this case, we should fix the year the same
Torontocrimes %>%
filter(occurrencemonth != reportedmonth,
occurrenceyear == reportedyear) %>%
group_by(occurrenceyear) %>%
count()
## # A tibble: 5 x 2
## # Groups: occurrenceyear [5]
## occurrenceyear n
## <dbl> <int>
## 1 2014 1518
## 2 2015 1365
## 3 2016 1324
## 4 2017 1456
## 5 2018 2067
The table shows the number of cases that was not reported within a month (in the same month), for each year from 2014 to 2018. In 2008, the number of crimes reported at least one month later reached 2067. I should be noted that crime may be happened in the end of month but reported few days later.
Torontocrimes %>%
filter(occurrenceday < reportedday,
occurrenceyear == reportedyear,
occurrencemonth == reportedmonth) %>%
group_by(occurrencemonth) %>%
count()
## # A tibble: 12 x 2
## # Groups: occurrencemonth [12]
## occurrencemonth n
## <ord> <int>
## 1 January 2574
## 2 February 2460
## 3 March 2795
## 4 April 2840
## 5 May 3199
## 6 June 3070
## 7 July 3017
## 8 August 3175
## 9 September 3138
## 10 October 3310
## 11 November 3244
## 12 December 2801
Each month, at least 2500 cases were reported more than a day later.
dayofyear <- Torontocrimes %>%
group_by(occurrencedayofyear) %>%
summarise(counts = n())
ggplot(dayofyear,
aes(x = occurrencedayofyear, y = counts)) +
geom_line(color = "purple") + geom_point (color = "blue", size = 2) +
labs(title = "Number of crimes though a year, from 2014 to 2018",
caption = "Source: CMI dataset",
x = "Day of year",
y = "Counts") + theme_set(theme_classic())
Visualizing timeseries.
Torontocrimes <- Torontocrimes %>%
mutate(Date = strftime(occurrencedate, format="%Y-%m-%d"))
str(Torontocrimes$Date)
## chr [1:154587] "2014-06-20" "2014-07-01" "2014-07-01" "2014-07-01" ...
Torontocrimes$Date <- as.Date(Torontocrimes$Date,"%Y-%m-%d")
we only need day column as 2014-01-01 instead of full fomat
time_series <- Torontocrimes %>%
group_by(Date) %>%
summarise(counts = n())
ggplot(time_series,
aes(Date, counts)) +
geom_point(color = "purple",
size = 3,
pch = 18) +
scale_x_date(date_breaks = "6 month") +
labs(x= "Date",
y= "Frequency",
title = "Number of crime from 2014 to 2018") +
stat_smooth(colour = "green") +
theme(plot.title = element_text(color="darkblue", size=18, face="bold.italic"),
text = element_text(size=18,face="bold"),
axis.title.x = element_text(size= 18, face="bold"),
axis.title.y = element_text(size=18, face="bold"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'