In this post, the Toronto crime data set (167525 records and 29 variables) is explored. The package tidyverse is used to wrangle the data.

Finding 47 rows with missing values. Removing all of rows containing missing values as these rows are not useful for future analyis.
11973 duplicates have been found and removed.
The number of columns reduced from 29 to 19. Columns were transformed to the appropriate data types.
Initial analysis and univariate analysis is performed.

Toronto’s crime data is available on the Toronto Police Service Public Safety Data Portal.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(readr)
library(dplyr)
library(DataExplorer)
library(ggplot2)

getwd()

## [1] "C:/Users/user/Documents/Data Science/Analytics projects/7-Theme/Crime analysis/Toronto police/Toronto Police data-2014-2018"

data <- read_csv("MCI_2014_to_2018.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   event_unique_id = col_character(),
##   occurrencedate = col_datetime(format = ""),
##   reporteddate = col_datetime(format = ""),
##   premisetype = col_character(),
##   offence = col_character(),
##   reportedmonth = col_character(),
##   reporteddayofweek = col_character(),
##   occurrencemonth = col_character(),
##   occurrencedayofweek = col_character(),
##   MCI = col_character(),
##   Division = col_character(),
##   Neighbourhood = col_character()
## )
## i Use `spec()` for the full column specifications.

str(data)

## tibble [167,525 x 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X                  : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
##  $ Y                  : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
##  $ Index_             : num [1:167525] 2349 2350 2351 2352 2354 ...
##  $ event_unique_id    : chr [1:167525] "GO-20149004286" "GO-20142411379" "GO-20142411379" "GO-20142412127" ...
##  $ occurrencedate     : POSIXct[1:167525], format: "2014-06-20 10:55:00" "2014-07-02 00:20:00" ...
##  $ reporteddate       : POSIXct[1:167525], format: "2014-06-20 13:20:00" "2014-07-02 02:58:00" ...
##  $ premisetype        : chr [1:167525] "Apartment" "Outside" "Outside" "House" ...
##  $ ucr_code           : num [1:167525] 2130 1457 1610 2120 1430 ...
##  $ ucr_ext            : num [1:167525] 210 100 100 200 100 200 100 100 220 220 ...
##  $ offence            : chr [1:167525] "Theft Over" "Pointing A Firearm" "Robbery With Weapon" "B&E" ...
##  $ reportedyear       : num [1:167525] 2014 2014 2014 2014 2014 ...
##  $ reportedmonth      : chr [1:167525] "June" "July" "July" "July" ...
##  $ reportedday        : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
##  $ reporteddayofyear  : num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
##  $ reporteddayofweek  : chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
##  $ reportedhour       : num [1:167525] 13 2 2 5 20 17 23 11 22 16 ...
##  $ occurrenceyear     : num [1:167525] 2014 2014 2014 2014 2014 ...
##  $ occurrencemonth    : chr [1:167525] "June" "July" "July" "July" ...
##  $ occurrenceday      : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
##  $ occurrencedayofyear: num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
##  $ occurrencedayofweek: chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
##  $ occurrencehour     : num [1:167525] 10 0 0 1 20 0 23 1 15 1 ...
##  $ MCI                : chr [1:167525] "Theft Over" "Assault" "Robbery" "Break and Enter" ...
##  $ Division           : chr [1:167525] "D52" "D32" "D32" "D42" ...
##  $ Hood_ID            : num [1:167525] 76 36 36 132 130 120 31 76 132 32 ...
##  $ Neighbourhood      : chr [1:167525] "Bay Street Corridor (76)" "Newtonbrook West (36)" "Newtonbrook West (36)" "Malvern (132)" ...
##  $ Lat                : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
##  $ Long               : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
##  $ ObjectId           : num [1:167525] 2001 2002 2003 2004 2005 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X = col_double(),
##   ..   Y = col_double(),
##   ..   Index_ = col_double(),
##   ..   event_unique_id = col_character(),
##   ..   occurrencedate = col_datetime(format = ""),
##   ..   reporteddate = col_datetime(format = ""),
##   ..   premisetype = col_character(),
##   ..   ucr_code = col_double(),
##   ..   ucr_ext = col_double(),
##   ..   offence = col_character(),
##   ..   reportedyear = col_double(),
##   ..   reportedmonth = col_character(),
##   ..   reportedday = col_double(),
##   ..   reporteddayofyear = col_double(),
##   ..   reporteddayofweek = col_character(),
##   ..   reportedhour = col_double(),
##   ..   occurrenceyear = col_double(),
##   ..   occurrencemonth = col_character(),
##   ..   occurrenceday = col_double(),
##   ..   occurrencedayofyear = col_double(),
##   ..   occurrencedayofweek = col_character(),
##   ..   occurrencehour = col_double(),
##   ..   MCI = col_character(),
##   ..   Division = col_character(),
##   ..   Hood_ID = col_double(),
##   ..   Neighbourhood = col_character(),
##   ..   Lat = col_double(),
##   ..   Long = col_double(),
##   ..   ObjectId = col_double()
##   .. )

introduce(data)

## # A tibble: 1 x 9
##     rows columns discrete_columns continuous_colu~ all_missing_col~
##    <int>   <int>            <int>            <int>            <int>
## 1 167525      29               12               17                0
## # ... with 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>

The data contains 167525 rows and 29 columns.

There are also 245 missing values in total.

The data contain 29 variables.

The variables such as reportedyear, reportedmonth, reportedday, reporteddayofweek were extracted from the native variable reporteddate.

The variables such as occurenceyear, occurencemonth, occurencedday, occurencedayofweek were extracted from the native variable occurenceddate.

1. Checking consistency of the data

1.1. DUPLICATES

data %>% 
  distinct(event_unique_id) %>% 
  count()

## # A tibble: 1 x 1
##        n
##    <int>
## 1 145817

There are 145817 unique ids while there are 167525 rows in the data set. Therefore, there are maybe 21708 cases (167525-145817 = 21708) that were considered as multiple crime types, such as assault and theft.

data %>%
  group_by(event_unique_id) %>% 
  summarise(countid = n()) %>% 
  arrange(desc(countid)) %>% 
  head(10)

## # A tibble: 10 x 2
##    event_unique_id countid
##    <chr>             <int>
##  1 GO-20151785704       24
##  2 GO-2015840772        16
##  3 GO-20152031940       12
##  4 GO-201768934         11
##  5 GO-20142573299       10
##  6 GO-20161194129       10
##  7 GO-2016666713        10
##  8 GO-201669300         10
##  9 GO-20172020713       10
## 10 GO-20181906437       10

The crime with an id GO-20151785704 appears 24 times. I wonder if one occurrence can be recorded as many type of crimes.

data %>% 
  filter(event_unique_id == "GO-20151785704") %>% 
  select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)

## # A tibble: 24 x 5
##    occurrencedate      reporteddate        Neighbourhood offence           MCI  
##    <dttm>              <dttm>              <chr>         <chr>             <chr>
##  1 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  2 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  3 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  4 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  5 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  6 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  7 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  8 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
##  9 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
## 10 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95)    Administering No~ Assa~
## # ... with 14 more rows

In this case, these row with id GO-20151785704 are duplicated. The 24 rows are identical. We can remove these duplicates but what if a crime are reported as multiple types?

data %>% 
  filter(event_unique_id == "GO-20161246113") %>% 
  select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)

## # A tibble: 8 x 5
##   occurrencedate      reporteddate        Neighbourhood  offence          MCI   
##   <dttm>              <dttm>              <chr>          <chr>            <chr> 
## 1 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 2 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 3 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 4 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Assault          Assau~
## 5 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 6 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 7 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 8 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~

In this case, the crime with an id GO-20161246113 was considered as both assault and robbery. Another example of multiple offenses was found for the crime with id GO-20142139964

data %>% 
  filter(event_unique_id == "GO-20142139964") %>% 
  select(occurrencedate, reporteddate, Neighbourhood, offence,  MCI)

## # A tibble: 7 x 5
##   occurrencedate      reporteddate        Neighbourhood     offence       MCI   
##   <dttm>              <dttm>              <chr>             <chr>         <chr> 
## 1 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault With~ Assau~
## 2 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault       Assau~
## 3 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 4 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 5 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 6 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~
## 7 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (1~ Assault Bodi~ Assau~

Here, the crime with the id GO-20142139964 is classified as Assault With Weapon, Assault Bodily Harm, and simply Assault. However, the row where the offense is classified as Assault Bodily Harm is duplicated.

nrow(distinct(data))

## [1] 167525

The above code output shows that there are 167525 distinct rows despite the fact that we found there are many duplicate rows. To find the reason, we will examine all the columns of the row id GO-20142139964.

data %>% 
  filter(event_unique_id == "GO-20142139964")

## # A tibble: 7 x 29
##       X     Y Index_ event_unique_id occurrencedate      reporteddate       
##   <dbl> <dbl>  <dbl> <chr>           <dttm>              <dttm>             
## 1 -79.3  43.7  97670 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 2 -79.3  43.7  97671 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 3 -79.3  43.7  97672 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 4 -79.3  43.7  97673 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 5 -79.3  43.7  97674 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 6 -79.3  43.7  97675 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## 7 -79.3  43.7  97700 GO-20142139964  2014-05-24 01:00:00 2014-05-24 01:02:00
## # ... with 23 more variables: premisetype <chr>, ucr_code <dbl>, ucr_ext <dbl>,
## #   offence <chr>, reportedyear <dbl>, reportedmonth <chr>, reportedday <dbl>,
## #   reporteddayofyear <dbl>, reporteddayofweek <chr>, reportedhour <dbl>,
## #   occurrenceyear <dbl>, occurrencemonth <chr>, occurrenceday <dbl>,
## #   occurrencedayofyear <dbl>, occurrencedayofweek <chr>, occurrencehour <dbl>,
## #   MCI <chr>, Division <chr>, Hood_ID <dbl>, Neighbourhood <chr>, Lat <dbl>,
## #   Long <dbl>, ObjectId <dbl>

Here, all the values in the column index_ are unique. Therefore, the search will not find any duplicate. Therefore, we should remove the column Index_ before removing duplicates. It should be noted that the two column X and Y are exactly Lat and Lon

In addition, we can also remove the following columns: ucr_code, ucr_ext, Division, ObjectId.

data1 <- data %>% 
  select (-c(X, Y, Index_, ucr_code, 
             ucr_ext, Division, ObjectId))

To remove all the duplicates, we use the function distinct() in tidyverse

data2 <- data1 %>% 
  distinct()

nrow(distinct(data2))

## [1] 155552

In consequence, the data2 contains only 155552 records.

We can check the row GO-20142139964 again

data2 %>% 
  filter(event_unique_id == "GO-20142139964") %>% 
  select(occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)

## # A tibble: 3 x 5
##   occurrenceyear occurrencemonth occurrenceday offence             MCI    
##            <dbl> <chr>                   <dbl> <chr>               <chr>  
## 1           2014 May                        24 Assault With Weapon Assault
## 2           2014 May                        24 Assault             Assault
## 3           2014 May                        24 Assault Bodily Harm Assault

So three offenses were committed for GO-20142139964 : Assault With Weapon, Assault, Assault Bodily Harm.

We can check another crimw id.

data2 %>% 
  filter(event_unique_id == "GO-20161246113") %>% 
  select(event_unique_id, occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)

## # A tibble: 2 x 6
##   event_unique_id occurrenceyear occurrencemonth occurrenceday offence     MCI  
##   <chr>                    <dbl> <chr>                   <dbl> <chr>       <chr>
## 1 GO-20161246113            2016 July                       16 Robbery Wi~ Robb~
## 2 GO-20161246113            2016 July                       16 Assault     Assa~

The crime with id GO-20161246113 was found to have two offenses: Robbery with Weapon and Assault.

The MCIs for these two offenses are Robbery and Assault, respectively.

1.2. MISSING VALUES

plot_missing(data2)

There are 0.03 percent of rows that contain missing values (0.03% *155552 = 46.6656 = 47).

These 47 missing values are found in the five following columns: occurrencedayofweek, occurrencedayofyear, occurrenceday, occurrencemonth, occurrenceyear.

Another way to find the number of missing values for each column.

sapply(data2[,c("occurrencedayofweek", "occurrencedayofyear", "occurrenceday", "occurrencemonth", "occurrenceyear")], function(x) sum(is.na(x)))

## occurrencedayofweek occurrencedayofyear       occurrenceday     occurrencemonth 
##                  47                  47                  47                  47 
##      occurrenceyear 
##                  47

We can see that these five columns have the same number of missing values. We can guess that these 47 rows are the same.

We can examine further rows with missing values, as follow.

head(data2[which(is.na(data2$occurrenceyear)) ,
           c("occurrencedate","reporteddate",
             "occurrenceyear","occurrencemonth","occurrenceday",
             "occurrencedayofweek","occurrencedayofyear","occurrencehour")],50)

## # A tibble: 47 x 8
##    occurrencedate      reporteddate        occurrenceyear occurrencemonth
##    <dttm>              <dttm>                       <dbl> <chr>          
##  1 1996-01-31 00:00:00 2014-01-06 13:00:00             NA <NA>           
##  2 1998-01-01 00:01:00 2014-04-26 15:20:00             NA <NA>           
##  3 1998-01-01 00:01:00 2014-04-26 15:20:00             NA <NA>           
##  4 1980-04-24 13:26:00 2015-02-03 12:22:00             NA <NA>           
##  5 1999-03-24 00:01:00 2015-03-24 14:40:00             NA <NA>           
##  6 1995-01-01 12:00:00 2014-03-06 16:58:00             NA <NA>           
##  7 1995-01-01 12:00:00 2014-03-06 16:58:00             NA <NA>           
##  8 1999-01-01 00:01:00 2016-05-09 16:10:00             NA <NA>           
##  9 1998-06-01 12:00:00 2014-01-01 12:08:00             NA <NA>           
## 10 1989-01-01 01:52:00 2014-03-04 01:52:00             NA <NA>           
## # ... with 37 more rows, and 4 more variables: occurrenceday <dbl>,
## #   occurrencedayofweek <chr>, occurrencedayofyear <dbl>, occurrencehour <dbl>

As we can see here that all these crimes happened before 2014. The values of year, month, day, day of week and day of year were not extracted from occurencedate.

We can remove all of these records.

If we need to keep these rows for further analysis, we have to fill these missing value with appropriate values that are extracted from the column occurrencedate and the column reporteddate.

One way to replace these missing values:

data2b <- data2 %>% dplyr::mutate( occurrenceyear = lubridate::year(occurrencedate), occurrencemonth = lubridate::month(occurrencedate), occurrenceday = lubridate::day(occurrencedate), reportedmonth = lubridate::month(reporteddate))

For this project, we will simply remove rows containing missing values.

Remove all the records with missing values

data2 <- data2 %>% 
  drop_na()

We can always check if the data2 contain any missing value

which(is.na(data2$occurrenceyear))

## integer(0)

In summary, after dealing with duplicates and missing values, we obtain a data frame which 19 variables and 155505 records.

occur_year <- data2 %>%
  group_by(occurrenceyear) %>% 
  summarise(counts = n()) %>% 
  arrange(desc(counts)) %>% 
  head(10)

occur_year

## # A tibble: 10 x 2
##    occurrenceyear counts
##             <dbl>  <int>
##  1           2018  32902
##  2           2017  31828
##  3           2016  30374
##  4           2015  30055
##  5           2014  29428
##  6           2013    481
##  7           2012    138
##  8           2011     81
##  9           2010     58
## 10           2009     38

There are crimes that happened before 2014.

To find the earliest year reported in the data

data2 %>%
  group_by(occurrenceyear) %>% 
  summarise(counts = n()) %>% 
  arrange(occurrenceyear)

## # A tibble: 19 x 2
##    occurrenceyear counts
##             <dbl>  <int>
##  1           2000     14
##  2           2001     11
##  3           2002      7
##  4           2003      8
##  5           2004     10
##  6           2005     14
##  7           2006      7
##  8           2007     21
##  9           2008     30
## 10           2009     38
## 11           2010     58
## 12           2011     81
## 13           2012    138
## 14           2013    481
## 15           2014  29428
## 16           2015  30055
## 17           2016  30374
## 18           2017  31828
## 19           2018  32902

We have already remove 49 records where there are missing values. These records related to all the crimes happen before 2000. Therefore, at this stage, we only see the records where the occurrenceyear is from 2000.

data2 %>% 
  filter(occurrenceyear == 2002) %>% 
  select(premisetype, offence ,occurrenceyear, reportedyear)

## # A tibble: 7 x 4
##   premisetype offence occurrenceyear reportedyear
##   <chr>       <chr>            <dbl>        <dbl>
## 1 House       Assault           2002         2014
## 2 Apartment   Assault           2002         2014
## 3 House       Assault           2002         2016
## 4 Commercial  B&E               2002         2017
## 5 Apartment   B&E               2002         2016
## 6 House       Assault           2002         2017
## 7 Apartment   Assault           2002         2016

Apparently, crimes happened in 2002 but were reported way too long after.

Before doing further analysis, it is better that we remove these records.

The resulting data contain only the crimes that occurred from 2014.

data3 <- data2 %>%
  filter(occurrenceyear >= 2014)

The data3 contains 154587 rows while the data2 contains 155505 rows. Therefore, 918 rows are removed. In other words, there were 918 crimes which happened between 2000 and 2013 but were reported only in 2014 or later. We can check if there are still any records where the occurrenceyear is before 2014.

data3 %>% 
  filter(occurrenceyear < 2014) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

Detecting mistakes

NEIGHBOURHOOD

First, we should see how many neighborhood in Toronto?

n_distinct(data3$Hood_ID)

## [1] 140

n_distinct(data3$Neighbourhood)

## [1] 141

Since there are only 140 neighborhoods in Toronto, There is mistake in the column Neighborhood.

Let find out this mistake by examining a few lines of the column Neighborhood.

head(data3$Neighbourhood)

## [1] "Bay Street Corridor (76)"  "Newtonbrook West (36)"    
## [3] "Newtonbrook West (36)"     "Malvern (132)"            
## [5] "Milliken (130)"            "Clairlea-Birchmount (120)"

The Neighborhood column contains the name of neighborhoods as well as the id number of these neighborhoods. The mistake could get from the name or the id.

First, we can check if there is any wrong number in the parenthesis. As Toronto has 140 neighborhoods, one possible mistake can be the number 141 that was assigned to one neighborhood.

sum(str_detect(data3$Neighbourhood, "141"))

## [1] 0

So, there is no number 141 in this column. However, we can not continue this method by increasing the number to 142, 143, 144… One way to tackle this problem is extracting the hoodid from the column Neighborhood, name the new column as Hood_id2, and compare with Hood_ID, row by row. We will see if there is a mismatch.

data4 <- data3 %>% 
  mutate(Hood_ID2 = regmatches(data3$Neighbourhood, regexpr("[0-9]+",data3$Neighbourhood)))

class(data4$Hood_ID)

## [1] "numeric"

class(data4$Hood_ID2)

## [1] "character"

We need to convert Hood_ID2 into numeric

data4$Hood_ID2 <- as.numeric(data4$Hood_ID2)

Checking

class(data4$Hood_ID)

## [1] "numeric"

class(data4$Hood_ID2)

## [1] "numeric"

Now, we can find the number of rows that two columns Hood_ID and Hood_ID2 have different/same values

data4 %>% 
  filter(Hood_ID != Hood_ID2) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

There are no mismatch. We can conclude that there is no mistake related to the id of neighborhood. Rather, mistakes come from the name of the neighborhood.

We can go back to the data4 as we do not need the Hood_ID2 column. If there is any mistake related to the name, we suspect that the number of the neighborhood will be minor. We can count neighborhood in the data.

data4 %>% 
  group_by(Neighbourhood) %>% 
  summarise(counts = n()) %>% 
  arrange((counts)) %>% 
  head(20)

## # A tibble: 20 x 2
##    Neighbourhood                          counts
##    <chr>                                   <int>
##  1 Eringate-Centennial_West Deane (11)        23
##  2 Lambton Baby Point (114)                  170
##  3 Yonge-St.Clair (97)                       224
##  4 Centennial Scarborough (133)              281
##  5 Markland Wood (12)                        284
##  6 Broadview North (57)                      285
##  7 Yonge-Eglinton (100)                      294
##  8 Caledonia-Fairbank (109)                  297
##  9 Forest Hill South (101)                   348
## 10 Princess-Rosethorn (10)                   352
## 11 Humewood-Cedarvale (106)                  360
## 12 Guildwood (140)                           364
## 13 Pleasant View (46)                        386
## 14 Mount Pleasant East (99)                  395
## 15 Blake-Jones (69)                          401
## 16 Bayview Woods-Steeles (49)                403
## 17 Bridle Path-Sunnybrook-York Mills (41)    404
## 18 Woodbine-Lumsden (60)                     406
## 19 Kingsway South (15)                       419
## 20 Maple Leaf (29)                           420

We can see that the neighborhood Eringate-Centennial_West Deane (11) appears 23 times which is maybe not usual.

The mistake come from the neighborhood named Eringate-Centennial-West Deane and Eringate-Centennial_West Deane (11), it should be Eringate-Centennial-West Deane

data4 %>% 
  group_by(Neighbourhood) %>%
  summarise(counts = n()) %>% 
  filter(Neighbourhood %in% 
           c("Eringate-Centennial_West Deane (11)","Eringate-Centennial-West Deane (11)"))

## # A tibble: 2 x 2
##   Neighbourhood                       counts
##   <chr>                                <int>
## 1 Eringate-Centennial-West Deane (11)    618
## 2 Eringate-Centennial_West Deane (11)     23

Now, we need to replace Eringate-Centennial_West Deane (11) by Eringate-Centennial-West Deane (11)

data4$Neighbourhood <- str_replace(data4$Neighbourhood, "Eringate-Centennial_West Deane (11)",  "Eringate-Centennial-West Deane (11)")

We can save the final data

write.csv(data4,'Torontocrimes.csv')

Univariate analysis

Torontocrimes <- read_csv("Torontocrimes.csv")

## Warning: Missing column names filled in: 'X1' [1]

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   event_unique_id = col_character(),
##   occurrencedate = col_datetime(format = ""),
##   reporteddate = col_datetime(format = ""),
##   premisetype = col_character(),
##   offence = col_character(),
##   reportedmonth = col_character(),
##   reporteddayofweek = col_character(),
##   occurrencemonth = col_character(),
##   occurrencedayofweek = col_character(),
##   MCI = col_character(),
##   Neighbourhood = col_character()
## )
## i Use `spec()` for the full column specifications.

A. PREMISESTYPE

premisetype1 <- Torontocrimes  %>%
  group_by(premisetype) %>% 
  summarise(counts = n()) %>% 
  arrange(desc(counts))

premisetype1

## # A tibble: 5 x 2
##   premisetype counts
##   <chr>        <int>
## 1 Outside      40073
## 2 Apartment    38081
## 3 Commercial   29952
## 4 House        29575
## 5 Other        16906

There are five types of premises, including outside, apartment, commercial, house, and other.

The number of crimes that happened in apartments is higher than in houses.

ggplot(premisetype1,
       aes(x= reorder(premisetype, -counts), y = counts)) + 
  geom_bar(stat = "identity", 
           width = 0.5,
           fill = "darkblue") + 
  labs(title = "The five premise types", 
       caption="Source: CMI dataset",
       y = "Frequency",
       x = "Types of premises") +
  geom_text(aes(label=counts), 
            position = position_dodge(width = 0.9), vjust=-0.25) +
  theme(plot.title = element_text(color="black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size=18, face="bold"),
        axis.title.y = element_text(size=18, face="bold"))

B. TYPES OF OFFENCES

offencetype <- Torontocrimes %>%
  group_by(offence) %>% 
  summarise(counts = n()) %>% 
  arrange(desc(counts))

head(offencetype,10)

## # A tibble: 10 x 2
##    offence                counts
##    <chr>                   <int>
##  1 Assault                 57324
##  2 B&E                     29066
##  3 Theft Of Motor Vehicle  16834
##  4 Assault With Weapon     12844
##  5 Robbery - Mugging        5077
##  6 B&E W'Intent             4608
##  7 Assault Bodily Harm      3948
##  8 Theft Over               3038
##  9 Robbery - Other          2607
## 10 Robbery With Weapon      2392

There are 49 types of offenses, Assault appears to be the most popular type of offense.

Let visualize the first 15 top offences.

top_offence <- head(offencetype,15)

ggplot(top_offence,
       aes(x= reorder(offence,-counts), y = counts)) +
  geom_bar(stat="identity",
           width = 0.75,
           fill = "darkblue") + 
  labs(title="The top 15 offence types",
       caption="Source: CMI dataset",
       y = "Frequency",
       x = NULL) +
  geom_text(aes(label = counts), size = 5, position = position_dodge(width = 0.2), hjust = -0.05) +
  theme(plot.title = element_text(color="black", size=18, face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size=18, face="bold"),
        axis.title.y = element_text(size=18, face="bold")) +
  coord_flip()

C. OCCURRENCEYEAR, OCCURRENCEMONTH, OCCURRENCEDAY, OCCURRENCEDAYOFWEEK

occur_year <- Torontocrimes %>%
  group_by(occurrenceyear) %>% 
  summarise(counts = n()) %>% 
  arrange(occurrenceyear) %>% 
  head(10)

occur_year

## # A tibble: 5 x 2
##   occurrenceyear counts
##            <dbl>  <int>
## 1           2014  29428
## 2           2015  30055
## 3           2016  30374
## 4           2017  31828
## 5           2018  32902

ggplot(occur_year,
       aes(x = occurrenceyear, y = counts)) + 
  geom_bar(stat = "identity",
           width = 0.5,
           fill = "cornflowerblue") + 
  labs(title = "Increase in Crimes from 2014 to 2018",
       caption = "Source: CMI dataset",
       x = "Year",
       y = "Frequency") +
  theme_set(theme_classic()) +
  geom_text(aes(label = counts), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

The crimes (all kinds of crime) was found to increase from 29428 in 2014 to 32909 in 2018.

occur_month <- Torontocrimes  %>%
  group_by(occurrencemonth) %>% 
  summarise(counts = n()) 

occur_month

## # A tibble: 12 x 2
##    occurrencemonth counts
##  * <chr>            <int>
##  1 April            12372
##  2 August           13474
##  3 December         12576
##  4 February         10936
##  5 January          12158
##  6 July             13524
##  7 June             13364
##  8 March            12121
##  9 May              13499
## 10 November         13481
## 11 October          13784
## 12 September        13298

The result is sorted in alphabetical order. We need to convert the following variables into factor.

Torontocrimes$occurrencemonth <- factor(Torontocrimes$occurrencemonth,
                                levels = c("January","February","March",
                                           "April","May","June",
                                           "July","August", "September",
                                           "October","November","December"),ordered = TRUE)

Torontocrimes$occurrencedayofweek <- factor(Torontocrimes$occurrencedayofweek,
                                    levels = c("Monday","Tuesday",
                                               "Wednesday","Thursday",
                                               "Friday","Saturday","Sunday"),ordered = TRUE)

Torontocrimes$reportedmonth <- factor(Torontocrimes$reportedmonth,
                              levels = c("January","February","March", 
                                         "April","May","June",
                                         "July","August","September",
                                         "October","November","December"),ordered = TRUE)

Torontocrimes$reporteddayofweek <- factor(Torontocrimes$reporteddayofweek,
                                  levels = c("Monday","Tuesday",
                                             "Wednesday","Thursday", 
                                             "Friday","Saturday", "Sunday"),ordered = TRUE)

occurrencemonth_tbl <-  Torontocrimes %>% 
  group_by(occurrencemonth) %>% 
  summarise(counts= n())

occurrencemonth_tbl

## # A tibble: 12 x 2
##    occurrencemonth counts
##  * <ord>            <int>
##  1 January          12158
##  2 February         10936
##  3 March            12121
##  4 April            12372
##  5 May              13499
##  6 June             13364
##  7 July             13524
##  8 August           13474
##  9 September        13298
## 10 October          13784
## 11 November         13481
## 12 December         12576

Now, months is sorted in chronological order. February appears to be the month where less crimes were committed.

g_occurrenceMONTH <- ggplot(occurrencemonth_tbl, 
            aes(x = occurrencemonth, y = counts))
g_occurrenceMONTH + geom_bar(stat = "identity", 
             width = 0.5, 
             fill = "cornflowerblue") + 
      labs(title = "Number of crimes by month", 
           caption = "Source: CMI dataset",
           x = "Year",
           y = "Frequency") +
  theme_set(theme_classic()) +
  geom_text(aes(label = counts), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

OCCRRENCEDAYOFMONTH

occurrencedayofmonth_tbl <-  Torontocrimes %>% 
  group_by(occurrenceday) %>% 
  count()

g_occurrenceday <- ggplot(occurrencedayofmonth_tbl, aes(x = occurrenceday, y = n))
g_occurrenceday + 
  geom_line(color = "orange", size = 0.5) + 
  geom_point(color = "blue", size = 5)+
      labs(title = "Number of crimes by day of month", 
           caption = "Source: CMI dataset",
           x = "Day of month",
           y = "Frequency") + 
  theme_set(theme_classic()) +
  scale_x_continuous(name="Day of month",  breaks=seq(1, 31, 3)) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

We should remember that there are 7 months that have 31 days, and the month of February only has 28 days.

OCCURRENCEDAYOFWEEK

occurrencedayofweek_tbl <-  Torontocrimes %>% 
  group_by(occurrencedayofweek) %>% 
  summarise(counts = n()) 


occurrencedayofweek_tbl

## # A tibble: 7 x 2
##   occurrencedayofweek counts
## * <ord>                <int>
## 1 Monday               21363
## 2 Tuesday              21082
## 3 Wednesday            21585
## 4 Thursday             21781
## 5 Friday               23512
## 6 Saturday             23075
## 7 Sunday               22189

ggplot(occurrencedayofweek_tbl,
       aes(x = occurrencedayofweek, y = counts)) + 
  geom_bar(stat = "identity",
           width = 0.75,
           fill = "cornflowerblue") + 
      labs(title = "Number of crimes by day of week", 
           caption = "Source: CMI dataset",
           x = NULL,
           y = "Frequency") + 
  theme_set(theme_classic()) +
  geom_text(aes(label = counts), 
            position = position_dodge(width = 0.9), 
            vjust = -0.25) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

The number of crimes occurred on Friday are the highest.

OCCURRENCEHOUR

occurrencehour_tbl <- Torontocrimes %>% 
  group_by(occurrencehour) %>% 
  summarise(counts = n()) %>% 
  arrange(desc(counts))

ggplot(occurrencehour_tbl,
       aes(x = occurrencehour, y = counts)) + 
  geom_line(color = "purple") + 
  geom_point (color = "blue", size = 5) +
  labs(title = "Number of crimes by time of the day",
       caption = "Source: CMI dataset",
       x = "Time of the day",
       y = "Frequency") + 
  theme_set(theme_classic()) +
  scale_x_continuous(name = "Day of month",  breaks = seq(0, 23, 1)) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

The highest numbers of crime were found to at 12 am and 12 pm. The number of crimes appears to be small in the early morning. The number of crime increased gradually during the day and reached 10000 at 12 am.

MAJOR CRIME INDICATORS (MCI)

MCI_tbl <- Torontocrimes %>% 
  group_by(MCI) %>% 
  summarise(counts = n()) %>% 
  arrange(desc(counts))
MCI_tbl

## # A tibble: 5 x 2
##   MCI             counts
##   <chr>            <int>
## 1 Assault          82267
## 2 Break and Enter  34793
## 3 Auto Theft       16834
## 4 Robbery          15382
## 5 Theft Over        5311

All the reported crimes are classified in five types of MIC, naming Assault, Break and Enter, Robbery, and Theft Over.

MCI_data <- data.frame(
  group = MCI_tbl$MCI,
  value = MCI_tbl$counts)
MCI_data

##             group value
## 1         Assault 82267
## 2 Break and Enter 34793
## 3      Auto Theft 16834
## 4         Robbery 15382
## 5      Theft Over  5311

ggplot(data = MCI_data, 
       aes(x = reorder(group,-value), 
           y = value)) +
  geom_col(aes(fill = group) , 
           show.legend = FALSE) +
  ggtitle(paste("The number of crime by categories")) +
  coord_flip() +
  geom_label(aes(label = paste(value, 
                               floor((value/sum(value))*100), 
                               sep = "\n"), 
                 fill = group),
             show.legend = FALSE,
             size = 5, 
             label.padding = unit(0.25, "lines")) +
  expand_limits(y = -150) +
  scale_fill_brewer(palette = "Set1", 
                    direction = -1) +
  labs(title = "Five categoies of crimes", 
       caption = "Source: CMI dataset",
       y = "Frequency",
       x = NULL) +
  theme(plot.title = element_text(color = "black", 
                                  size=18, 
                                  face="bold.italic"),
        text = element_text(size = 18,face="bold"),
        axis.title.x = element_text(size = 18, face = "bold"),
        axis.title.y = element_text(size = 18, face = "bold"))

From 2014 to 2018, Assault is the most popular crime. Indeed, 53% of crimes are classified as Assault.

There are only 3% of total crimes that are Theft Over. The proportion of Robbery and Auto Theft are similar, 9% and 10 %, respectively. The break and Enter crime is twice as many as Auto Theft crime.

FURTHER UNDERSTANDING THE DATA.

Now, we would like to know, between 2014 and 2018, are there any crimes reported later than a year, a month, or a week. We can also check if the reportedyear and the occurenceyear is difference.

YEAR

later_year <- Torontocrimes %>% 
  filter(occurrenceyear < reportedyear) %>% 
  count()

later_year

## # A tibble: 1 x 1
##       n
##   <int>
## 1  2041

There are 2141 cases reported in the next year after the crime happened.

It should be noted that these cases could happen in December in 2014 but reported in January in 2015.

We can also detect if there are any mistake on year, such as reportedyear is earlier than occurrenceyear

Torontocrimes %>% 
  filter(reportedyear < occurrenceyear) %>% 
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

There is no mistake about the year.

MONTH We want to know if there cases reported at least a month later. In this case, we should fix the year the same

Torontocrimes %>% 
  filter(occurrencemonth != reportedmonth,
         occurrenceyear == reportedyear) %>%
  group_by(occurrenceyear) %>% 
  count()

## # A tibble: 5 x 2
## # Groups:   occurrenceyear [5]
##   occurrenceyear     n
##            <dbl> <int>
## 1           2014  1518
## 2           2015  1365
## 3           2016  1324
## 4           2017  1456
## 5           2018  2067

The table shows the number of cases that was not reported within a month (in the same month), for each year from 2014 to 2018. In 2008, the number of crimes reported at least one month later reached 2067. I should be noted that crime may be happened in the end of month but reported few days later.

Torontocrimes %>% 
  filter(occurrenceday < reportedday,
         occurrenceyear == reportedyear, 
         occurrencemonth == reportedmonth) %>%
  group_by(occurrencemonth) %>% 
  count()

## # A tibble: 12 x 2
## # Groups:   occurrencemonth [12]
##    occurrencemonth     n
##    <ord>           <int>
##  1 January          2574
##  2 February         2460
##  3 March            2795
##  4 April            2840
##  5 May              3199
##  6 June             3070
##  7 July             3017
##  8 August           3175
##  9 September        3138
## 10 October          3310
## 11 November         3244
## 12 December         2801

Each month, at least 2500 cases were reported more than a day later.

dayofyear <- Torontocrimes %>% 
  group_by(occurrencedayofyear) %>% 
  summarise(counts = n())

ggplot(dayofyear,
       aes(x = occurrencedayofyear, y = counts)) + 
  geom_line(color = "purple") + geom_point (color = "blue", size = 2) +
      labs(title = "Number of crimes though a year, from 2014 to 2018", 
           caption = "Source: CMI dataset",
           x = "Day of year",
           y = "Counts") + theme_set(theme_classic())

Visualizing timeseries.

Torontocrimes <- Torontocrimes %>% 
  mutate(Date = strftime(occurrencedate, format="%Y-%m-%d"))

str(Torontocrimes$Date)

##  chr [1:154587] "2014-06-20" "2014-07-01" "2014-07-01" "2014-07-01" ...

Torontocrimes$Date <- as.Date(Torontocrimes$Date,"%Y-%m-%d")

we only need day column as 2014-01-01 instead of full fomat

time_series <- Torontocrimes %>% 
  group_by(Date) %>% 
  summarise(counts = n())

ggplot(time_series, 
       aes(Date, counts)) +
  geom_point(color = "purple",
             size = 3,
             pch = 18) +
  scale_x_date(date_breaks = "6 month") +
  labs(x= "Date", 
       y= "Frequency", 
       title = "Number of crime from 2014 to 2018") +
  stat_smooth(colour = "green") +
  theme(plot.title = element_text(color="darkblue", size=18, face="bold.italic"),
        text = element_text(size=18,face="bold"),
        axis.title.x = element_text(size= 18, face="bold"),
        axis.title.y = element_text(size=18, face="bold"))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'