In this post, the Toronto crime data set (167525 records and 29 variables) is explored. The package tidyverse is used to wrangle the data.
Toronto’s crime data is available on the Toronto Police Service Public Safety Data Portal.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(dplyr)
library(DataExplorer)
library(ggplot2)
raw <- read_csv("MCI_2014_to_2018.csv")
## Rows: 167525 Columns: 29
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (10): event_unique_id, premisetype, offence, reportedmonth, reportedday...
## dbl (17): X, Y, Index_, ucr_code, ucr_ext, reportedyear, reportedday, repor...
## dttm (2): occurrencedate, reporteddate
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(raw)
## spec_tbl_df [167,525 x 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
## $ Y : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
## $ Index_ : num [1:167525] 2349 2350 2351 2352 2354 ...
## $ event_unique_id : chr [1:167525] "GO-20149004286" "GO-20142411379" "GO-20142411379" "GO-20142412127" ...
## $ occurrencedate : POSIXct[1:167525], format: "2014-06-20 10:55:00" "2014-07-02 00:20:00" ...
## $ reporteddate : POSIXct[1:167525], format: "2014-06-20 13:20:00" "2014-07-02 02:58:00" ...
## $ premisetype : chr [1:167525] "Apartment" "Outside" "Outside" "House" ...
## $ ucr_code : num [1:167525] 2130 1457 1610 2120 1430 ...
## $ ucr_ext : num [1:167525] 210 100 100 200 100 200 100 100 220 220 ...
## $ offence : chr [1:167525] "Theft Over" "Pointing A Firearm" "Robbery With Weapon" "B&E" ...
## $ reportedyear : num [1:167525] 2014 2014 2014 2014 2014 ...
## $ reportedmonth : chr [1:167525] "June" "July" "July" "July" ...
## $ reportedday : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
## $ reporteddayofyear : num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
## $ reporteddayofweek : chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
## $ reportedhour : num [1:167525] 13 2 2 5 20 17 23 11 22 16 ...
## $ occurrenceyear : num [1:167525] 2014 2014 2014 2014 2014 ...
## $ occurrencemonth : chr [1:167525] "June" "July" "July" "July" ...
## $ occurrenceday : num [1:167525] 20 2 2 2 2 18 18 19 19 20 ...
## $ occurrencedayofyear: num [1:167525] 171 183 183 183 183 199 199 200 200 201 ...
## $ occurrencedayofweek: chr [1:167525] "Friday" "Wednesday" "Wednesday" "Wednesday" ...
## $ occurrencehour : num [1:167525] 10 0 0 1 20 0 23 1 15 1 ...
## $ MCI : chr [1:167525] "Theft Over" "Assault" "Robbery" "Break and Enter" ...
## $ Division : chr [1:167525] "D52" "D32" "D32" "D42" ...
## $ Hood_ID : num [1:167525] 76 36 36 132 130 120 31 76 132 32 ...
## $ Neighbourhood : chr [1:167525] "Bay Street Corridor (76)" "Newtonbrook West (36)" "Newtonbrook West (36)" "Malvern (132)" ...
## $ Lat : num [1:167525] 43.7 43.8 43.8 43.8 43.8 ...
## $ Long : num [1:167525] -79.4 -79.4 -79.4 -79.2 -79.3 ...
## $ ObjectId : num [1:167525] 2001 2002 2003 2004 2005 ...
## - attr(*, "spec")=
## .. cols(
## .. X = col_double(),
## .. Y = col_double(),
## .. Index_ = col_double(),
## .. event_unique_id = col_character(),
## .. occurrencedate = col_datetime(format = ""),
## .. reporteddate = col_datetime(format = ""),
## .. premisetype = col_character(),
## .. ucr_code = col_double(),
## .. ucr_ext = col_double(),
## .. offence = col_character(),
## .. reportedyear = col_double(),
## .. reportedmonth = col_character(),
## .. reportedday = col_double(),
## .. reporteddayofyear = col_double(),
## .. reporteddayofweek = col_character(),
## .. reportedhour = col_double(),
## .. occurrenceyear = col_double(),
## .. occurrencemonth = col_character(),
## .. occurrenceday = col_double(),
## .. occurrencedayofyear = col_double(),
## .. occurrencedayofweek = col_character(),
## .. occurrencehour = col_double(),
## .. MCI = col_character(),
## .. Division = col_character(),
## .. Hood_ID = col_double(),
## .. Neighbourhood = col_character(),
## .. Lat = col_double(),
## .. Long = col_double(),
## .. ObjectId = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
introduce(raw)
## # A tibble: 1 x 9
## rows columns discrete_columns continuous_columns all_missing_columns
## <int> <int> <int> <int> <int>
## 1 167525 29 12 17 0
## # ... with 4 more variables: total_missing_values <int>, complete_rows <int>,
## # total_observations <int>, memory_usage <dbl>
The data contains 167525 rows and 29 columns.
There are also 245 missing values in total.
The data contain 29 variables.
The variables such as reportedyear, reportedmonth, reportedday, reporteddayofweek were extracted from the native variable reporteddate.
The variables such as occurenceyear, occurencemonth, occurencedday, occurencedayofweek were extracted from the native variable occurenceddate.
raw %>%
distinct(event_unique_id) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 145817
There are 145817 unique ids while there are 167525 rows in the data set. Therefore, there are maybe 21708 cases (167525-145817 = 21708) that were considered as multiple crime types, such as assault and theft.
raw %>%
group_by(event_unique_id) %>%
summarise(countid = n()) %>%
arrange(desc(countid)) %>%
head(10)
## # A tibble: 10 x 2
## event_unique_id countid
## <chr> <int>
## 1 GO-20151785704 24
## 2 GO-2015840772 16
## 3 GO-20152031940 12
## 4 GO-201768934 11
## 5 GO-20142573299 10
## 6 GO-20161194129 10
## 7 GO-2016666713 10
## 8 GO-201669300 10
## 9 GO-20172020713 10
## 10 GO-20181906437 10
The crime with an id GO-20151785704 appears 24 times. I wonder if one occurrence can be recorded as many type of crimes.
raw %>%
filter(event_unique_id == "GO-20151785704") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 24 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 2 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 3 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 4 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 5 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 6 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 7 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 8 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 9 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## 10 2015-10-16 20:56:00 2015-10-16 20:56:00 Annex (95) Administering No~ Assa~
## # ... with 14 more rows
In this case, these row with id GO-20151785704 are duplicated. The 24 rows are identical. We can remove these duplicates but what if a crime are reported as multiple types?
raw %>%
filter(event_unique_id == "GO-20161246113") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 8 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 2 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 3 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 4 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Assault Assau~
## 5 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 6 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 7 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
## 8 2016-07-16 03:00:00 2016-07-16 03:07:00 Milliken (130) Robbery With We~ Robbe~
In this case, the crime with an id GO-20161246113 was considered as both assault and robbery. Another example of multiple offenses was found for the crime with id GO-20142139964
raw %>%
filter(event_unique_id == "GO-20142139964") %>%
select(occurrencedate, reporteddate, Neighbourhood, offence, MCI)
## # A tibble: 7 x 5
## occurrencedate reporteddate Neighbourhood offence MCI
## <dttm> <dttm> <chr> <chr> <chr>
## 1 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Wit~ Assa~
## 2 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Assa~
## 3 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Bod~ Assa~
## 4 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Bod~ Assa~
## 5 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Bod~ Assa~
## 6 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Bod~ Assa~
## 7 2014-05-24 01:00:00 2014-05-24 01:02:00 Eglinton East (138) Assault Bod~ Assa~
Here, the crime with the id GO-20142139964 is classified as Assault With Weapon, Assault Bodily Harm, and simply Assault. However, the row where the offense is classified as Assault Bodily Harm is duplicated.
nrow(distinct(raw))
## [1] 167525
The above code output shows that there are 167525 distinct rows despite the fact that we found there are many duplicate rows. To find the reason, we will examine all the columns of the row id GO-20142139964.
raw %>%
filter(event_unique_id == "GO-20142139964")
## # A tibble: 7 x 29
## X Y Index_ event_unique_id occurrencedate reporteddate
## <dbl> <dbl> <dbl> <chr> <dttm> <dttm>
## 1 -79.3 43.7 97670 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 2 -79.3 43.7 97671 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 3 -79.3 43.7 97672 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 4 -79.3 43.7 97673 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 5 -79.3 43.7 97674 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 6 -79.3 43.7 97675 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## 7 -79.3 43.7 97700 GO-20142139964 2014-05-24 01:00:00 2014-05-24 01:02:00
## # ... with 23 more variables: premisetype <chr>, ucr_code <dbl>, ucr_ext <dbl>,
## # offence <chr>, reportedyear <dbl>, reportedmonth <chr>, reportedday <dbl>,
## # reporteddayofyear <dbl>, reporteddayofweek <chr>, reportedhour <dbl>,
## # occurrenceyear <dbl>, occurrencemonth <chr>, occurrenceday <dbl>,
## # occurrencedayofyear <dbl>, occurrencedayofweek <chr>, occurrencehour <dbl>,
## # MCI <chr>, Division <chr>, Hood_ID <dbl>, Neighbourhood <chr>, Lat <dbl>,
## # Long <dbl>, ObjectId <dbl>
Here, all the values in the column index_ are unique. Therefore, the search will not find any duplicate. Therefore, we should remove the column Index_ before removing duplicates. It should be noted that the two column X and Y are exactly Lat and Lon
In addition, we can also remove the following columns: ucr_code, ucr_ext, Division, ObjectId.
To remove all the duplicates, we use the function distinct() in tidyverse
raw <- raw %>%
distinct()
nrow(distinct(raw))
## [1] 167525
In consequence, the data2 contains only 155552 records.
We can check the row GO-20142139964 again
raw %>%
filter(event_unique_id == "GO-20142139964") %>%
select(occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)
## # A tibble: 7 x 5
## occurrenceyear occurrencemonth occurrenceday offence MCI
## <dbl> <chr> <dbl> <chr> <chr>
## 1 2014 May 24 Assault With Weapon Assault
## 2 2014 May 24 Assault Assault
## 3 2014 May 24 Assault Bodily Harm Assault
## 4 2014 May 24 Assault Bodily Harm Assault
## 5 2014 May 24 Assault Bodily Harm Assault
## 6 2014 May 24 Assault Bodily Harm Assault
## 7 2014 May 24 Assault Bodily Harm Assault
So three offenses were committed for GO-20142139964 : Assault With Weapon, Assault, Assault Bodily Harm.
We can check another crimw id.
raw %>%
filter(event_unique_id == "GO-20161246113") %>%
select(event_unique_id, occurrenceyear,occurrencemonth, occurrenceday, offence, MCI)
## # A tibble: 8 x 6
## event_unique_id occurrenceyear occurrencemonth occurrenceday offence MCI
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 2 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 3 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 4 GO-20161246113 2016 July 16 Assault Assa~
## 5 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 6 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 7 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
## 8 GO-20161246113 2016 July 16 Robbery Wi~ Robb~
The crime with id GO-20161246113 was found to have two offenses: Robbery with Weapon and Assault.
The MCIs for these two offenses are Robbery and Assault, respectively.
plot_missing(raw)
There are 0.03 percent of rows that contain missing values (0.03% *155552 = 46.6656 = 47).
These 47 missing values are found in the five following columns: occurrencedayofweek, occurrencedayofyear, occurrenceday, occurrencemonth, occurrenceyear.
Another way to find the number of missing values for each column.
sapply(raw[,c("occurrencedayofweek", "occurrencedayofyear", "occurrenceday", "occurrencemonth", "occurrenceyear")], function(x) sum(is.na(x)))
## occurrencedayofweek occurrencedayofyear occurrenceday occurrencemonth
## 49 49 49 49
## occurrenceyear
## 49
We can see that these five columns have the same number of missing values. We can guess that these 47 rows are the same.
We can examine further rows with missing values, as follow.
head(raw[which(is.na(raw$occurrenceyear)) ,
c("occurrencedate","reporteddate",
"occurrenceyear","occurrencemonth","occurrenceday",
"occurrencedayofweek","occurrencedayofyear","occurrencehour")],50)
## # A tibble: 49 x 8
## occurrencedate reporteddate occurrenceyear occurrencemonth
## <dttm> <dttm> <dbl> <chr>
## 1 1996-01-31 00:00:00 2014-01-06 13:00:00 NA <NA>
## 2 1998-01-01 00:01:00 2014-04-26 15:20:00 NA <NA>
## 3 1998-01-01 00:01:00 2014-04-26 15:20:00 NA <NA>
## 4 1998-01-01 00:01:00 2014-04-26 15:20:00 NA <NA>
## 5 1980-04-24 13:26:00 2015-02-03 12:22:00 NA <NA>
## 6 1999-03-24 00:01:00 2015-03-24 14:40:00 NA <NA>
## 7 1995-01-01 12:00:00 2014-03-06 16:58:00 NA <NA>
## 8 1995-01-01 12:00:00 2014-03-06 16:58:00 NA <NA>
## 9 1999-01-01 00:01:00 2016-05-09 16:10:00 NA <NA>
## 10 1998-06-01 12:00:00 2014-01-01 12:08:00 NA <NA>
## # ... with 39 more rows, and 4 more variables: occurrenceday <dbl>,
## # occurrencedayofweek <chr>, occurrencedayofyear <dbl>, occurrencehour <dbl>
As we can see here that all these crimes happened before 2014. The values of year, month, day, day of week and day of year were not extracted from occurencedate.
We can remove all of these records.
If we need to keep these rows for further analysis, we have to fill these missing value with appropriate values that are extracted from the column occurrencedate and the column reporteddate.
One way to replace these missing values:
data2b <- data2 %>% dplyr::mutate( occurrenceyear = lubridate::year(occurrencedate), occurrencemonth = lubridate::month(occurrencedate), occurrenceday = lubridate::day(occurrencedate), reportedmonth = lubridate::month(reporteddate))
For this project, we will simply remove rows containing missing values.
Remove all the records with missing values
raw <- raw %>%
drop_na()
We can always check if the raw contain any missing value
which(is.na(raw$occurrenceyear))
## integer(0)
In summary, after dealing with duplicates and missing values, we obtain a data frame which 19 variables and 155505 records.
occur_year <- raw %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(desc(counts)) %>%
head(10)
occur_year
## # A tibble: 10 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2018 35382
## 2 2017 34457
## 3 2016 32751
## 4 2015 32282
## 5 2014 31628
## 6 2013 516
## 7 2012 141
## 8 2011 83
## 9 2010 60
## 10 2009 43
There are crimes that happened before 2014.
To find the earliest year reported in the data
raw %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(occurrenceyear)
## # A tibble: 19 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2000 14
## 2 2001 12
## 3 2002 7
## 4 2003 9
## 5 2004 12
## 6 2005 16
## 7 2006 8
## 8 2007 22
## 9 2008 33
## 10 2009 43
## 11 2010 60
## 12 2011 83
## 13 2012 141
## 14 2013 516
## 15 2014 31628
## 16 2015 32282
## 17 2016 32751
## 18 2017 34457
## 19 2018 35382
We have already remove 49 records where there are missing values. These records related to all the crimes happen before 2000. Therefore, at this stage, we only see the records where the occurrenceyear is from 2000.
raw %>%
filter(occurrenceyear == 2002) %>%
select(premisetype, offence ,occurrenceyear, reportedyear)
## # A tibble: 7 x 4
## premisetype offence occurrenceyear reportedyear
## <chr> <chr> <dbl> <dbl>
## 1 House Assault 2002 2014
## 2 Apartment Assault 2002 2014
## 3 House Assault 2002 2016
## 4 Commercial B&E 2002 2017
## 5 Apartment B&E 2002 2016
## 6 House Assault 2002 2017
## 7 Apartment Assault 2002 2016
Apparently, crimes happened in 2002 but were reported way too long after.
Before doing further analysis, it is better that we remove these records.
The resulting data contain only the crimes that occurred from 2014.
data3 <- raw %>%
filter(occurrenceyear >= 2014)
The data3 contains 154587 rows while the data2 contains 155505 rows. Therefore, 918 rows are removed. In other words, there were 918 crimes which happened between 2000 and 2013 but were reported only in 2014 or later. We can check if there are still any records where the occurrenceyear is before 2014.
data3 %>%
filter(occurrenceyear < 2014) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
NEIGHBOURHOOD
First, we should see how many neighborhood in Toronto?
n_distinct(data3$Hood_ID)
## [1] 140
n_distinct(data3$Neighbourhood)
## [1] 141
Since there are only 140 neighborhoods in Toronto, There is mistake in the column Neighborhood.
Let find out this mistake by examining a few lines of the column Neighborhood.
head(data3$Neighbourhood)
## [1] "Bay Street Corridor (76)" "Newtonbrook West (36)"
## [3] "Newtonbrook West (36)" "Malvern (132)"
## [5] "Milliken (130)" "Clairlea-Birchmount (120)"
The Neighborhood column contains the name of neighborhoods as well as the id number of these neighborhoods. The mistake could get from the name or the id.
First, we can check if there is any wrong number in the parenthesis. As Toronto has 140 neighborhoods, one possible mistake can be the number 141 that was assigned to one neighborhood.
sum(str_detect(data3$Neighbourhood, "(141)"))
## [1] 0
So, there is no number 141 in this column. However, we can not continue this method by increasing the number to 142, 143, 144… One way to tackle this problem is extracting the hoodid from the column Neighborhood, name the new column as Hood_id2, and compare with Hood_ID, row by row. We will see if there is a mismatch.
data4 <- data3 %>%
mutate(Hood_ID2 = regmatches(data3$Neighbourhood, regexpr("[0-9]+",data3$Neighbourhood)))
class(data4$Hood_ID)
## [1] "numeric"
class(data4$Hood_ID2)
## [1] "character"
We need to convert Hood_ID2 into numeric
data4$Hood_ID2 <- as.numeric(data4$Hood_ID2)
Checking
class(data4$Hood_ID)
## [1] "numeric"
class(data4$Hood_ID2)
## [1] "numeric"
Now, we can find the number of rows that two columns Hood_ID and Hood_ID2 have different/same values
data4 %>%
filter(Hood_ID != Hood_ID2) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
There are no mismatch. We can conclude that there is no mistake related to the id of neighborhood. Rather, mistakes come from the name of the neighborhood.
We can go back to the data4 as we do not need the Hood_ID2 column. If there is any mistake related to the name, we suspect that the number of the neighborhood will be minor. We can count neighborhood in the data.
data4 %>%
group_by(Neighbourhood) %>%
summarise(counts = n()) %>%
arrange((counts)) %>%
head(20)
## # A tibble: 20 x 2
## Neighbourhood counts
## <chr> <int>
## 1 Eringate-Centennial_West Deane (11) 23
## 2 Lambton Baby Point (114) 190
## 3 Yonge-St.Clair (97) 234
## 4 Markland Wood (12) 292
## 5 Broadview North (57) 299
## 6 Centennial Scarborough (133) 309
## 7 Yonge-Eglinton (100) 323
## 8 Caledonia-Fairbank (109) 324
## 9 Forest Hill South (101) 358
## 10 Princess-Rosethorn (10) 362
## 11 Humewood-Cedarvale (106) 370
## 12 Guildwood (140) 400
## 13 Mount Pleasant East (99) 413
## 14 Pleasant View (46) 416
## 15 Bridle Path-Sunnybrook-York Mills (41) 418
## 16 Blake-Jones (69) 425
## 17 Woodbine-Lumsden (60) 426
## 18 Bayview Woods-Steeles (49) 427
## 19 Lawrence Park North (105) 442
## 20 Etobicoke West Mall (13) 450
We can see that the neighborhood Eringate-Centennial_West Deane (11) appears 23 times which is maybe not usual.
The mistake come from the neighborhood named Eringate-Centennial-West Deane and Eringate-Centennial_West Deane (11), it should be Eringate-Centennial-West Deane
library("stringr")
sum(grepl("^\\s*$", data4$Neighbourhood))
## [1] 0
data4$Neighbourhood <- gsub("^\\s+|\\s+$", "", data4$Neighbourhood)
data4 %>%
group_by(Neighbourhood) %>%
summarise(counts = n()) %>%
filter(Neighbourhood %in% c("Eringate-Centennial_West Deane (11)",
"Eringate-Centennial-West Deane (11)"))
## # A tibble: 2 x 2
## Neighbourhood counts
## <chr> <int>
## 1 Eringate-Centennial-West Deane (11) 673
## 2 Eringate-Centennial_West Deane (11) 23
Now, we need to replace “Eringate-Centennial_West Deane (11)” by “Eringate-Centennial-West Deane (11)”.
data4$Neighbourhood <- str_replace_all(data4$Neighbourhood, "_","-")
we can check:
data4 %>%
filter(Neighbourhood == "Eringate-Centennial_West Deane (11)")
## # A tibble: 0 x 30
## # ... with 30 variables: X <dbl>, Y <dbl>, Index_ <dbl>, event_unique_id <chr>,
## # occurrencedate <dttm>, reporteddate <dttm>, premisetype <chr>,
## # ucr_code <dbl>, ucr_ext <dbl>, offence <chr>, reportedyear <dbl>,
## # reportedmonth <chr>, reportedday <dbl>, reporteddayofyear <dbl>,
## # reporteddayofweek <chr>, reportedhour <dbl>, occurrenceyear <dbl>,
## # occurrencemonth <chr>, occurrenceday <dbl>, occurrencedayofyear <dbl>,
## # occurrencedayofweek <chr>, occurrencehour <dbl>, MCI <chr>, ...
data4 %>%
filter(Neighbourhood == "Eringate-Centennial-West Deane (11)") %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 696
Checking if the reported date is before the occurence date
sum(data4$occurrencedate < data4$reporteddate)
## [1] 130007
sum(data4$occurrencedate > data4$reporteddate)
## [1] 0
class(data4$occurrencedate)
## [1] "POSIXct" "POSIXt"
class(data4$reporteddate)
## [1] "POSIXct" "POSIXt"
data4 %>%
filter(reporteddate - occurrencedate > 0) %>%
summarise(counts =n())
## # A tibble: 1 x 1
## counts
## <int>
## 1 130007
If the occurence date happened before reported day:
data4 %>%
filter(reporteddate - occurrencedate < 0) %>%
summarise(counts =n())
## # A tibble: 1 x 1
## counts
## <int>
## 1 0
The data contains 157587 rows. There were no mistake where the reported day happened before the occurence date. One assumption is that the reported day and the occurrence date is the same
data4 %>%
filter(reporteddate - occurrencedate == 0) %>%
summarise(counts =n())
## # A tibble: 1 x 1
## counts
## <int>
## 1 36493
So there are 32774 entries where the reported day and the occurrence date are the same
data4 %>%
select(occurrencedate,reporteddate) %>%
filter(occurrencedate == reporteddate) %>%
head(10)
## # A tibble: 10 x 2
## occurrencedate reporteddate
## <dttm> <dttm>
## 1 2014-07-18 23:15:00 2014-07-18 23:15:00
## 2 2014-08-09 01:05:00 2014-08-09 01:05:00
## 3 2014-08-17 23:06:00 2014-08-17 23:06:00
## 4 2014-06-09 17:21:00 2014-06-09 17:21:00
## 5 2014-06-10 14:21:00 2014-06-10 14:21:00
## 6 2014-07-31 19:54:00 2014-07-31 19:54:00
## 7 2014-07-31 13:48:00 2014-07-31 13:48:00
## 8 2014-08-04 20:09:00 2014-08-04 20:09:00
## 9 2014-08-13 01:39:00 2014-08-13 01:39:00
## 10 2014-04-18 18:13:00 2014-04-18 18:13:00
for the future analysis. We will not need the reported date
data5 <- data4 %>%
select(event_unique_id, X, Y, # for now, we will keep X, Y
occurrencedate,
premisetype, offence, MCI,
Neighbourhood, Lat, Long,
occurrenceyear,occurrencemonth,occurrencedayofyear,occurrencedayofweek,occurrencehour)
We can save the final data
write.csv(data4,'Torontocrimes.csv')
Torontocrimes <- read_csv("Torontocrimes.csv")
## New names:
## * `` -> ...1
## Rows: 166500 Columns: 31
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (10): event_unique_id, premisetype, offence, reportedmonth, reportedday...
## dbl (19): ...1, X, Y, Index_, ucr_code, ucr_ext, reportedyear, reportedday,...
## dttm (2): occurrencedate, reporteddate
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
premisetype1 <- Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
group_by(premisetype) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
premisetype1
## # A tibble: 5 x 2
## premisetype counts
## <chr> <int>
## 1 Outside 9344
## 2 Apartment 8052
## 3 Commercial 7537
## 4 House 6142
## 5 Other 4307
There are five types of premises, including outside, apartment, commercial, house, and other.
The number of crimes that happened in apartments is higher than in houses.
ggplot(premisetype1,
aes(x= reorder(premisetype, -counts), y = counts)) +
geom_bar(stat = "identity",
width = 0.5,
fill = "darkblue") +
labs(title = "The five premise types",
caption="Source: CMI dataset",
y = "Frequency",
x = NULL) +
geom_text(aes(label=counts),
position = position_dodge(width = 0.9), vjust=-0.25) +
theme(plot.title = element_text(color="black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size=18, face="bold"),
axis.title.y = element_text(size=18, face="bold"))
B. TYPES OF OFFENCES
offencetype <- Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
select(offence, MCI) %>%
group_by(offence,MCI) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
## `summarise()` has grouped output by 'offence'. You can override using the `.groups` argument.
head(offencetype,10)
## # A tibble: 10 x 3
## # Groups: offence [10]
## offence MCI counts
## <chr> <chr> <int>
## 1 Assault Assault 12701
## 2 B&E Break and Enter 6262
## 3 Theft Of Motor Vehicle Auto Theft 4617
## 4 Assault With Weapon Assault 3007
## 5 Robbery - Mugging Robbery 1008
## 6 B&E W'Intent Break and Enter 988
## 7 Assault Bodily Harm Assault 835
## 8 Assault Peace Officer Assault 713
## 9 Theft Over Theft Over 615
## 10 Robbery With Weapon Robbery 599
There are 49 types of offenses, Assault appears to be the most popular type of offense.
Let visualize the first 15 top offences.
top_offence <- head(offencetype,15)
ggplot(top_offence,
aes(x= reorder(offence,-counts),
y = counts,
fill = MCI)) +
geom_bar(stat="identity",
width = 0.75) +
labs(title = "The top 15 types of offence ",
caption = "Source: CMI dataset",
y = "Frequency",
x = NULL) +
geom_text(aes(label = counts), size = 5, position = position_dodge(width = 0.2), hjust = -0.05) +
theme(plot.title = element_text(color="black", size=18, face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size=18, face="bold"),
axis.title.y = element_text(size=18, face="bold"),
legend.position = c(0.7, 0.5),
legend.direction = "vertical")+
coord_flip()
occur_year <- Torontocrimes %>%
group_by(occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(occurrenceyear) %>%
head(10)
occur_year
## # A tibble: 5 x 2
## occurrenceyear counts
## <dbl> <int>
## 1 2014 31628
## 2 2015 32282
## 3 2016 32751
## 4 2017 34457
## 5 2018 35382
ggplot(occur_year,
aes(x = occurrenceyear, y = counts)) +
geom_bar(stat = "identity",
width = 0.6,
fill = "cornflowerblue") +
labs(title = "Increase in crimes from 2014 to 2018",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
geom_text(aes(label = counts),
size = 5.0,
position = position_dodge(width = 0.9),
vjust = -0.15) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
The crimes (all kinds of crime) was found to increase from 29428 in 2014 to 32909 in 2018.
occur_month <- Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
group_by(occurrencemonth) %>%
summarise(counts = n())
occur_month
## # A tibble: 12 x 2
## occurrencemonth counts
## <chr> <int>
## 1 April 2705
## 2 August 3079
## 3 December 2823
## 4 February 2594
## 5 January 2963
## 6 July 2957
## 7 June 3040
## 8 March 2951
## 9 May 3168
## 10 November 3072
## 11 October 3117
## 12 September 2913
The result is sorted in alphabetical order. We need to convert the following variables into factor.
Torontocrimes$occurrencemonth <- factor(Torontocrimes$occurrencemonth,
levels = c("January","February","March",
"April","May","June",
"July","August", "September",
"October","November","December"),ordered = TRUE)
Torontocrimes$occurrencedayofweek <- factor(Torontocrimes$occurrencedayofweek,
levels = c("Monday","Tuesday",
"Wednesday","Thursday",
"Friday","Saturday","Sunday"),ordered = TRUE)
In 2018 only
occurrencemonth_tbl <- Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
group_by(occurrencemonth) %>%
summarise(counts= n())
occurrencemonth_tbl
## # A tibble: 12 x 2
## occurrencemonth counts
## <ord> <int>
## 1 January 2963
## 2 February 2594
## 3 March 2951
## 4 April 2705
## 5 May 3168
## 6 June 3040
## 7 July 2957
## 8 August 3079
## 9 September 2913
## 10 October 3117
## 11 November 3072
## 12 December 2823
Now, months is sorted in chronological order. February appears to be the month where less crimes were committed.
ggplot(occurrencemonth_tbl,
aes(x = occurrencemonth, y = counts)) +
geom_bar(stat = "identity",
width = 0.5,
fill = "cornflowerblue") +
labs(title = "Number of crimes by month (2014-2018)",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
geom_text(aes(label = counts), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
OCCRRENCEDAYOFMONTH
As we have data for 4 years, we might use the box-plot
occurrencedayofmonth_tbl <- Torontocrimes %>%
group_by(occurrenceday,occurrenceyear) %>%
count()
head(occurrencedayofmonth_tbl,10)
## # A tibble: 10 x 3
## # Groups: occurrenceday, occurrenceyear [10]
## occurrenceday occurrenceyear n
## <dbl> <dbl> <int>
## 1 1 2014 1514
## 2 1 2015 1543
## 3 1 2016 1396
## 4 1 2017 1572
## 5 1 2018 1612
## 6 2 2014 984
## 7 2 2015 1026
## 8 2 2016 1048
## 9 2 2017 1184
## 10 2 2018 1110
occurrencedayofmonth_tbl$occurrenceday <- as.factor(occurrencedayofmonth_tbl$occurrenceday)
ggplot(occurrencedayofmonth_tbl,
aes(x = occurrenceday, y = n)) +
geom_boxplot() +
labs(title = "Number of crimes by day of month (2014-2018)",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
theme(plot.title = element_text(color = "black",
size = 14,
face = "bold.italic"),
text = element_text(size = 14,face="bold"),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"))
We should remember that there are 7 months that have 31 days, and the month of February only has 28 days.
OCCURRENCEDAYOFWEEK
occurrencedayofweek_tbl <- Torontocrimes %>%
group_by(occurrencedayofweek, occurrenceyear) %>%
summarise(counts = n())
## `summarise()` has grouped output by 'occurrencedayofweek'. You can override using the `.groups` argument.
occurrencedayofweek_tbl$occurrencedayofweek <- as.factor(occurrencedayofweek_tbl$occurrencedayofweek)
occurrencedayofweek_tbl
## # A tibble: 35 x 3
## # Groups: occurrencedayofweek [7]
## occurrencedayofweek occurrenceyear counts
## <ord> <dbl> <int>
## 1 Monday 2014 4275
## 2 Monday 2015 4419
## 3 Monday 2016 4511
## 4 Monday 2017 4718
## 5 Monday 2018 5020
## 6 Tuesday 2014 4479
## 7 Tuesday 2015 4350
## 8 Tuesday 2016 4286
## 9 Tuesday 2017 4598
## 10 Tuesday 2018 4895
## # ... with 25 more rows
ggplot(occurrencedayofweek_tbl,
aes(x = occurrencedayofweek, y = counts)) +
geom_boxplot() +
labs(title = "Number of crimes by day of week (2014-2018)",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
theme(plot.title = element_text(color = "black",
size = 14,
face = "bold.italic"),
text = element_text(size = 14,face="bold"),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"))
The number of crimes occurred on Friday are the highest.
OCCURRENCEHOUR
occurrencehour_tbl <- Torontocrimes %>%
group_by(occurrencehour) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
ggplot(occurrencehour_tbl,
aes(x = occurrencehour, y = counts)) +
geom_line(color = "purple") +
geom_point (color = "blue", size = 5) +
labs(title = "Number of crimes by time of the day",
caption = "Source: CMI dataset",
x = "Time of the day",
y = "Frequency") +
theme_set(theme_classic()) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
The highest numbers of crime were found to at 12 am and 12 pm. The number of crimes appears to be small in the early morning. The number of crime increased gradually during the day and reached 10000 at 12 am.
occurrencehour_tbl2 <- Torontocrimes %>%
group_by(occurrencehour,occurrenceyear) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
## `summarise()` has grouped output by 'occurrencehour'. You can override using the `.groups` argument.
head(occurrencehour_tbl2,10)
## # A tibble: 10 x 3
## # Groups: occurrencehour [5]
## occurrencehour occurrenceyear counts
## <dbl> <dbl> <int>
## 1 0 2015 2121
## 2 0 2018 2098
## 3 0 2017 2089
## 4 0 2014 2084
## 5 0 2016 2075
## 6 12 2017 1986
## 7 22 2018 1959
## 8 18 2018 1939
## 9 21 2018 1928
## 10 21 2017 1921
occurrencehour_tbl2$occurrencehour <- as.factor(occurrencehour_tbl2$occurrencehour)
ggplot(occurrencehour_tbl2,
aes(x = occurrencehour, y = counts)) +
geom_boxplot() +
labs(title = "Number of crimes by day of week (2014-2018)",
caption = "Source: CMI dataset",
x = NULL,
y = "Frequency") +
theme_set(theme_classic()) +
theme(plot.title = element_text(color = "black",
size = 14,
face = "bold.italic"),
text = element_text(size = 14,face="bold"),
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"))
MAJOR CRIME INDICATORS (MCI)
MCI_tbl <- Torontocrimes %>%
filter(occurrenceyear ==2018) %>%
group_by(MCI) %>%
summarise(counts = n()) %>%
arrange(desc(counts))
MCI_tbl
## # A tibble: 5 x 2
## MCI counts
## <chr> <int>
## 1 Assault 18617
## 2 Break and Enter 7485
## 3 Auto Theft 4617
## 4 Robbery 3522
## 5 Theft Over 1141
All the reported crimes are classified in five types of MIC, naming Assault, Break and Enter, Robbery, and Theft Over.
MCI_data <- data.frame(
group = MCI_tbl$MCI,
value = MCI_tbl$counts)
MCI_data
## group value
## 1 Assault 18617
## 2 Break and Enter 7485
## 3 Auto Theft 4617
## 4 Robbery 3522
## 5 Theft Over 1141
ggplot(data = MCI_data,
aes(x = reorder(group,-value),
y = value)) +
geom_col(aes(fill = group) ,
show.legend = FALSE) +
coord_flip() +
geom_label(aes(label = paste(value,
floor((value/sum(value))*100),
sep = "\n"),
fill = group),
show.legend = FALSE,
size = 5,
label.padding = unit(0.25, "lines")) +
expand_limits(y = -150) +
scale_fill_brewer(palette = "Set1",
direction = -1) +
labs(title = "Five categoies of crimes (2018)",
caption = "Source: CMI dataset",
y = "Frequency",
x = NULL) +
theme(plot.title = element_text(color = "black",
size=18,
face="bold.italic"),
text = element_text(size = 18,face="bold"),
axis.title.x = element_text(size = 18, face = "bold"),
axis.title.y = element_text(size = 18, face = "bold"))
From 2014 to 2018, Assault is the most popular crime. Indeed, 53% of crimes are classified as Assault.
There are only 3% of total crimes that are Theft Over. The proportion of Robbery and Auto Theft are similar, 9% and 10 %, respectively. The break and Enter crime is twice as many as Auto Theft crime.
FURTHER UNDERSTANDING THE DATA.
Now, we would like to know, between 2014 and 2018, are there any crimes reported later than a year, a month, or a week. We can also check if the reportedyear and the occurenceyear is difference.
YEAR
later_year <- Torontocrimes %>%
filter(occurrenceyear < reportedyear) %>%
count()
later_year
## # A tibble: 1 x 1
## n
## <int>
## 1 2141
There are 2141 cases reported in the next year after the crime happened.
It should be noted that these cases could happen in December in 2014 but reported in January in 2015.
We can also detect if there are any mistake on year, such as reportedyear is earlier than occurrenceyear
Torontocrimes %>%
filter(reportedyear < occurrenceyear) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 0
There is no mistake about the year.
MONTH We want to know if there cases reported at least a month later. In this case, we should fix the year the same
Torontocrimes %>%
filter(occurrencemonth != reportedmonth,
occurrenceyear == reportedyear) %>%
group_by(occurrenceyear) %>%
count()
## # A tibble: 5 x 2
## # Groups: occurrenceyear [5]
## occurrenceyear n
## <dbl> <int>
## 1 2014 1582
## 2 2015 1416
## 3 2016 1368
## 4 2017 1522
## 5 2018 2192
The table shows the number of cases that was not reported within a month (in the same month), for each year from 2014 to 2018. In 2008, the number of crimes reported at least one month later reached 2067. I should be noted that crime may be happened in the end of month but reported few days later.
Torontocrimes %>%
filter(occurrenceday < reportedday,
occurrenceyear == reportedyear,
occurrencemonth == reportedmonth) %>%
group_by(occurrencemonth) %>%
count()
## # A tibble: 12 x 2
## # Groups: occurrencemonth [12]
## occurrencemonth n
## <ord> <int>
## 1 January 2706
## 2 February 2560
## 3 March 2939
## 4 April 2964
## 5 May 3343
## 6 June 3206
## 7 July 3139
## 8 August 3299
## 9 September 3280
## 10 October 3458
## 11 November 3412
## 12 December 2909
Each month, at least 2500 cases were reported more than a day later.
dayofyear <- Torontocrimes %>%
group_by(occurrencedayofyear) %>%
summarise(counts = n())
ggplot(dayofyear,
aes(x = occurrencedayofyear, y = counts)) +
geom_line(color = "purple") + geom_point (color = "blue", size = 2) +
labs(title = "Number of crimes though a year, from 2014 to 2018",
caption = "Source: CMI dataset",
x = "Day of year",
y = "Counts") + theme_set(theme_classic())
Visualizing timeseries.
Torontocrimes <- Torontocrimes %>%
mutate(Date = strftime(occurrencedate, format="%Y-%m-%d"))
str(Torontocrimes$Date)
## chr [1:166500] "2014-06-20" "2014-07-01" "2014-07-01" "2014-07-01" ...
Torontocrimes$Date <- as.Date(Torontocrimes$Date,"%Y-%m-%d")
we only need day column as 2014-01-01 instead of full fomat
time_series <- Torontocrimes %>%
group_by(Date) %>%
summarise(counts = n())
ggplot(time_series,
aes(Date, counts)) +
geom_point(color = "purple",
size = 3,
pch = 18) +
scale_x_date(date_breaks = "6 month") +
labs(x= "Date",
y= "Frequency",
title = "Number of crime from 2014 to 2018") +
stat_smooth(colour = "green") +
theme(plot.title = element_text(color="darkblue", size=18, face="bold.italic"),
text = element_text(size=18,face="bold"),
axis.title.x = element_text(size= 18, face="bold"),
axis.title.y = element_text(size=18, face="bold"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Where is the highest number of crimes ?
Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
group_by(Neighbourhood) %>%
summarise(Frequency = n()) %>%
arrange(desc(Frequency)) %>%
head(10)
## # A tibble: 10 x 2
## Neighbourhood Frequency
## <chr> <int>
## 1 Church-Yonge Corridor (75) 1552
## 2 Waterfront Communities-The Island (77) 1266
## 3 West Humber-Clairville (1) 1035
## 4 Moss Park (73) 914
## 5 Kensington-Chinatown (78) 815
## 6 Bay Street Corridor (76) 805
## 7 Downsview-Roding-CFB (26) 647
## 8 Annex (95) 615
## 9 Woburn (137) 607
## 10 York University Heights (27) 607
Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
group_by(Neighbourhood) %>%
summarise( counts = n()) %>%
arrange(counts) %>%
head(10)
## # A tibble: 10 x 2
## Neighbourhood counts
## <chr> <int>
## 1 Yonge-St.Clair (97) 41
## 2 Lambton Baby Point (114) 44
## 3 Broadview North (57) 50
## 4 Centennial Scarborough (133) 52
## 5 Caledonia-Fairbank (109) 56
## 6 Markland Wood (12) 56
## 7 Princess-Rosethorn (10) 62
## 8 Yonge-Eglinton (100) 68
## 9 Etobicoke West Mall (13) 76
## 10 Humewood-Cedarvale (106) 76
Torontocrimes %>%
filter(occurrenceyear == 2018) %>%
select(MCI) %>%
group_by(MCI) %>%
count()
## # A tibble: 5 x 2
## # Groups: MCI [5]
## MCI n
## <chr> <int>
## 1 Assault 18617
## 2 Auto Theft 4617
## 3 Break and Enter 7485
## 4 Robbery 3522
## 5 Theft Over 1141
library(ggTimeSeries)
## Warning: package 'ggTimeSeries' was built under R version 4.1.3
Torontocrimes %>%
select(occurrencedate) %>%
mutate(date = as.Date(occurrencedate)) %>%
group_by(date) %>%
summarise(frequency = n()) %>%
ggplot_calendar_heatmap("date", "frequency") +
xlab(NULL) +
ylab(NULL) +
facet_wrap(~Year, ncol = 1) +
scale_fill_gradient(high="red", low="yellow")+
theme(plot.title = element_text(color="darkblue", size=14, face="bold.italic"),
text = element_text(size=14,face="bold"),
axis.title.x = element_text(size=14, face="bold"),
axis.title.y = element_text(size=14, face="bold"))