#install.packages("data.table")
library(data.table)
DT <- fread(file = "/Users/divyakampalli/Desktop/DPA/finalproject/crimeanalysis.csv", header = T, sep = ",", na.strings = "")
head(DT)
## ID Case Number Date Block IUCR
## 1: 5741943 HN549294 08/25/2007 09:22:18 AM 074XX N ROGERS AVE 0560
## 2: 25953 JE240540 05/24/2021 03:06:00 PM 020XX N LARAMIE AVE 0110
## 3: 26038 JE279849 06/26/2021 09:24:00 AM 062XX N MC CORMICK RD 0110
## 4: 13279676 JG507211 11/09/2023 07:30:00 AM 019XX W BYRON ST 0620
## 5: 13274752 JG501049 11/12/2023 07:59:00 AM 086XX S COTTAGE GROVE AVE 0454
## 6: 1930689 HH109118 01/05/2002 09:24:00 PM 007XX E 103 ST 0820
## Primary Type Description
## 1: ASSAULT SIMPLE
## 2: HOMICIDE FIRST DEGREE MURDER
## 3: HOMICIDE FIRST DEGREE MURDER
## 4: BURGLARY UNLAWFUL ENTRY
## 5: BATTERY AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MINOR INJURY
## 6: THEFT $500 AND UNDER
## Location Description Arrest Domestic Beat District Ward Community Area
## 1: OTHER FALSE FALSE 2422 24 49 1
## 2: STREET TRUE FALSE 2515 25 36 19
## 3: PARKING LOT TRUE FALSE 1711 17 50 13
## 4: APARTMENT FALSE FALSE 1922 19 47 5
## 5: SMALL RETAIL STORE FALSE FALSE 632 6 6 44
## 6: GAS STATION TRUE FALSE 512 5 NA NA
## FBI Code X Coordinate Y Coordinate Year Updated On Latitude
## 1: 08A NA NA 2007 08/17/2015 03:03:40 PM NA
## 2: 01A 1141387 1913179 2021 11/18/2023 03:39:49 PM 41.91784
## 3: 01A 1152781 1941458 2021 11/18/2023 03:39:49 PM 41.99522
## 4: 05 1162518 1925906 2023 11/18/2023 03:39:49 PM 41.95235
## 5: 08B 1183071 1847869 2023 11/25/2023 03:41:03 PM 41.73775
## 6: 06 NA NA 2002 02/04/2016 06:33:39 AM NA
## Longitude Location
## 1: NA <NA>
## 2: -87.75597 (41.917838056, -87.755968972)
## 3: -87.71335 (41.995219444, -87.713354912)
## 4: -87.67798 (41.952345086, -87.677975059)
## 5: -87.60486 (41.737750767, -87.604855911)
## 6: NA <NA>
str(DT)
## Classes 'data.table' and 'data.frame': 7946816 obs. of 22 variables:
## $ ID : int 5741943 25953 26038 13279676 13274752 1930689 13203321 13210088 13210004 13210062 ...
## $ Case Number : chr "HN549294" "JE240540" "JE279849" "JG507211" ...
## $ Date : chr "08/25/2007 09:22:18 AM" "05/24/2021 03:06:00 PM" "06/26/2021 09:24:00 AM" "11/09/2023 07:30:00 AM" ...
## $ Block : chr "074XX N ROGERS AVE" "020XX N LARAMIE AVE" "062XX N MC CORMICK RD" "019XX W BYRON ST" ...
## $ IUCR : chr "0560" "0110" "0110" "0620" ...
## $ Primary Type : chr "ASSAULT" "HOMICIDE" "HOMICIDE" "BURGLARY" ...
## $ Description : chr "SIMPLE" "FIRST DEGREE MURDER" "FIRST DEGREE MURDER" "UNLAWFUL ENTRY" ...
## $ Location Description: chr "OTHER" "STREET" "PARKING LOT" "APARTMENT" ...
## $ Arrest : logi FALSE TRUE TRUE FALSE FALSE TRUE ...
## $ Domestic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Beat : int 2422 2515 1711 1922 632 512 122 1225 333 1732 ...
## $ District : int 24 25 17 19 6 5 1 12 3 17 ...
## $ Ward : int 49 36 50 47 6 NA 42 27 7 30 ...
## $ Community Area : int 1 19 13 5 44 NA 32 28 43 21 ...
## $ FBI Code : chr "08A" "01A" "01A" "05" ...
## $ X Coordinate : int NA 1141387 1152781 1162518 1183071 NA 1174694 1160870 1190812 1151117 ...
## $ Y Coordinate : int NA 1913179 1941458 1925906 1847869 NA 1901831 1898642 1856743 1922554 ...
## $ Year : int 2007 2021 2021 2023 2023 2002 2023 2023 2023 2023 ...
## $ Updated On : chr "08/17/2015 03:03:40 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" ...
## $ Latitude : num NA 41.9 42 42 41.7 ...
## $ Longitude : num NA -87.8 -87.7 -87.7 -87.6 ...
## $ Location : chr NA "(41.917838056, -87.755968972)" "(41.995219444, -87.713354912)" "(41.952345086, -87.677975059)" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Data exploration using summary
summary(DT)
## ID Case Number Date Block
## Min. : 634 Length:7946816 Length:7946816 Length:7946816
## 1st Qu.: 3852320 Class :character Class :character Class :character
## Median : 7149126 Mode :character Mode :character Mode :character
## Mean : 7150988
## 3rd Qu.:10335352
## Max. :13292996
##
## IUCR Primary Type Description Location Description
## Length:7946816 Length:7946816 Length:7946816 Length:7946816
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Arrest Domestic Beat District Ward
## Mode :logical Mode :logical Min. : 111 Min. : 1.0 Min. : 1.0
## FALSE:5892780 FALSE:6581484 1st Qu.: 621 1st Qu.: 6.0 1st Qu.:10.0
## TRUE :2054036 TRUE :1365332 Median :1034 Median :10.0 Median :23.0
## Mean :1185 Mean :11.3 Mean :22.8
## 3rd Qu.:1731 3rd Qu.:17.0 3rd Qu.:34.0
## Max. :2535 Max. :31.0 Max. :50.0
## NA's :47 NA's :614854
## Community Area FBI Code X Coordinate Y Coordinate
## Min. : 0.0 Length:7946816 Min. : 0 Min. : 0
## 1st Qu.:23.0 Class :character 1st Qu.:1152998 1st Qu.:1859105
## Median :32.0 Mode :character Median :1166138 Median :1890771
## Mean :37.5 Mean :1164616 Mean :1885821
## 3rd Qu.:57.0 3rd Qu.:1176389 3rd Qu.:1909321
## Max. :77.0 Max. :1205119 Max. :1951622
## NA's :613478 NA's :87614 NA's :87614
## Year Updated On Latitude Longitude
## Min. :2001 Length:7946816 Min. :36.62 Min. :-91.69
## 1st Qu.:2005 Class :character 1st Qu.:41.77 1st Qu.:-87.71
## Median :2009 Mode :character Median :41.86 Median :-87.67
## Mean :2010 Mean :41.84 Mean :-87.67
## 3rd Qu.:2015 3rd Qu.:41.91 3rd Qu.:-87.63
## Max. :2023 Max. :42.02 Max. :-87.52
## NA's :87614 NA's :87614
## Location
## Length:7946816
## Class :character
## Mode :character
##
##
##
##
# Extract 5 past years' data
dttest <- DT[Year > 2018]
# Rename some variables
setnames(dttest, c("Case Number", "Primary Type", "Location Description", "Community Area"), c("Case", "Type", "Locdescrip", "Community"))
# Test whether there exist duplicates
any(duplicated(dttest[["Case"]]))
## [1] TRUE
# Remove duplicates according to Case Number
dttest <- dttest[!duplicated(dttest[["Case"]])]
# Test again to assure that there is no more duplicates
any(duplicated(dttest[["Case"]]))
## [1] FALSE
# Test whether there exist missing values
any(is.na(dttest))
## [1] TRUE
# Find where are the missing values
colSums(is.na(dttest))
## ID Case Date Block IUCR Type
## 0 0 0 0 0 0
## Description Locdescrip Arrest Domestic Beat District
## 0 6051 0 0 0 0
## Ward Community FBI Code X Coordinate Y Coordinate Year
## 48 2 0 16907 16907 0
## Updated On Latitude Longitude Location
## 0 16907 16907 16907
#Try to replace NAs with similar values
dttest$`Latitude` <- na.omit(dttest$`Latitude`)[match(dttest$`X Coordinate`, na.omit(dttest$`X Coordinate`))]
colSums(is.na(dttest))
## ID Case Date Block IUCR Type
## 0 0 0 0 0 0
## Description Locdescrip Arrest Domestic Beat District
## 0 6051 0 0 0 0
## Ward Community FBI Code X Coordinate Y Coordinate Year
## 48 2 0 16907 16907 0
## Updated On Latitude Longitude Location
## 0 16907 16907 16907
# Remove NA in latitude, longitude, location
dttest <- dttest[!is.na(dttest[["Latitude"]])]
# We remark that we do not need to repeat the function for each columns, after removing duplicates in the column "Latitude", all missing values concerning the location description disappear since these three columns are relevant.
colSums(is.na(dttest))
## ID Case Date Block IUCR Type
## 0 0 0 0 0 0
## Description Locdescrip Arrest Domestic Beat District
## 0 4513 0 0 0 0
## Ward Community FBI Code X Coordinate Y Coordinate Year
## 47 1 0 0 0 0
## Updated On Latitude Longitude Location
## 0 0 0 0
# Remove NA in Case Number
dttest <- dttest[!is.na(dttest[["Case"]])]
colSums(is.na(dttest))
## ID Case Date Block IUCR Type
## 0 0 0 0 0 0
## Description Locdescrip Arrest Domestic Beat District
## 0 4513 0 0 0 0
## Ward Community FBI Code X Coordinate Y Coordinate Year
## 47 1 0 0 0 0
## Updated On Latitude Longitude Location
## 0 0 0 0
# Replace NAs for Location Description using records in Location
dttest$`Locdescrip` <- na.omit(dttest$`Locdescrip`)[match(dttest$`Location`, na.omit(dttest$`Location`))]
# Replace NAs for District using records in Beat
dttest$`District` <- na.omit(dttest$`District`)[match(dttest$`Beat`, na.omit(dttest$`Beat`))]
# Replace NAs for Ward using records in Location
dttest$`Ward` <- na.omit(dttest$`Ward`)[match(dttest$`Location`, na.omit(dttest$`Location`))]
# Replace NAs for Community Area using records in Location
dttest$`Community` <- na.omit(dttest$`Community`)[match(dttest$`Location`, na.omit(dttest$`Location`))]
colSums(is.na(dttest))
## ID Case Date Block IUCR Type
## 0 0 0 0 0 0
## Description Locdescrip Arrest Domestic Beat District
## 0 637 0 0 0 0
## Ward Community FBI Code X Coordinate Y Coordinate Year
## 5 1 0 0 0 0
## Updated On Latitude Longitude Location
## 0 0 0 0
# Remove the observations containing NAs that cannot be replaced in the column Locdescrip
dttest <- dttest[!is.na(dttest[["Locdescrip"]])]
# Test again to make sure that there is no more missing values
any(is.na(dttest))
## [1] FALSE
# Using boxplot.stats for numeric column Year
boxplot.stats(dttest$Year)$out
## integer(0)
# How many districts in our dataset?
length(unique(dttest[["District"]]))
## [1] 22
# What are the codes of districts?
table(dttest[["District"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 14
## 57356 53489 57288 65487 49891 72754 55644 70805 49563 51028 72007 61636 38451
## 15 16 17 18 19 20 22 24 25
## 42143 40576 31902 57244 54053 22159 37021 37759 58782
Highest number of crimes are in District 8
# How many community areas in our dataset?
length(unique(dttest[["Community"]]))
## [1] 77
# What are the codes of community areas?
table(dttest[["Community"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 18660 15983 17694 8474 5275 26294 16758 51818 1087 4851 4176 1973 3757
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 9835 12616 10923 6222 2532 19001 6076 8830 19511 33106 30946 63537 23829
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 19967 42053 36892 18467 11267 39542 10533 4779 13412 3577 3474 16155 7765
## 40 41 42 43 44 45 46 47 48 49 50 51 52
## 11257 9210 16225 41896 29853 5634 19785 1709 6342 28362 4851 8303 4852
## 53 54 55 56 57 58 59 60 61 62 63 64 65
## 17427 5943 2673 8032 3815 9291 3697 6162 18743 3734 9065 4130 6834
## 66 67 68 69 70 71 72 73 74 75 76 77
## 24192 26677 25579 30992 9085 32409 3776 13465 2106 8279 8036 12970
Highest number of crimes are in 25th community
# Remove data having 0 as community code
dttest <- dttest[which(Community != 0),]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df <- data.frame(DT)
crimes <- df %>%
select(c(Date, Primary.Type)) %>%
mutate(Primary.Type = as.factor(Primary.Type),
Date = mdy_hms(Date),
Date = floor_date(Date, unit = "hours")) %>% #takes a date-time object and rounds it down to hours unit
arrange(Date)
library(ggplot2)
crimes %>%
count(Primary.Type, sort = T) %>%
head(5) %>%
ggplot(aes(x = n, y = reorder(Primary.Type, n))) +
geom_col()+
labs(title = 'Top 5 Crime in Chicago',
x = 'Number of Crime',
y = 'Crime')
From the chart above, theft has the highest number of crimes among
others. Therefore, in this analysis we will try to do a time series
prediction analysis for the theft case.
range(crimes$Date)
## [1] "2001-01-01 UTC" "2023-11-23 UTC"
For this analysis, we will limit the data analysis to only the number of theft crimes from the beginning of 2018 to the end of 2021.
theft <- crimes %>%
filter(Primary.Type == 'THEFT') %>%
group_by(Date) %>%
summarise(Theft = n()) %>%
filter(Date > '2018-01-01' & Date < '2022-01-02')
head(theft, 5)
## # A tibble: 5 × 2
## Date Theft
## <dttm> <int>
## 1 2018-01-01 07:00:00 4
## 2 2018-01-01 08:00:00 3
## 3 2018-01-01 09:00:00 10
## 4 2018-01-01 10:00:00 2
## 5 2018-01-01 11:00:00 9
tail(theft, 5)
## # A tibble: 5 × 2
## Date Theft
## <dttm> <int>
## 1 2022-01-01 23:00:00 3
## 2 2022-01-02 00:00:00 3
## 3 2022-01-02 02:00:00 1
## 4 2022-01-02 03:00:00 1
## 5 2022-01-02 04:00:00 3
length(theft$Date)
## [1] 33654
theft <- theft %>%
slice(-c(33625:33641))
range(theft$Date)
## [1] "2018-01-01 07:00:00 UTC" "2022-01-02 04:00:00 UTC"
Data Pre-Processing Data provisions that must be treated in the Time Series, among others:
-no missing intervals -no missing values -data should be ordered by time
colSums(is.na(theft))
## Date Theft
## 0 0
library(padr)
theft <- theft %>%
pad(start_val = ymd_hms("2018-01-01 00:00:00"), end_val = ymd_hms("2021-12-31 23:00:00")) %>%
replace(., is.na(.), 0)
## pad applied on the interval: hour
range(theft$Date)
## [1] "2018-01-01 00:00:00 UTC" "2021-12-31 23:00:00 UTC"
theft_ts <- ts(data = theft$Theft,
start = min(theft$Date),
frequency = 24) # daily seasonality
In this section, I tried to explore whether our timeseries object has trend and seasonal properties (one-seasonal/multiseasonal).
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# decompose ts object
theft_deco <- decompose(theft_ts)
autoplot(theft_deco)
Based on the plot, the trend still shows a pattern of ups and downs which indicates a multiseasonal time series object. Therefore, we should try reanalyze this data through multiseasonal time series using another seasonal frequency.
First Trial
theft$Theft %>%
msts(seasonal.periods = c(24,24*7)) %>% # multiseasonal ts (daily, weekly)
mstl() %>% # multiseasonal ts decomposition
autoplot()
Second trail
theft$Theft %>%
msts(seasonal.periods = c(24, 24*7, 24*7*4)) %>% # multiseasonal ts (daily, weekly, monthly)
mstl() %>% # multiseasonal ts decomposition
autoplot()
Third trial
theft$Theft %>%
msts(seasonal.periods = c(24, 24*7, 24*7*4, 24*7*4*12)) %>% # multiseasonal ts (daily, weekly, monthly, annualy)
mstl() %>% # multiseasonal ts decomposition
autoplot()
The last ts object showed the best decomposition among the 3 trials and therefore will be used for the time series model building. Additionally, note that the data is an additive time series and this is a valuable information for the model building later.
library(tseries)
# assign final ts object
theft_msts <- theft$Theft %>%
msts(seasonal.periods = c(24, 24*7, 24*7*4, 24*7*4*12))
# check for stationary
adf.test(theft_msts)
## Warning in adf.test(theft_msts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: theft_msts
## Dickey-Fuller = -15.14, Lag order = 32, p-value = 0.01
## alternative hypothesis: stationary
Based on Augmented Dickey-Fuller Test (adf.test) result, the p-value is < alpha and therefore the data is already stationary. Therefore, for a model building using SARIMA, we do not need to perform differencing on the data first.
Cross validation After finishing our time frame, we will continue to conduct the cross validation activities. we will split our data into two types of data. Train data as a data that will be trained in to our model, and test data as a data that provide an unbiased evaluation of a model fit on the training data set.
We will split our train data into two type of data set named train and test. Our test data will be a set of data consist of last 365 days of reported crime from our data frame.
length(theft$Date)
## [1] 35064
The cross-validation scheme for time series should not be sampled randomly, but splitted sequentially.
theft_train <- theft_msts %>% head(length(theft_msts) - 24*7*4*12)
theft_test <- theft_msts %>% tail(24*7*4*12)
head(theft_msts)
## Multi-Seasonal Time Series:
## Start: 1 1
## Seasonal Periods: 24 168 672 8064
## Data:
## [1] 0 0 0 0 0 0
Seasonality Analysis
# Decompose MSTS Object
theft_multi_dec <- theft_msts %>%
mstl()
theft_multi_dec %>%
tail(24*7*4*12) %>%
autoplot()
# Create a data frame based on MSTS Object
df_theft_multi <- as.data.frame(theft_multi_dec)
glimpse(df_theft_multi)
## Rows: 35,064
## Columns: 7
## $ Data <dbl> 0, 0, 0, 0, 0, 0, 0, 4, 3, 10, 2, 9, 14, 2, 3, 7, 9, 7, 2…
## $ Trend <dbl> 7.022148, 7.022269, 7.022390, 7.022511, 7.022632, 7.02275…
## $ Seasonal24 <dbl> -0.6647175, -2.8706026, -4.1476981, -4.0685433, -4.314429…
## $ Seasonal168 <dbl> -1.23743281, -0.13659220, -0.12242789, -1.25920591, 0.237…
## $ Seasonal672 <dbl> -1.18058087, -0.55985752, -0.24198083, 0.48740682, 0.2836…
## $ Seasonal8064 <dbl> -1.38916336, -0.42363040, -0.08377644, -0.18719778, 0.424…
## $ Remainder <dbl> -2.5502531, -3.0315859, -2.4265063, -1.9949704, -3.654255…
Hourly Seasonality These are the plot of Hourly Seasonality of Battery Crime in Chicago
df_theft_multi %>%
mutate(day = theft$Date) %>%
group_by(day) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
head(24*7) %>%
ggplot(aes(x = day, y = seasonal)) +
geom_point(col = "maroon") +
geom_line(col = "blue") +
theme_minimal()
Daily Seasonality
df_theft_multi %>%
mutate(day = wday(theft$Date, label = T)) %>%
group_by(day) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
ggplot(aes(x = day, y = seasonal)) +
geom_col() +
theme_minimal()
As we can see from the plot above, The theft tend to increase in the Wednesday, Thursday, Friday and Saturday.
Monthly Seasonality
df_theft_multi %>%
mutate(month = month(theft$Date, label = T)) %>%
group_by(month) %>%
summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
ggplot(aes(x = month, y = seasonal)) +
geom_col() +
theme_minimal()
Based on the monthly plot, the theft tend to increase from June until October.
Model Building For model building, I will compare between two of the widely used time series modeling in business and industry: ETS Holt-Winters and Seasonal Arima. I use ETS Holt-Winters because my data contain both seasonal and trend. I also want to compare it between seasonal Arima to check whether seasonal ARIMA can give better forecasting performance.
# ets Holt-Winters
theft_ets <- stlm(theft_train, method = "ets") # no log transformation for additive data
# ARIMA
theft_arima <- stlm(theft_train, method = "arima")
Forecast & Evaluation
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# forecast
theft_ets_f <- forecast(theft_ets, h = 24*7*4*12)
theft_arima_f <- forecast(theft_arima, h = 24*7*4*12)
# visualization
a <- autoplot(theft_ets_f, series = "ETS", fcol = "red") +
autolayer(theft_msts, series = "Actual", color = "black") +
labs(subtitle = "Number of Theft Case at Chicago, from Jan 2018 - Dec 2021",
y = "Theft Frequency") +
theme_minimal()
b <- autoplot(theft_arima_f, series = "ARIMA", fcol = "blue") +
autolayer(theft_msts, series = "Actual", color = "black") +
labs(subtitle = "Number of Theft Case at Chicago, from Jan 2018 - Dec 2021",
y = "Theft Frequency") +
theme_minimal()
grid.arrange(a,b)
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
# Calculate MAE using the mae function
mae_ets <- mae(theft_ets_f$mean, theft_test)
# Create a data frame with MAE values for ETS and ARIMA models
mae_comparison <- data.frame(ETS = mae_ets, ARIMA = mae(theft_arima_f$mean, theft_test))
# Print the MAE values
print(mae_comparison)
## ETS ARIMA
## 1 2.333739 2.419786
From the analysis above, we can conclude that we have sucsesfully forecast theft frequency and found ARIMA as the best model with the lowest error (MAE 2.419%, although not very different from ETS model).
Assumption Check Normality: Anderson-Darling Test H0 : residuals are normally distributed H1 : residuals are not normally distributed
library(tseries)
# Assuming theft_arima_f$residuals is your time series residuals
result <- adf.test(theft_arima_f$residuals)
## Warning in adf.test(theft_arima_f$residuals): p-value smaller than printed
## p-value
# Print the test results
print(result)
##
## Augmented Dickey-Fuller Test
##
## data: theft_arima_f$residuals
## Dickey-Fuller = -30.449, Lag order = 29, p-value = 0.01
## alternative hypothesis: stationary
hist(theft_arima_f$residuals, breaks = 20)
Autocorrelation: Box.test - Ljng-Box H0 : No autocorrelation in the forecast errors H1 : there is an autocorrelation in the forecast errors
Box.test(theft_arima_f$residuals, type = "Ljung-Box") # there is not enough data to reject H0
##
## Box-Ljung test
##
## data: theft_arima_f$residuals
## X-squared = 0.0051992, df = 1, p-value = 0.9425
Based on the assumption check, there is no autocorrelation on our forecast residuals (p-value > 0.05). Still, our forecast’s residuals are not distributed normally, therefore it’s residuals may not be appeared around its mean as seen in the histogram.
In a time series, such errors might emerge from various unpredictable events and is actually quite unavoidable. One strategy to overcome it is to analyze what kinds of unpredictable events that might occur and occurs frequently. This can be done by time series analysis using seasonality adjustment.
Conclusion Based on the analysis and our model performance,we have sucsesfully forecast theft frequency and found ARIMA as the best model with the lowest error (MAE 2.318%, although not very different from ETS model). Our model able to predict the theft crime rate in the city of Chicago with Measure of Error (MAE) around 2.318%. Still, it can be optimized thoroughly with several way as our model does not fulfill the Normality assumptions.
As for the seasonal analysis, it is safe to say that the theft crime is likely to increase around 10 am, it reaches a peak value in 5 pm (after office hours) and tend to increase towards 12 o’clock in the morning. The crime itself is more likely to happen from June until October.
# Remove the useless columns
dttest <- dttest[, !c("ID", "IUCR", "Description", "FBI Code", "Block", "Ward", "X Coordinate", "Y Coordinate", "Updated On")]
# Transform the variable Date from string to date object
#install.packages("lubridate")
library(lubridate)
dttest[["Date"]] <- parse_date_time(dttest[["Date"]], orders = "mdY IMSp")
#conflicts_prefer(data.table::hour)
# Create four time intervals
tint <- c("0", "5.9", "11.9", "17.9", "23.9")
# Extract hours
hours <- hour(dttest[["Date"]])
# Matching
dttest[["Tint"]] <- cut(hours, breaks = tint, labels = c("0-5H", "6-11H", "12-17H", "18-24H"), include.lowest = T)
# Create the column Day showing the weekday when the incident occurred
dttest[["Day"]] <- wday(dttest[["Date"]], label = T)
# Create the column Month showing the month when the incident occurred
dttest[["Month"]] <- month(dttest[["Date"]], label = T)
# Extract quarters
quarters <- quarter(dttest$Date)
# Create four season intervals
sint <- c("0.9", "1.9", "2.9", "3.9", "4.9")
# Matching
dttest[["Season"]] <- cut(quarters, breaks = sint, labels = c("SPRING", "SUMMER", "FALL", "WINTER"))
# Summary of all types
table(dttest[["Type"]])
##
## ARSON ASSAULT
## 2355 100021
## BATTERY BURGLARY
## 211515 39060
## CONCEALED CARRY LICENSE VIOLATION CRIM SEXUAL ASSAULT
## 898 983
## CRIMINAL DAMAGE CRIMINAL SEXUAL ASSAULT
## 130012 6057
## CRIMINAL TRESPASS DECEPTIVE PRACTICE
## 22738 80358
## GAMBLING HOMICIDE
## 203 3264
## HUMAN TRAFFICKING INTERFERENCE WITH PUBLIC OFFICER
## 49 3426
## INTIMIDATION KIDNAPPING
## 834 620
## LIQUOR LAW VIOLATION MOTOR VEHICLE THEFT
## 922 77007
## NARCOTICS NON-CRIMINAL
## 34928 17
## OBSCENITY OFFENSE INVOLVING CHILDREN
## 230 9319
## OTHER NARCOTIC VIOLATION OTHER OFFENSE
## 23 70746
## PROSTITUTION PUBLIC INDECENCY
## 1537 33
## PUBLIC PEACE VIOLATION RITUALISM
## 4866 1
## ROBBERY SEX OFFENSE
## 42401 5556
## STALKING THEFT
## 1663 245131
## WEAPONS VIOLATION
## 40265
# Number of types
length(unique(dttest[["Type"]]))
## [1] 33
# Regroup some "small" types
dttest[["Type"]] <- ifelse(dttest[["Type"]] %in% c("CRIMINAL DAMAGE"), "DAMAGE",
ifelse(dttest[["Type"]] %in% c("DECEPTIVE PRACTICE"), "DECEIVE",
ifelse(dttest[["Type"]] %in% c("KIDNAPPING", "OFFENSE INVOLVING CHILDREN", "HUMAN TRAFFICKING"), "HUMANCHILD",
ifelse(dttest[["Type"]] %in% c("NARCOTICS", "OTHER NARCOTIC VIOLATION"), "NARCOTICS",
ifelse(dttest[["Type"]] %in% c("MOTOR VEHICLE THEFT"), "MOTO",
ifelse(dttest[["Type"]] %in% c("OTHER OFFENSE"), "OTHER",
ifelse(dttest[["Type"]] %in% c("CRIM SEXUAL ASSAULT", "PROSTITUTION", "SEX OFFENSE"), "SEX",
ifelse(dttest[["Type"]] %in% c("GAMBLING", "INTERFERENCE WITH PUBLIC OFFICER", "INTIMIDATION", "LIQUOR LAW VIOLATION", "OBSCENITY", "PUBLIC INDECENCY", "PUBLIC PEACE VIOLATION", "STALKING", "NON-CRIMINAL", "NON-CRIMINAL (SUBJECT SPECIFIED)", "NON - CRIMINAL"), "SOCIETY",
ifelse(dttest[["Type"]] %in% c("CRIMINAL TRESPASS"), "TRESPASS",
ifelse(dttest[["Type"]] %in% c("CONCEALED CARRY LICENSE VIOLATION", "WEAPONS VIOLATION"), "WEAPONS", dttest[["Type"]]))))))))))
dttest[["Locdescrip"]] <- ifelse(dttest[["Locdescrip"]] %in% c("VEHICLE-COMMERCIAL", "VEHICLE - DELIVERY TRUCK", "VEHICLE - OTHER RIDE SERVICE", "VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)", "VEHICLE NON-COMMERCIAL", "TRAILER", "TRUCK", "DELIVERY TRUCK", "TAXICAB", "OTHER COMMERCIAL TRANSPORTATION"), "VEHICLE",
ifelse(dttest[["Locdescrip"]] %in% c("BAR OR TAVERN", "TAVERN", "TAVERN/LIQUOR STORE"), "TAVERN",
ifelse(dttest[["Locdescrip"]] %in% c("SCHOOL YARD", "SCHOOL, PRIVATE, BUILDING", "SCHOOL, PRIVATE, GROUNDS", "SCHOOL, PUBLIC, BUILDING", "SCHOOL, PUBLIC, GROUNDS", "COLLEGE/UNIVERSITY GROUNDS", "COLLEGE/UNIVERSITY RESIDENCE HALL"), "SCHOOL",
ifelse(dttest[["Locdescrip"]] %in% c("RESIDENCE", "RESIDENCE-GARAGE", "RESIDENCE PORCH/HALLWAY", "RESIDENTIAL YARD (FRONT/BACK)", "DRIVEWAY - RESIDENTIAL", "GARAGE", "HOUSE", "PORCH", "YARD"), "RESIDENCE",
ifelse(dttest[["Locdescrip"]] %in% c("PARKING LOT", "PARKING LOT/GARAGE(NON.RESID.)", "POLICE FACILITY/VEH PARKING LOT"), "PARKING",
ifelse(dttest[["Locdescrip"]] %in% c("OTHER", "OTHER RAILROAD PROP / TRAIN DEPOT", "ABANDONED BUILDING", "ANIMAL HOSPITAL", "ATHLETIC CLUB", "BASEMENT", "BOAT/WATERCRAFT", "CHURCH", "CHURCH/SYNAGOGUE/PLACE OF WORSHIP", "COIN OPERATED MACHINE", "CONSTRUCTION SITE", "SEWER", "STAIRWELL", "VACANT LOT", "VACANT LOT/LAND", "VESTIBULE", "WOODED AREA", "FARM", "FACTORY", "FACTORY/MANUFACTURING BUILDING", "FEDERAL BUILDING", "FIRE STATION", "FOREST PRESERVE", "GOVERNMENT BUILDING", "GOVERNMENT BUILDING/PROPERTY", "JAIL / LOCK-UP FACILITY", "LIBRARY", "MOVIE HOUSE/THEATER", "POOL ROOM", "SPORTS ARENA/STADIUM", "WAREHOUSE", "AUTO", "AUTO / BOAT / RV DEALERSHIP", "CEMETARY"), "OTHERS",
ifelse(dttest[["Locdescrip"]] %in% c("COMMERCIAL / BUSINESS OFFICE"), "BIGBUSINESS",
ifelse(dttest[["Locdescrip"]] %in% c("PARK PROPERTY"), "PARK",
ifelse(dttest[["Locdescrip"]] %in% c("ATM (AUTOMATIC TELLER MACHINE)", "BANK", "CREDIT UNION", "CURRENCY EXCHANGE", "SAVINGS AND LOAN"), "BANK",
ifelse(dttest[["Locdescrip"]] %in% c("HOTEL", "HOTEL/MOTEL"), "HOTEL",
ifelse(dttest[["Locdescrip"]] %in% c("HOSPITAL", "HOSPITAL BUILDING/GROUNDS", "DAY CARE CENTER", "NURSING HOME", "NURSING HOME/RETIREMENT HOME", "MEDICAL/DENTAL OFFICE"), "HEALTH",
ifelse(dttest[["Locdescrip"]] %in% c("ALLEY", "BOWLING ALLEY"), "ALLEY",
ifelse(dttest[["Locdescrip"]] %in% c("CHA APARTMENT", "CHA HALLWAY/STAIRWELL/ELEVATOR", "CHA PARKING LOT", "CHA PARKING LOT/GROUNDS"), "CHA",
ifelse(dttest[["Locdescrip"]] %in% c("CTA BUS", "CTA BUS STOP", "CTA GARAGE / OTHER PROPERTY", "CTA PLATFORM", "CTA STATION", "CTA TRACKS - RIGHT OF WAY", "CTA TRAIN", "CTA \"\"L\"\" TRAIN"), "CTA",
ifelse(dttest[["Locdescrip"]] %in% c("AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA", "AIRPORT BUILDING NON-TERMINAL - SECURE AREA", "AIRPORT EXTERIOR - NON-SECURE AREA", "AIRPORT EXTERIOR - SECURE AREA", "AIRPORT PARKING LOT", "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA", "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA", "AIRPORT TRANSPORTATION SYSTEM (ATS)", "AIRPORT VENDING ESTABLISHMENT", "AIRPORT/AIRCRAFT", "AIRCRAFT"), "AIRPORT",
ifelse(dttest[["Locdescrip"]] %in% c("APPLIANCE STORE", "BARBERSHOP", "CAR WASH", "CLEANING STORE", "CONVENIENCE STORE", "DEPARTMENT STORE", "DRUG STORE", "GARAGE/AUTO REPAIR", "GAS STATION", "GAS STATION DRIVE/PROP.", "GROCERY FOOD STORE", "NEWSSTAND", "OFFICE", "PAWN SHOP", "RETAIL STORE", "SMALL RETAIL STORE"), "STORE",
ifelse(dttest[["Locdescrip"]] %in% c("BRIDGE", "DRIVEWAY", "GANGWAY", "HIGHWAY/EXPRESSWAY", "LAKEFRONT/WATERFRONT/RIVERBANK", "SIDEWALK", "STREET", "HALLWAY"), "STREET",
dttest[["Locdescrip"]])))))))))))))))))
# Set dttest as data.frame
dttest <- as.data.frame(dttest)
# Reorder columns
dttest <- dttest[c("Case", "Date", "Year", "Month", "Day", "Season", "Tint", "Type", "Arrest", "Domestic", "Locdescrip", "Beat", "District", "Community", "Latitude", "Longitude", "Location")]
# Normalize variables
dttest[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")] <- lapply(dttest[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")], as.factor)
library(dplyr)
glimpse(dttest)
## Rows: 1,137,038
## Columns: 17
## $ Case <chr> "JE240540", "JE279849", "JG507211", "JG501049", "JG415333",…
## $ Date <dttm> 2021-05-24 15:06:00, 2021-06-26 09:24:00, 2023-11-09 07:30…
## $ Year <int> 2021, 2021, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,…
## $ Month <ord> May, Jun, Nov, Nov, Sep, Aug, Jul, Aug, Sep, Aug, Jul, Sep,…
## $ Day <ord> Mon, Sat, Thu, Sun, Wed, Thu, Mon, Sun, Mon, Tue, Mon, Sun,…
## $ Season <fct> SUMMER, SUMMER, WINTER, WINTER, FALL, FALL, FALL, FALL, FAL…
## $ Tint <fct> 12-17H, 6-11H, 6-11H, 6-11H, 12-17H, 12-17H, 18-24H, 6-11H,…
## $ Type <fct> HOMICIDE, HOMICIDE, BURGLARY, BATTERY, DAMAGE, DECEIVE, CRI…
## $ Arrest <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Domestic <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ Locdescrip <fct> "STREET", "PARKING", "APARTMENT", "STORE", "PARKING LOT / G…
## $ Beat <fct> 2515, 1711, 1922, 632, 122, 1225, 333, 1732, 822, 835, 731,…
## $ District <fct> 25, 17, 19, 6, 1, 12, 3, 17, 8, 8, 7, 22, 7, 9, 11, 11, 25,…
## $ Community <fct> 19, 13, 5, 44, 32, 28, 43, 21, 63, 70, 69, 73, 67, 60, 27, …
## $ Latitude <dbl> 41.91784, 41.99522, 41.95235, 41.73775, 41.88602, 41.87757,…
## $ Longitude <dbl> -87.75597, -87.71335, -87.67798, -87.60486, -87.63394, -87.…
## $ Location <chr> "(41.917838056, -87.755968972)", "(41.995219444, -87.713354…
# Do not use scientific notation
options(scipen=200)
# Detach plyr if it's loaded and not required
if ("package:plyr" %in% search()) {
detach("package:plyr", unload=TRUE)
}
# Load libraries
library(dplyr)
library(ggplot2)
# Your analysis code
dttest %>%
dplyr::group_by(Year) %>%
dplyr::summarise(Count = n()) %>%
ggplot(aes(x = Year, y = Count)) +
geom_line(colour = "red") +
geom_point(colour = "red") +
geom_bar(aes(x = Year, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Year", y = "Number of Crimes", title = "Evolution of Number of Crimes") +
geom_text(aes(x = Year, y = Count, label = Count), size = 3, vjust = -1, position = position_dodge(0.9)) +
theme_minimal() +
theme(axis.title.x=element_blank(), axis.title.y=element_blank())
The number of cases decreased during lockdown but increased in 2022.
# By time intervals
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:forecast':
##
## gghistogram
p1 <- dttest %>%
group_by(Tint) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Tint, y = Count)) +
geom_bar(aes(x = Tint, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Time intervals", y = "Number of crimes", title = "Evolution by time intervals") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
print(p1)
# By weekdays
# Assuming Day is a character vector representing weekdays
p2 <- dttest %>%
group_by(Day) %>%
summarise(Count = n()) %>%
ggplot(aes(x = factor(Day, level = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")), y = Count)) +
geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
labs(x = "Weekdays", y = "Number of crimes", title = "Evolution by weekdays") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
theme(axis.title.x = element_blank()) +
theme(axis.title.y = element_blank())
print(p2)
# Assuming Month is a character vector representing months
p3 <- dttest %>%
group_by(Month) %>%
summarise(Count = n()) %>%
ggplot(aes(x = factor(Month, level = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")), y = Count)) +
geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
labs(x = "Months", y = "Number of crimes", title = "Evolution by months") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
theme(axis.title.x = element_blank()) +
theme(axis.title.y = element_blank())
print(p3)
# By seasons
p4 <- dttest %>%
group_by(Season) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Season, y = Count)) +
geom_bar(aes(x = factor(Season, level = c("SPRING", "SUMMER", "FALL", "WINTER")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Seasons", y = "Number of crimes", title = "Evolution by seasons") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# Combine plots into one plot
print(p4)
# Find top 5 most frequent places
top5 <- dttest %>%
group_by(Locdescrip) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
head(5) # Select the top 5 places
# Find bottom 5 most frequent places
bottom5 <- dttest %>%
group_by(Locdescrip) %>%
summarise(Count = n()) %>%
arrange(Count) %>%
head(5) # Select the bottom 5 places
# Create a plot for the top 5 places
p_top5 <- top5 %>%
ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
labs(x = "Places", y = "Number of crimes", title = "Top 5 most frequent places") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
theme(axis.title.x = element_blank()) +
theme(axis.title.y = element_blank())
# Create a plot for the bottom 5 places
p_bottom5 <- bottom5 %>%
ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
labs(x = "Places", y = "Number of crimes", title = "Bottom 5 most frequent places") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
theme(axis.title.x = element_blank()) +
theme(axis.title.y = element_blank())
# Print top 5 places with counts
print(top5)
## # A tibble: 5 × 2
## Locdescrip Count
## <fct> <int>
## 1 STREET 367187
## 2 APARTMENT 192348
## 3 RESIDENCE 187694
## 4 STORE 94047
## 5 OTHERS 29915
# Print bottom 5 places with counts
print(bottom5)
## # A tibble: 5 × 2
## Locdescrip Count
## <fct> <int>
## 1 HORSE STABLE 1
## 2 PUBLIC GRAMMAR SCHOOL 1
## 3 RAILROAD PROPERTY 1
## 4 VEHICLE - COMMERCIAL: TROLLEY BUS 1
## 5 CLUB 2
# Print and view the top 5 plot
print(p_top5)
# Print and view the bottom 5 plot
print(p_bottom5)
# Import and plot the shape file
library(ggplot2)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(dplyr)
# Load the shapefile (adjust the path as per your system)
# Replace the path with the actual path to your shapefile
shapefile_path <- "/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp"
# Read the shapefile
mapcomu <- st_read(shapefile_path)
## Reading layer `geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6' from data source `/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## Geodetic CRS: WGS84(DD)
names(mapcomu)
## [1] "area" "area_num_1" "area_numbe" "comarea" "comarea_id"
## [6] "community" "perimeter" "shape_area" "shape_len" "geometry"
library(sf)
library(ggplot2)
library(dplyr)
library(data.table)
library(stringr)
# Read the shapefile using sf
#mapcomu <- st_read("path_to_your_shapefile.shp")
# Extract number of crimes for each community
# Make sure 'Community' in dttest corresponds to 'area_numbe' in mapcomu
temp <- dttest %>%
group_by(Community) %>%
summarise(Count = n())
# Merge two data frames
# Replace 'area_numbe' with the correct column name from mapcomu if different
temp2df <- left_join(st_as_sf(mapcomu), temp, by = c("area_numbe" = "Community"))
# Basic plot using sf object directly
locplot <- ggplot(data = temp2df) +
geom_sf(aes(fill = Count), color = "black", size = 0.25) +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Number of crimes per community") +
theme_void() +
theme(legend.position = "bottom")
# Rest of your code for plotting police stations and histogram...
# Import the police station
dfpolice <- fread(file = "/Users/divyakampalli/Downloads/Police_Stations_-_Map.csv", header = T, sep = ",", na.strings = "")
# Extract police stations' locations
dfpolice$LOCATION <- gsub("[(*)]", "", dfpolice$LOCATION)
policeloc <-str_split_fixed(dfpolice$LOCATION, ", ", 2)
policeloc <- as.data.frame(policeloc)
colnames(policeloc) <- c("lat", "long")
policeloc$lat <- as.numeric(as.character(policeloc$lat))
policeloc$long <- as.numeric(as.character(policeloc$long))
policeloc$id <- dfpolice$DISTRICT
# Plot police stations (by using black triangles) on the map
locplot <- locplot +
geom_point(data = policeloc, aes(x = long, y = lat), size = 1, shape = 24, fill = "black")
# Plot histogramme
tempplot <- dttest %>%
group_by(Community) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Community, y = Count)) +
geom_bar(aes(x = reorder(Community, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Community number", y = "Number of crimes", title = "Evolution by community areas") +
theme_minimal() +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
locplot
tempplot
# Types and number of crimes
p1 <- dttest %>%
group_by(Type) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Type, y = Count)) +
geom_bar(aes(x = reorder(Type, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
coord_flip() +
labs(x = "Number of crimes", y = "Type", title = "Evolution of number of crimes for different types") +
theme_minimal() +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# Evolution over years
p2 <- dttest %>%
group_by(Year, Type) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Year, y = Count, fill = Type)) +
geom_area() +
labs(x = "Years", y = "Number of crimes", title = "Evolution of crime types over years")
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
library(ggplot2)
# Assuming dttest is your data.table with columns Year, Type
# Get unique crime types
crime_types <- unique(dttest$Type)
# Create a list to store individual plots
plots_list <- list()
# Loop through each crime type and create a plot
for (crime_type in crime_types) {
plot_data <- dttest %>%
filter(Type == crime_type) %>%
group_by(Year) %>%
summarise(Count = n())
# Create a plot for the current crime type
current_plot <- ggplot(plot_data, aes(x = Year, y = Count)) +
geom_smooth(method = "lm") +
geom_point() +
labs(x = "Years", y = "Number of crimes", title = paste("Evolution of", crime_type, "over years"))
# Add the plot to the list
plots_list[[crime_type]] <- current_plot
}
# Print and view each plot
for (crime_type in crime_types) {
print(plots_list[[crime_type]])
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Transform the type
dttest[, c("Month", "Day", "Season", "Tint")] <- lapply(dttest[, c("Month", "Day", "Season", "Tint")], as.character)
# By time intervals
p1 <- dttest %>%
group_by(Type, Tint) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Tint, y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Time intervals", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by time intervals") +
theme_bw() +
theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
print(p1)
# By weekdays
p2 <- dttest %>%
group_by(Type, Day) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Day, y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Weekdays", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by weekdays") +
theme_bw() +
theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
print(p2)
# By months
p3 <- dttest %>%
group_by(Type, Month) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Month, y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Months", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by months") +
theme_bw() +
theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
print(p3)
# By seasons
p4 <- dttest %>%
group_by(Type, Season) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Season, y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Seasons", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by seasons") +
theme_bw() +
theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
print(p4)
# Find top10 most frequent places
top10P <- head(names(sort(table(dttest$Locdescrip), decreasing = TRUE)), 10)
# Find top10 most frequent crime types
top10T <- head(names(sort(table(dttest$Type), decreasing = TRUE)), 10)
# Plot
filter(dttest, Locdescrip %in% top10P) %>%
filter(Type %in% top10T) %>%
group_by(Type, Locdescrip) %>%
summarise(Count = n()) %>%
ggplot(aes(x = reorder(Locdescrip, Count), y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Places", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by places") +
theme_bw() +
theme(
panel.grid.major = element_line(colour = NA),
panel.grid.minor = element_line(colour = NA),
axis.text.x = element_text(angle = 45, vjust = 0.1, hjust = 0.1) # Diagonal X-axis labels
)
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
# Find top10 most dangerous community areas
top10C <- head(names((sort(table(dttest$Community), decreasing = TRUE))), 10)
# Plot
filter(dttest, Type %in% top10T) %>%
filter(Community %in% top10C) %>%
group_by(Type, Community) %>%
summarise(Count = n()) %>%
ggplot(aes(x = reorder(Community, Count), y = reorder(Type, Count))) +
geom_tile(aes(fill = Count)) +
scale_x_discrete("Community areas", expand = c(0, 0), position = "top") +
scale_y_discrete("Crime types", expand = c(0, -2)) +
scale_fill_gradient("Number of crimes", low = "white", high = "red") +
ggtitle("Evolution by areas") +
theme_bw() +
theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
# Numbers
dttest %>%
filter(Domestic == T) %>%
group_by(Year) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Year, y = Count)) +
geom_bar(aes(x = Year, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Number of crimes", y = "Year", title = "Evolution of number of domestic crimes in different years") +
theme_minimal() +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# By time intervals
p1 <- dttest %>%
filter(Domestic == T) %>%
group_by(Tint) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Tint, y = Count)) +
geom_bar(aes(x = Tint, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Time intervals", y = "Number of domestic crimes", title = "Evolution by time intervals") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# By weekdays
p2 <- dttest %>%
filter(Domestic == T) %>%
group_by(Day) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Day, y = Count)) +
geom_bar(aes(x = factor(Day, level = c("lun\\.", "mar\\.", "mer\\.", "jeu\\.", "ven\\.", "sam\\.", "dim\\.")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Weekdays", y = "Number of domestic crimes", title = "Evolution by weekdays") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# By months
p3 <- dttest %>%
filter(Domestic == T) %>%
group_by(Month) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Month, y = Count)) +
geom_bar(aes(x = factor(Month, level = c("janv\\.", "févr\\.", "mars", "avr\\.", "mai", "juin", "juil\\.", "août", "sept\\.", "oct\\.", "nov\\.", "déc\\.")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Months", y = "Number of domestic crimes", title = "Evolution by months") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# By seasons
p4 <- dttest %>%
filter(Domestic == T) %>%
group_by(Season) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Season, y = Count)) +
geom_bar(aes(x = factor(Season, level = c("SPRING", "SUMMER", "FALL", "WINTER")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Seasons", y = "Number of domestic crimes", title = "Evolution by seasons") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# Locations
dttest %>%
filter(Domestic == T) %>%
group_by(Locdescrip) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Locdescrip, y = Count)) +
geom_bar(aes(x = reorder(Locdescrip, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
labs(x = "Places", y = "Number of crimes", title = "Evolution by places") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# Extract data
temp <- dttest %>%
filter(Arrest == T) %>%
group_by(Year) %>%
summarise(Count = n())
# Compute the crime rates
temp$rate <- lapply(temp$Count, function(x) x / nrow(dttest))
temp$rate <- as.numeric(temp$rate)
# Plot
ggplot(temp, aes(x = Year, y = rate)) +
geom_line() +
theme_minimal() +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank())
# Find top10 most dangerous community areas
top10C <- head(names((sort(table(dttest$Community), decreasing = TRUE))), 10)
# Iterate through each community area
for (community_area in top10C) {
# Filter data for the current community area
community_data <- filter(dttest, Community == community_area)
# Create a plot for the current community area
plot <- community_data %>%
group_by(Year) %>%
summarise(Count = n()) %>%
ggplot(aes(x = Year, y = Count)) +
geom_smooth(method = "lm") +
geom_point() +
labs(x = "Years", y = "Number of crimes", title = paste("Evolution of crimes in", community_area, "over years"))
# Display the plot
print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Extract data
temp <- dttest %>%
filter(Arrest == TRUE, Community %in% top10C) %>%
group_by(Year, Community) %>%
summarise(Count = n())
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Compute the crime rates
temp$rate <- temp$Count / nrow(dttest)
# Iterate through each community area
for (community_area in unique(temp$Community)) {
# Filter data for the current community area
community_data <- filter(temp, Community == community_area)
# Create a plot for the current community area
plot <- ggplot(community_data, aes(x = Year, y = rate)) +
geom_line() +
labs(x = "Years", y = "Crime rates", title = paste("Evolution of arrested crime rates in", community_area, "over years"))
# Display the plot
print(plot)
}
# Extract data
temp <- filter(dttest, Arrest == T) %>%
group_by(Year, Type) %>%
summarise(Count = n())
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
# Compute the crime rates
temp$rate <- lapply(temp$Count, function(x) x / nrow(dttest))
temp$rate <- as.numeric(temp$rate)
# Plot
ggplot(temp, aes(x = Year, y = rate, colour = Type)) +
geom_line()