Chicago Crime Data Analysis

#Loading Necessary Libraries

library(data.table)
library(ggplot2)
library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(ggpubr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(Metrics)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(forecast)

## 
## Attaching package: 'forecast'

## The following object is masked from 'package:Metrics':
## 
##     accuracy

## The following object is masked from 'package:ggpubr':
## 
##     gghistogram

library(padr)

#Data Preprocessing

Import data

Dataset <- fread(file = "/Users/divyakampalli/Desktop/DPA/finalproject/crimeanalysis.csv", header = T, sep = ",", na.strings = "")

Top 5 rows of the Dataset

head(Dataset)

Structure of the Dataset

str(Dataset)

## Classes 'data.table' and 'data.frame':   7946816 obs. of  22 variables:
##  $ ID                  : int  5741943 25953 26038 13279676 13274752 1930689 13203321 13210088 13210004 13210062 ...
##  $ Case Number         : chr  "HN549294" "JE240540" "JE279849" "JG507211" ...
##  $ Date                : chr  "08/25/2007 09:22:18 AM" "05/24/2021 03:06:00 PM" "06/26/2021 09:24:00 AM" "11/09/2023 07:30:00 AM" ...
##  $ Block               : chr  "074XX N ROGERS AVE" "020XX N LARAMIE AVE" "062XX N MC CORMICK RD" "019XX W BYRON ST" ...
##  $ IUCR                : chr  "0560" "0110" "0110" "0620" ...
##  $ Primary Type        : chr  "ASSAULT" "HOMICIDE" "HOMICIDE" "BURGLARY" ...
##  $ Description         : chr  "SIMPLE" "FIRST DEGREE MURDER" "FIRST DEGREE MURDER" "UNLAWFUL ENTRY" ...
##  $ Location Description: chr  "OTHER" "STREET" "PARKING LOT" "APARTMENT" ...
##  $ Arrest              : logi  FALSE TRUE TRUE FALSE FALSE TRUE ...
##  $ Domestic            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Beat                : int  2422 2515 1711 1922 632 512 122 1225 333 1732 ...
##  $ District            : int  24 25 17 19 6 5 1 12 3 17 ...
##  $ Ward                : int  49 36 50 47 6 NA 42 27 7 30 ...
##  $ Community Area      : int  1 19 13 5 44 NA 32 28 43 21 ...
##  $ FBI Code            : chr  "08A" "01A" "01A" "05" ...
##  $ X Coordinate        : int  NA 1141387 1152781 1162518 1183071 NA 1174694 1160870 1190812 1151117 ...
##  $ Y Coordinate        : int  NA 1913179 1941458 1925906 1847869 NA 1901831 1898642 1856743 1922554 ...
##  $ Year                : int  2007 2021 2021 2023 2023 2002 2023 2023 2023 2023 ...
##  $ Updated On          : chr  "08/17/2015 03:03:40 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" ...
##  $ Latitude            : num  NA 41.9 42 42 41.7 ...
##  $ Longitude           : num  NA -87.8 -87.7 -87.7 -87.6 ...
##  $ Location            : chr  NA "(41.917838056, -87.755968972)" "(41.995219444, -87.713354912)" "(41.952345086, -87.677975059)" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Summary of Dataset

summary(Dataset)

##        ID           Case Number            Date              Block          
##  Min.   :     634   Length:7946816     Length:7946816     Length:7946816    
##  1st Qu.: 3852320   Class :character   Class :character   Class :character  
##  Median : 7149126   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 7150988                                                           
##  3rd Qu.:10335352                                                           
##  Max.   :13292996                                                           
##                                                                             
##      IUCR           Primary Type       Description        Location Description
##  Length:7946816     Length:7946816     Length:7946816     Length:7946816      
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##    Arrest         Domestic            Beat         District         Ward       
##  Mode :logical   Mode :logical   Min.   : 111   Min.   : 1.0   Min.   : 1.0    
##  FALSE:5892780   FALSE:6581484   1st Qu.: 621   1st Qu.: 6.0   1st Qu.:10.0    
##  TRUE :2054036   TRUE :1365332   Median :1034   Median :10.0   Median :23.0    
##                                  Mean   :1185   Mean   :11.3   Mean   :22.8    
##                                  3rd Qu.:1731   3rd Qu.:17.0   3rd Qu.:34.0    
##                                  Max.   :2535   Max.   :31.0   Max.   :50.0    
##                                                 NA's   :47     NA's   :614854  
##  Community Area     FBI Code          X Coordinate      Y Coordinate    
##  Min.   : 0.0     Length:7946816     Min.   :      0   Min.   :      0  
##  1st Qu.:23.0     Class :character   1st Qu.:1152998   1st Qu.:1859105  
##  Median :32.0     Mode  :character   Median :1166138   Median :1890771  
##  Mean   :37.5                        Mean   :1164616   Mean   :1885821  
##  3rd Qu.:57.0                        3rd Qu.:1176389   3rd Qu.:1909321  
##  Max.   :77.0                        Max.   :1205119   Max.   :1951622  
##  NA's   :613478                      NA's   :87614     NA's   :87614    
##       Year       Updated On           Latitude       Longitude     
##  Min.   :2001   Length:7946816     Min.   :36.62   Min.   :-91.69  
##  1st Qu.:2005   Class :character   1st Qu.:41.77   1st Qu.:-87.71  
##  Median :2009   Mode  :character   Median :41.86   Median :-87.67  
##  Mean   :2010                      Mean   :41.84   Mean   :-87.67  
##  3rd Qu.:2015                      3rd Qu.:41.91   3rd Qu.:-87.63  
##  Max.   :2023                      Max.   :42.02   Max.   :-87.52  
##                                    NA's   :87614   NA's   :87614   
##    Location        
##  Length:7946816    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

#Data Cleaning and Preprocessing

Extracting 5 past years’ data

TestDT <- Dataset[Year > 2018]

Renaming some of the variables

setnames(TestDT, c("Case Number", "Primary Type", "Location Description", "Community Area"), c("Case", "Type", "Locdescrip", "Community"))

Checking if there are any Duplicates

any(duplicated(TestDT[["Case"]]))

## [1] TRUE

Removing any duplicates in Case Number and testing again to check if there are any duplicates.

TestDT <- TestDT[!duplicated(TestDT[["Case"]])]
any(duplicated(TestDT[["Case"]]))

## [1] FALSE

Testing for missing values

any(is.na(TestDT))

## [1] TRUE

# Finding the missing values in each coloumn.
colSums(is.na(TestDT))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         6051            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           48            2            0        16907        16907            0 
##   Updated On     Latitude    Longitude     Location 
##            0        16907        16907        16907

 #Replacing NAs with similar values
TestDT$`Latitude` <- na.omit(TestDT$`Latitude`)[match(TestDT$`X Coordinate`, na.omit(TestDT$`X Coordinate`))]
colSums(is.na(TestDT))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         6051            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           48            2            0        16907        16907            0 
##   Updated On     Latitude    Longitude     Location 
##            0        16907        16907        16907

# Removing NA in latitude, longitude, location
TestDT <- TestDT[!is.na(TestDT[["Latitude"]])]

We note that since these three columns are significant, we do not need to repeat the function for each column after eliminating duplicates in the “Latitude” column. As a result, all missing values pertaining to the location description disappear.

colSums(is.na(TestDT))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         4513            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           47            1            0            0            0            0 
##   Updated On     Latitude    Longitude     Location 
##            0            0            0            0

# Removing NA in Case Number
TestDT <- TestDT[!is.na(TestDT[["Case"]])]
colSums(is.na(TestDT))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         4513            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           47            1            0            0            0            0 
##   Updated On     Latitude    Longitude     Location 
##            0            0            0            0

# Replacing NAs for Location Description using  Location records 
TestDT$`Locdescrip` <- na.omit(TestDT$`Locdescrip`)[match(TestDT$`Location`, na.omit(TestDT$`Location`))]

# Replacing NAs for District using records in Beat
TestDT$`District` <- na.omit(TestDT$`District`)[match(TestDT$`Beat`, na.omit(TestDT$`Beat`))]

# Replacing NAs for Ward using  Location records 
TestDT$`Ward` <- na.omit(TestDT$`Ward`)[match(TestDT$`Location`, na.omit(TestDT$`Location`))]

# Replacing NAs for Community Area using  Location records 
TestDT$`Community` <- na.omit(TestDT$`Community`)[match(TestDT$`Location`, na.omit(TestDT$`Location`))]

colSums(is.na(TestDT))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0          637            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##            5            1            0            0            0            0 
##   Updated On     Latitude    Longitude     Location 
##            0            0            0            0

# Removing the observations containing NAs that cannot be replaced in the column Locdescrip
TestDT <- TestDT[!is.na(TestDT[["Locdescrip"]])]

# Testing again to make sure that there is no more missing values
any(is.na(TestDT))

## [1] FALSE

# Using boxplot.stats for numeric column Year
boxplot.stats(TestDT$Year)$out

## integer(0)

# Number of districts in our dataset 
length(unique(TestDT[["District"]]))

## [1] 22

# What are the codes of districts? 
table(TestDT[["District"]])

## 
##     1     2     3     4     5     6     7     8     9    10    11    12    14 
## 57356 53489 57288 65487 49891 72754 55644 70805 49563 51028 72007 61636 38451 
##    15    16    17    18    19    20    22    24    25 
## 42143 40576 31902 57244 54053 22159 37021 37759 58782

Highest number of crimes are in District 8

# How many community areas in our dataset? 
length(unique(TestDT[["Community"]]))

## [1] 77

# What are the codes of community areas? 
table(TestDT[["Community"]])

## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 18660 15983 17694  8474  5275 26294 16758 51818  1087  4851  4176  1973  3757 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
##  9835 12616 10923  6222  2532 19001  6076  8830 19511 33106 30946 63537 23829 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## 19967 42053 36892 18467 11267 39542 10533  4779 13412  3577  3474 16155  7765 
##    40    41    42    43    44    45    46    47    48    49    50    51    52 
## 11257  9210 16225 41896 29853  5634 19785  1709  6342 28362  4851  8303  4852 
##    53    54    55    56    57    58    59    60    61    62    63    64    65 
## 17427  5943  2673  8032  3815  9291  3697  6162 18743  3734  9065  4130  6834 
##    66    67    68    69    70    71    72    73    74    75    76    77 
## 24192 26677 25579 30992  9085 32409  3776 13465  2106  8279  8036 12970

Highest number of crimes are in 25th community

# Removing data having 0 as community code
TestDT <- TestDT[which(Community != 0),]

df <- data.frame(Dataset)
crimes <- df %>% 
  select(c(Date, Primary.Type)) %>% 
  mutate(Primary.Type = as.factor(Primary.Type),
         Date = mdy_hms(Date), 
         Date = floor_date(Date, unit = "hours")) %>% #takes a date-time object and rounds it down to hours unit
  arrange(Date)

crimes %>% 
  count(Primary.Type, sort = T) %>% 
  head(5) %>% 
  ggplot(aes(x = n, y = reorder(Primary.Type, n))) +
  geom_col()+
  labs(title = 'Top 5 Crimes in Chicago', 
       x = 'Number of Crimes', 
       y = 'Crimes')

From the chart above, theft_crime has the highest number of crimes among others. Therefore, in this analysis we will try to do a time series prediction analysis for the theft_crime case.

range(crimes$Date)

## [1] "2001-01-01 UTC" "2023-11-23 UTC"

For this analysis, we will limit the data analysis to only the number of theft_crime crimes from the beginning of 2018 to the end of 2022.

theft_crime <- crimes %>% 
  filter(Primary.Type == 'THEFT') %>% 
  group_by(Date) %>% 
  summarise(Theft = n()) %>% 
  filter(Date > '2018-01-01' & Date < '2022-12-31')

head(theft_crime, 5)

tail(theft_crime, 5)

length(theft_crime$Date)

## [1] 42152

theft_crime <- theft_crime %>%
  slice(-c(33625:33641))

range(theft_crime$Date)

## [1] "2018-01-01 07:00:00 UTC" "2022-12-31 05:00:00 UTC"

Pre-processing of Data Among the data requirements that the Time Series must handle are the following:

no intervals missing -no values that are missing -The order of the data should be by time

colSums(is.na(theft_crime))

##  Date Theft 
##     0     0

theft_crime <- theft_crime %>% 
  pad(start_val = ymd_hms("2018-01-01 00:00:00"), end_val = ymd_hms("2021-12-31 23:00:00")) %>% 
  replace(., is.na(.), 0)

## pad applied on the interval: hour

range(theft_crime$Date)

## [1] "2018-01-01 00:00:00 UTC" "2021-12-31 23:00:00 UTC"

theft_crime_ts <- ts(data = theft_crime$Theft,
   start = min(theft_crime$Date),
   frequency = 24) # daily seasonality

We have attempted to investigate whether our timeseries object has trend and seasonal characteristics in this section (one-seasonal/multiseasonal).

# decompose ts object
theft_deco <- decompose(theft_crime_ts)
autoplot(theft_deco)

Plotting the trend reveals that there is still an up-and-down pattern, indicating that the time series object is multiseasonal. Consequently, we ought to attempt reanalyzing this data using multiseasonal time series with a different seasonal frequency.

Trial - 1

theft_crime$Theft %>% 
  msts(seasonal.periods = c(24,24*7)) %>% # multiseasonal ts (daily, weekly)
  mstl() %>% # multiseasonal ts decomposition
  autoplot()

Trail - 2

theft_crime$Theft %>% 
  msts(seasonal.periods = c(24, 24*7, 24*7*4)) %>% # multiseasonal ts (daily, weekly, monthly)
  mstl() %>% # multiseasonal ts decomposition
  autoplot()

Trial - 3

theft_crime$Theft %>% 
  msts(seasonal.periods = c(24, 24*7, 24*7*4, 24*7*4*12)) %>% # multiseasonal ts (daily, weekly, monthly, annualy)
  mstl() %>% # multiseasonal ts decomposition
  autoplot()

The time series model building will employ the final ts object since it demonstrated the best decomposition out of the three trials. Furthermore take note of the fact that the data is an additive time series; this is important information for developing the model at a later stage.

# assign final ts object
theft_msts <- theft_crime$Theft %>% 
  msts(seasonal.periods = c(24, 24*7, 24*7*4, 24*7*4*12))

# check for stationary
adf.test(theft_msts)

## Warning in adf.test(theft_msts): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  theft_msts
## Dickey-Fuller = -15.14, Lag order = 32, p-value = 0.01
## alternative hypothesis: stationary

The data is already stationary because the p-value is less than alpha, according to the results of the Augmented Dickey-Fuller Testing (adf.test). Therefore, we do not need to perform data differencing before building a model using SARIMA.

Verification by cross-checking We will carry out the cross-validation tasks once our allotted time has elapsed. We are going to divide our data into two categories. Test data is data that offers an objective assessment of a model fit on the training data set, whereas train data is data that will be fed into our model.

We are going to divide our train data into two distinct types of data sets: train and test. The last 365 days’ worth of reported crimes from our data frame will make up our test data set.

length(theft_crime$Date)

## [1] 35064

Sequential splitting rather than random sampling is the recommended approach for the cross-validation of time series.

theft_train <- theft_msts %>% head(length(theft_msts) - 24*7*4*12)
theft_test <- theft_msts %>% tail(24*7*4*12)
head(theft_msts)

## Multi-Seasonal Time Series:
## Start: 1 1
## Seasonal Periods: 24 168 672 8064
## Data:
## [1] 0 0 0 0 0 0

Seasonality Analysis

# Decompose MSTS Object
theft_multi_dec <- theft_msts %>%
  mstl()

theft_multi_dec %>%
  tail(24*7*4*12) %>%
  autoplot()

# Create a data frame based on MSTS Object

df_theft_multi <- as.data.frame(theft_multi_dec)

glimpse(df_theft_multi)

## Rows: 35,064
## Columns: 7
## $ Data         <dbl> 0, 0, 0, 0, 0, 0, 0, 4, 3, 10, 2, 9, 14, 2, 3, 7, 9, 7, 2…
## $ Trend        <dbl> 7.022148, 7.022269, 7.022390, 7.022511, 7.022632, 7.02275…
## $ Seasonal24   <dbl> -0.6647175, -2.8706026, -4.1476981, -4.0685433, -4.314429…
## $ Seasonal168  <dbl> -1.23743281, -0.13659220, -0.12242789, -1.25920591, 0.237…
## $ Seasonal672  <dbl> -1.18058087, -0.55985752, -0.24198083, 0.48740682, 0.2836…
## $ Seasonal8064 <dbl> -1.38916336, -0.42363040, -0.08377644, -0.18719778, 0.424…
## $ Remainder    <dbl> -2.5502531, -3.0315859, -2.4265063, -1.9949704, -3.654255…

Seasonality by Hour The Hourly Seasonality of Battery Crime in Chicago follows this plot.

df_theft_multi %>%
  mutate(day = theft_crime$Date) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
  head(24*7) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_point(col = "maroon") + 
  geom_line(col = "blue") +
  theme_minimal()

Daily Seasonality

df_theft_multi %>%
  mutate(day = wday(theft_crime$Date, label = T)) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_col() +
  theme_minimal()

The plot above shows that theft crimes tend to rise on Wednesdays, Thursdays, Fridays, and Saturdays.

Monthly Seasonality

df_theft_multi %>%
  mutate(month = month(theft_crime$Date, label = T)) %>%
  group_by(month) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal672 + Seasonal8064)) %>%
  ggplot(aes(x = month, y = seasonal)) +
  geom_col() +
  theme_minimal()

The monthly plot indicates that theft_crime tends to rise from June through October.

Model Building ETS Holt-Winters and Seasonal Arima are two popular time series modeling techniques in business and industry that I will compare when building my model. Since my data includes both seasonal and trend information, I utilize ETS Holt-Winters. In order to determine whether seasonal ARIMA can provide better forecasting performance, I also want to compare it to seasonal Arima.

# ets Holt-Winters
theft_crime_ets <- stlm(theft_train, method = "ets") # no log transformation for additive data

# ARIMA
theft_crime_arima <- stlm(theft_train, method = "arima")

Forecast & Evaluation

# forecast
theft_ets_f <- forecast(theft_crime_ets, h = 24*7*4*12)
theft_arima_f <- forecast(theft_crime_arima, h = 24*7*4*12)
# visualization

a <- autoplot(theft_ets_f, series = "ETS", fcol = "red") +
  autolayer(theft_msts, series = "Actual", color = "black") + 
  labs(subtitle = "Number of Theft Case at Chicago, from Jan 2018 - Dec 2021",
       y = "Theft Frequency") +
  theme_minimal()

b <- autoplot(theft_arima_f, series = "ARIMA", fcol = "blue") +
  autolayer(theft_msts, series = "Actual", color = "black") +
  labs(subtitle = "Number of Theft Case at Chicago, from Jan 2018 - Dec 2021",
       y = "Theft Frequency") +
  theme_minimal()

grid.arrange(a,b)

# Calculate MAE using the mae function
mae_ets <- mae(theft_ets_f$mean, theft_test)

# Create a data frame with MAE values for ETS and ARIMA models
mae_comparison <- data.frame(ETS = mae_ets, ARIMA = mae(theft_arima_f$mean, theft_test))

# Print the MAE values
print(mae_comparison)

##        ETS    ARIMA
## 1 2.333739 2.419786

Based on the above analysis, we can say that we have successfully predicted the frequency of theft crimes and that ETS is the best model with the lowest error (MAE 2.33%, which is not too different from the ETS model).

Assumption Verification Normality: Testing Anderson-Darling H0: The residual distribution is normal H1: The residual distribution is not normal

# Assuming theft_arima_f$residuals is your time series residuals
result <- adf.test(theft_arima_f$residuals)

## Warning in adf.test(theft_arima_f$residuals): p-value smaller than printed
## p-value

# Print the test results
print(result)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  theft_arima_f$residuals
## Dickey-Fuller = -30.449, Lag order = 29, p-value = 0.01
## alternative hypothesis: stationary

hist(theft_arima_f$residuals, breaks = 20)

Autocorrelation: Box.test - Ljng-Box H0: The forecast errors show no autocorrelation. H1: The forecast errors exhibit autocorrelation

Box.test(theft_arima_f$residuals, type = "Ljung-Box") # there is not enough data to reject H0

## 
##  Box-Ljung test
## 
## data:  theft_arima_f$residuals
## X-squared = 0.0051992, df = 1, p-value = 0.9425

Our forecast residuals show no autocorrelation based on the assumption check (p-value > 0.05). However, as our forecast’s residuals are not normally distributed, they might not appear in the histogram’s vicinity of the mean.

These kinds of errors in a time series are actually quite inevitable and can arise from a variety of unforeseen events. Analyzing the kinds of unpredictable events that could happen and do so frequently is one way to get past it. The use of seasonality adjustment in time series analysis can accomplish this.

In summary We have successfully predicted the frequency of theft crimes based on our analysis and model performance, and the ARIMA model has proven to be the most accurate with the lowest error (MAE 2.419786%), albeit it is not significantly different from the ETS model. Our model was able to predict the Chicago theft crime rate with a Measure of Error (MAE) of approximately 2.419786%. Even so, there are a number of ways to thoroughly optimize it because our model does not meet the normalcy assumptions.

Based on the seasonal analysis, it is reasonable to conclude that theft crime will probably start to rise at 10 a.m., peak at 5 p.m. (after business hours), and then continue to rise until 12 a.m. The actual crime is more likely to occur between June and October.

Similarly, we can do Time series analysis for each crime.

TestDT <- TestDT[, !c("ID", "IUCR", "Description", "FBI Code", "Block", "Ward", "X Coordinate", "Y Coordinate", "Updated On")]

TestDT[["Date"]] <- parse_date_time(TestDT[["Date"]], orders = "mdY IMSp")

Create four time intervals and Extract hours

tint <- c("0", "5.9", "11.9", "17.9", "23.9")

hours <- hour(TestDT[["Date"]])

TestDT[["Tint"]] <- cut(hours, breaks = tint, labels = c("0-5H", "6-11H", "12-17H", "18-24H"), include.lowest = T)

# Create the column Day showing the weekday when the incident occurred
TestDT[["Day"]] <- wday(TestDT[["Date"]], label = T)

# Create the column Month showing the month when the incident occurred
TestDT[["Month"]] <- month(TestDT[["Date"]], label = T)

# Extract quarters
quarters <- quarter(TestDT$Date)

# Create four season intervals
sint <- c("0.9", "1.9", "2.9", "3.9", "4.9")

# Matching
TestDT[["Season"]] <- cut(quarters, breaks = sint, labels = c("SPRING", "SUMMER", "FALL", "WINTER"))

# Summary of all types
table(TestDT[["Type"]])

## 
##                             ARSON                           ASSAULT 
##                              2355                            100021 
##                           BATTERY                          BURGLARY 
##                            211515                             39060 
## CONCEALED CARRY LICENSE VIOLATION               CRIM SEXUAL ASSAULT 
##                               898                               983 
##                   CRIMINAL DAMAGE           CRIMINAL SEXUAL ASSAULT 
##                            130012                              6057 
##                 CRIMINAL TRESPASS                DECEPTIVE PRACTICE 
##                             22738                             80358 
##                          GAMBLING                          HOMICIDE 
##                               203                              3264 
##                 HUMAN TRAFFICKING  INTERFERENCE WITH PUBLIC OFFICER 
##                                49                              3426 
##                      INTIMIDATION                        KIDNAPPING 
##                               834                               620 
##              LIQUOR LAW VIOLATION               MOTOR VEHICLE THEFT 
##                               922                             77007 
##                         NARCOTICS                      NON-CRIMINAL 
##                             34928                                17 
##                         OBSCENITY        OFFENSE INVOLVING CHILDREN 
##                               230                              9319 
##          OTHER NARCOTIC VIOLATION                     OTHER OFFENSE 
##                                23                             70746 
##                      PROSTITUTION                  PUBLIC INDECENCY 
##                              1537                                33 
##            PUBLIC PEACE VIOLATION                         RITUALISM 
##                              4866                                 1 
##                           ROBBERY                       SEX OFFENSE 
##                             42401                              5556 
##                          STALKING                             THEFT 
##                              1663                            245131 
##                 WEAPONS VIOLATION 
##                             40265

# Number of types
length(unique(TestDT[["Type"]]))

## [1] 33

# Regroup some "small" types
TestDT[["Type"]] <- ifelse(TestDT[["Type"]] %in% c("CRIMINAL DAMAGE"), "DAMAGE", 
                   ifelse(TestDT[["Type"]] %in% c("DECEPTIVE PRACTICE"), "DECEIVE",
                   ifelse(TestDT[["Type"]] %in% c("KIDNAPPING", "OFFENSE INVOLVING CHILDREN", "HUMAN TRAFFICKING"), "HUMANCHILD",
                   ifelse(TestDT[["Type"]] %in% c("NARCOTICS", "OTHER NARCOTIC VIOLATION"), "NARCOTICS", 
                   ifelse(TestDT[["Type"]] %in% c("MOTOR VEHICLE THEFT"), "MOTO", 
                   ifelse(TestDT[["Type"]] %in% c("OTHER OFFENSE"), "OTHER", 
                   ifelse(TestDT[["Type"]] %in% c("CRIM SEXUAL ASSAULT", "PROSTITUTION", "SEX OFFENSE"), "SEX", 
                   ifelse(TestDT[["Type"]] %in% c("GAMBLING", "INTERFERENCE WITH PUBLIC OFFICER", "INTIMIDATION", "LIQUOR LAW VIOLATION", "OBSCENITY", "PUBLIC INDECENCY", "PUBLIC PEACE VIOLATION", "STALKING", "NON-CRIMINAL", "NON-CRIMINAL (SUBJECT SPECIFIED)", "NON - CRIMINAL"), "SOCIETY", 
                   ifelse(TestDT[["Type"]] %in% c("CRIMINAL TRESPASS"), "TRESPASS", 
                   ifelse(TestDT[["Type"]] %in% c("CONCEALED CARRY LICENSE VIOLATION", "WEAPONS VIOLATION"), "WEAPONS", TestDT[["Type"]]))))))))))

TestDT[["Locdescrip"]] <- ifelse(TestDT[["Locdescrip"]] %in% c("VEHICLE-COMMERCIAL", "VEHICLE - DELIVERY TRUCK", "VEHICLE - OTHER RIDE SERVICE", "VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)", "VEHICLE NON-COMMERCIAL", "TRAILER", "TRUCK", "DELIVERY TRUCK", "TAXICAB", "OTHER COMMERCIAL TRANSPORTATION"), "VEHICLE", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("BAR OR TAVERN", "TAVERN", "TAVERN/LIQUOR STORE"), "TAVERN",
                   ifelse(TestDT[["Locdescrip"]] %in% c("SCHOOL YARD", "SCHOOL, PRIVATE, BUILDING", "SCHOOL, PRIVATE, GROUNDS", "SCHOOL, PUBLIC, BUILDING", "SCHOOL, PUBLIC, GROUNDS", "COLLEGE/UNIVERSITY GROUNDS", "COLLEGE/UNIVERSITY RESIDENCE HALL"), "SCHOOL",
                   ifelse(TestDT[["Locdescrip"]] %in% c("RESIDENCE", "RESIDENCE-GARAGE", "RESIDENCE PORCH/HALLWAY", "RESIDENTIAL YARD (FRONT/BACK)", "DRIVEWAY - RESIDENTIAL", "GARAGE", "HOUSE", "PORCH", "YARD"), "RESIDENCE", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("PARKING LOT", "PARKING LOT/GARAGE(NON.RESID.)", "POLICE FACILITY/VEH PARKING LOT"), "PARKING", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("OTHER", "OTHER RAILROAD PROP / TRAIN DEPOT", "ABANDONED BUILDING", "ANIMAL HOSPITAL", "ATHLETIC CLUB", "BASEMENT", "BOAT/WATERCRAFT", "CHURCH", "CHURCH/SYNAGOGUE/PLACE OF WORSHIP", "COIN OPERATED MACHINE", "CONSTRUCTION SITE", "SEWER", "STAIRWELL", "VACANT LOT", "VACANT LOT/LAND", "VESTIBULE", "WOODED AREA", "FARM", "FACTORY", "FACTORY/MANUFACTURING BUILDING", "FEDERAL BUILDING", "FIRE STATION", "FOREST PRESERVE", "GOVERNMENT BUILDING", "GOVERNMENT BUILDING/PROPERTY", "JAIL / LOCK-UP FACILITY", "LIBRARY", "MOVIE HOUSE/THEATER", "POOL ROOM", "SPORTS ARENA/STADIUM", "WAREHOUSE", "AUTO", "AUTO / BOAT / RV DEALERSHIP", "CEMETARY"), "OTHERS", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("COMMERCIAL / BUSINESS OFFICE"), "BIGBUSINESS", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("PARK PROPERTY"), "PARK", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("ATM (AUTOMATIC TELLER MACHINE)", "BANK", "CREDIT UNION", "CURRENCY EXCHANGE", "SAVINGS AND LOAN"), "BANK", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("HOTEL", "HOTEL/MOTEL"), "HOTEL", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("HOSPITAL", "HOSPITAL BUILDING/GROUNDS", "DAY CARE CENTER", "NURSING HOME", "NURSING HOME/RETIREMENT HOME", "MEDICAL/DENTAL OFFICE"), "HEALTH", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("ALLEY", "BOWLING ALLEY"), "ALLEY", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("CHA APARTMENT", "CHA HALLWAY/STAIRWELL/ELEVATOR", "CHA PARKING LOT", "CHA PARKING LOT/GROUNDS"), "CHA", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("CTA BUS", "CTA BUS STOP", "CTA GARAGE / OTHER PROPERTY", "CTA PLATFORM", "CTA STATION", "CTA TRACKS - RIGHT OF WAY", "CTA TRAIN", "CTA \"\"L\"\" TRAIN"), "CTA", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA", "AIRPORT BUILDING NON-TERMINAL - SECURE AREA", "AIRPORT EXTERIOR - NON-SECURE AREA", "AIRPORT EXTERIOR - SECURE AREA", "AIRPORT PARKING LOT", "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA", "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA", "AIRPORT TRANSPORTATION SYSTEM (ATS)", "AIRPORT VENDING ESTABLISHMENT", "AIRPORT/AIRCRAFT", "AIRCRAFT"), "AIRPORT", 
                   ifelse(TestDT[["Locdescrip"]] %in% c("APPLIANCE STORE", "BARBERSHOP", "CAR WASH", "CLEANING STORE", "CONVENIENCE STORE", "DEPARTMENT STORE", "DRUG STORE", "GARAGE/AUTO REPAIR", "GAS STATION", "GAS STATION DRIVE/PROP.", "GROCERY FOOD STORE", "NEWSSTAND", "OFFICE", "PAWN SHOP", "RETAIL STORE", "SMALL RETAIL STORE"), "STORE",
                   ifelse(TestDT[["Locdescrip"]] %in% c("BRIDGE", "DRIVEWAY", "GANGWAY", "HIGHWAY/EXPRESSWAY", "LAKEFRONT/WATERFRONT/RIVERBANK", "SIDEWALK", "STREET", "HALLWAY"), "STREET",
                   TestDT[["Locdescrip"]])))))))))))))))))

# Set TestDT as data.frame
TestDT <- as.data.frame(TestDT)

# Reorder columns
TestDT <- TestDT[c("Case", "Date", "Year", "Month", "Day", "Season", "Tint", "Type", "Arrest", "Domestic", "Locdescrip", "Beat", "District", "Community", "Latitude", "Longitude", "Location")]

# Normalize variables
TestDT[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")] <- lapply(TestDT[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")], as.factor)

glimpse(TestDT)

## Rows: 1,137,038
## Columns: 17
## $ Case       <chr> "JE240540", "JE279849", "JG507211", "JG501049", "JG415333",…
## $ Date       <dttm> 2021-05-24 15:06:00, 2021-06-26 09:24:00, 2023-11-09 07:30…
## $ Year       <int> 2021, 2021, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,…
## $ Month      <ord> May, Jun, Nov, Nov, Sep, Aug, Jul, Aug, Sep, Aug, Jul, Sep,…
## $ Day        <ord> Mon, Sat, Thu, Sun, Wed, Thu, Mon, Sun, Mon, Tue, Mon, Sun,…
## $ Season     <fct> SUMMER, SUMMER, WINTER, WINTER, FALL, FALL, FALL, FALL, FAL…
## $ Tint       <fct> 12-17H, 6-11H, 6-11H, 6-11H, 12-17H, 12-17H, 18-24H, 6-11H,…
## $ Type       <fct> HOMICIDE, HOMICIDE, BURGLARY, BATTERY, DAMAGE, DECEIVE, CRI…
## $ Arrest     <lgl> TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE…
## $ Domestic   <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ Locdescrip <fct> "STREET", "PARKING", "APARTMENT", "STORE", "PARKING LOT / G…
## $ Beat       <fct> 2515, 1711, 1922, 632, 122, 1225, 333, 1732, 822, 835, 731,…
## $ District   <fct> 25, 17, 19, 6, 1, 12, 3, 17, 8, 8, 7, 22, 7, 9, 11, 11, 25,…
## $ Community  <fct> 19, 13, 5, 44, 32, 28, 43, 21, 63, 70, 69, 73, 67, 60, 27, …
## $ Latitude   <dbl> 41.91784, 41.99522, 41.95235, 41.73775, 41.88602, 41.87757,…
## $ Longitude  <dbl> -87.75597, -87.71335, -87.67798, -87.60486, -87.63394, -87.…
## $ Location   <chr> "(41.917838056, -87.755968972)", "(41.995219444, -87.713354…

# Do not use scientific notation
options(scipen=200)

# Detach plyr if it's loaded and not required
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload=TRUE)
}

TestDT %>%
  dplyr::group_by(Year) %>%
  dplyr::summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) +
  geom_line(colour = "red") +
  geom_point(colour = "red") +
  geom_bar(aes(x = Year, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Year", y = "Number of Crimes", title = "Evolution of Number of Crimes") +
  geom_text(aes(x = Year, y = Count, label = Count), size = 3, vjust = -1, position = position_dodge(0.9)) +
  theme_minimal() +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank())

The number of cases decreased during lockdown but increased in 2022.

# By time intervals


p1 <- TestDT %>%
  group_by(Tint) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Tint, y = Count)) +
  geom_bar(aes(x = Tint, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Time intervals", y = "Number of crimes", title = "Evolution by time intervals") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) + 
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())
print(p1)

# By weekdays
# Assuming Day is a character vector representing weekdays

p2 <- TestDT %>%
  group_by(Day) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(Day, level = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")), y = Count)) +
  geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Weekdays", y = "Number of crimes", title = "Evolution by weekdays") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
print(p2)

# Assuming Month is a character vector representing months

p3 <- TestDT %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(Month, level = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")), y = Count)) +
  geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Months", y = "Number of crimes", title = "Evolution by months") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
print(p3)

# By seasons
p4 <- TestDT %>%
  group_by(Season) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Season, y = Count)) +
  geom_bar(aes(x = factor(Season, level = c("SPRING", "SUMMER", "FALL", "WINTER")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Seasons", y = "Number of crimes", title = "Evolution by seasons") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Combine plots into one plot
print(p4)

# Find top 5 most frequent places
top5 <- TestDT %>%
  group_by(Locdescrip) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(5)  # Select the top 5 places

# Find bottom 5 most frequent places
bottom5 <- TestDT %>%
  group_by(Locdescrip) %>%
  summarise(Count = n()) %>%
  arrange(Count) %>%
  head(5)  # Select the bottom 5 places

# Create a plot for the top 5 places
p_top5 <- top5 %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
  geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Places", y = "Number of crimes", title = "Top 5 most frequent places") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())

# Create a plot for the bottom 5 places
p_bottom5 <- bottom5 %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
  geom_bar(stat = "identity", fill = "#6495ED", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Places", y = "Number of crimes", title = "Bottom 5 most frequent places") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())

# Print top 5 places with counts
print(top5)

## # A tibble: 5 × 2
##   Locdescrip  Count
##   <fct>       <int>
## 1 STREET     367187
## 2 APARTMENT  192348
## 3 RESIDENCE  187694
## 4 STORE       94047
## 5 OTHERS      29915

# Print bottom 5 places with counts
print(bottom5)

## # A tibble: 5 × 2
##   Locdescrip                        Count
##   <fct>                             <int>
## 1 HORSE STABLE                          1
## 2 PUBLIC GRAMMAR SCHOOL                 1
## 3 RAILROAD PROPERTY                     1
## 4 VEHICLE - COMMERCIAL: TROLLEY BUS     1
## 5 CLUB                                  2

# Print and view the top 5 plot
print(p_top5)

# Print and view the bottom 5 plot
print(p_bottom5)

# Load the shapefile (adjust the path as per your system)

# Replace the path with the actual path to your shapefile
shapefile_path <- "/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp"

# Read the shapefile
mapcomu <- st_read(shapefile_path)

## Reading layer `geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6' from data source `/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## Geodetic CRS:  WGS84(DD)

names(mapcomu)

##  [1] "area"       "area_num_1" "area_numbe" "comarea"    "comarea_id"
##  [6] "community"  "perimeter"  "shape_area" "shape_len"  "geometry"

# Read the shapefile using sf
#mapcomu <- st_read("path_to_your_shapefile.shp")

# Extract number of crimes for each community
# Make sure 'Community' in TestDT corresponds to 'area_numbe' in mapcomu
temp <- TestDT %>%
  group_by(Community) %>%
  summarise(Count = n())

# Merge two data frames
# Replacing 'area_numbe' with the correct column name from mapcomu if different
temp2df <- left_join(st_as_sf(mapcomu), temp, by = c("area_numbe" = "Community"))

# Basic plot using sf object directly
locplot <- ggplot(data = temp2df) +
  geom_sf(aes(fill = Count), color = "black", size = 0.25) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Number of crimes per community") +
  theme_void() +
  theme(legend.position = "bottom")

# Rest of your code for plotting police stations and histogram...

# Import the police station
dfpolice <- fread(file = "/Users/divyakampalli/Downloads/Police_Stations_-_Map.csv", header = T, sep = ",", na.strings = "")


# Extract police stations' locations
dfpolice$LOCATION <- gsub("[(*)]", "", dfpolice$LOCATION)
policeloc <-str_split_fixed(dfpolice$LOCATION, ", ", 2)
policeloc <- as.data.frame(policeloc)
colnames(policeloc) <- c("lat", "long")
policeloc$lat <- as.numeric(as.character(policeloc$lat))
policeloc$long <- as.numeric(as.character(policeloc$long))
policeloc$id <- dfpolice$DISTRICT

# Plot police stations (by using black triangles) on the map
locplot <- locplot +
  geom_point(data = policeloc, aes(x = long, y = lat), size = 1, shape = 24, fill = "black")

# Plot histogramme
tempplot <- TestDT %>%
  group_by(Community) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Community, y = Count)) +
  geom_bar(aes(x = reorder(Community, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Community number", y = "Number of crimes", title = "Evolution by community areas") +
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

locplot

tempplot

# Types and number of crimes
p1 <- TestDT %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Type, y = Count)) +
  geom_bar(aes(x = reorder(Type, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) + 
  coord_flip() + 
  labs(x = "Number of crimes", y = "Type", title = "Evolution of number of crimes for different types") +
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Evolution over years
p2 <- TestDT %>% 
  group_by(Year, Type) %>% 
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count, fill = Type)) + 
  geom_area() +
  labs(x = "Years", y = "Number of crimes", title = "Evolution of crime types over years")

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

# Get unique crime types
crime_types <- unique(TestDT$Type)

# Create a list to store individual plots
plots_list <- list()

# Loop through each crime type and create a plot
for (crime_type in crime_types) {
  plot_data <- TestDT %>%
    filter(Type == crime_type) %>%
    group_by(Year) %>%
    summarise(Count = n())
  
  # Create a plot for the current crime type
  current_plot <- ggplot(plot_data, aes(x = Year, y = Count)) +
    geom_smooth(method = "lm") +
    geom_point() +
    labs(x = "Years", y = "Number of crimes", title = paste("Evolution of", crime_type, "over years"))
  
  # Add the plot to the list
  plots_list[[crime_type]] <- current_plot
}

# Print and view each plot
for (crime_type in crime_types) {
  print(plots_list[[crime_type]])
}

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

# Transform the type
TestDT[, c("Month", "Day", "Season", "Tint")] <- lapply(TestDT[, c("Month", "Day", "Season", "Tint")], as.character)

# By time intervals
p1 <- TestDT %>%
  group_by(Type, Tint) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Tint, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Time intervals", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by time intervals") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p1)

# By weekdays
p2 <- TestDT %>%
  group_by(Type, Day) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Day, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Weekdays", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by weekdays") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p2)

# By months
p3 <- TestDT %>%
  group_by(Type, Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Month, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Months", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by months") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p3)

# By seasons
p4 <- TestDT %>%
  group_by(Type, Season) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Season, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Seasons", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by seasons") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p4)

# Find top10 most frequent places
top10P <- head(names(sort(table(TestDT$Locdescrip), decreasing = TRUE)), 10)

# Find top10 most frequent crime types
top10T <- head(names(sort(table(TestDT$Type), decreasing = TRUE)), 10)

# Plot
filter(TestDT, Locdescrip %in% top10P) %>%
  filter(Type %in% top10T) %>%
  group_by(Type, Locdescrip) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Places", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by places") +
  theme_bw() +
  theme(
    panel.grid.major = element_line(colour = NA),
    panel.grid.minor = element_line(colour = NA),
    axis.text.x = element_text(angle = 45, vjust = 0.1, hjust = 0.1)  # Diagonal X-axis labels
  )

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

# Find top10 most dangerous community areas
top10C <- head(names((sort(table(TestDT$Community), decreasing = TRUE))), 10)

# Plot
filter(TestDT, Type %in% top10T) %>%
  filter(Community %in% top10C) %>%
  group_by(Type, Community) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Community, Count), y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Community areas", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by areas") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

# Numbers
TestDT %>%
  filter(Domestic == T) %>%
  group_by(Year) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) +
  geom_bar(aes(x = Year, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) + 
  labs(x = "Number of crimes", y = "Year", title = "Evolution of number of domestic crimes in different years") +
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# By time intervals
p1 <- TestDT %>%
  filter(Domestic == T) %>%
  group_by(Tint) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Tint, y = Count)) +
  geom_bar(aes(x = Tint, y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Time intervals", y = "Number of domestic crimes", title = "Evolution by time intervals") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) + 
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# By weekdays
p2 <- TestDT %>%
  filter(Domestic == T) %>%
  group_by(Day) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Day, y = Count)) +
  geom_bar(aes(x = factor(Day, level = c("lun\\.", "mar\\.", "mer\\.", "jeu\\.", "ven\\.", "sam\\.", "dim\\.")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Weekdays", y = "Number of domestic crimes", title = "Evolution by weekdays") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) + 
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# By months
p3 <- TestDT %>%
  filter(Domestic == T) %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Month, y = Count)) +
  geom_bar(aes(x = factor(Month, level = c("janv\\.", "févr\\.", "mars", "avr\\.", "mai", "juin", "juil\\.", "août", "sept\\.", "oct\\.", "nov\\.", "déc\\.")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Months", y = "Number of domestic crimes", title = "Evolution by months") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# By seasons
p4 <- TestDT %>%
  filter(Domestic == T) %>%
  group_by(Season) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Season, y = Count)) +
  geom_bar(aes(x = factor(Season, level = c("SPRING", "SUMMER", "FALL", "WINTER")), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Seasons", y = "Number of domestic crimes", title = "Evolution by seasons") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Locations
TestDT %>%
  filter(Domestic == T) %>%
  group_by(Locdescrip) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Locdescrip, y = Count)) +
  geom_bar(aes(x = reorder(Locdescrip, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Places", y = "Number of crimes", title = "Evolution by places") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Extract data
temp <- TestDT %>%
  filter(Arrest == T) %>%
  group_by(Year) %>%
  summarise(Count = n())

# Compute the crime rates
temp$rate <- lapply(temp$Count, function(x) x / nrow(TestDT))
temp$rate <- as.numeric(temp$rate)

# Plot
ggplot(temp, aes(x = Year, y = rate)) + 
  geom_line() +  
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Find top10 most dangerous community areas
top10C <- head(names((sort(table(TestDT$Community), decreasing = TRUE))), 10)

# Iterate through each community area
for (community_area in top10C) {
  # Filter data for the current community area
  community_data <- filter(TestDT, Community == community_area)

  # Create a plot for the current community area
  plot <- community_data %>%
    group_by(Year) %>%
    summarise(Count = n()) %>%
    ggplot(aes(x = Year, y = Count)) +
    geom_smooth(method = "lm") +
    geom_point() +
    labs(x = "Years", y = "Number of crimes", title = paste("Evolution of crimes in", community_area, "over years"))

  # Display the plot
  print(plot)
}

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

# Extract data
temp <- TestDT %>%
  filter(Arrest == TRUE, Community %in% top10C) %>%
  group_by(Year, Community) %>%
  summarise(Count = n())

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

# Compute the crime rates
temp$rate <- temp$Count / nrow(TestDT)

# Iterate through each community area
for (community_area in unique(temp$Community)) {
  # Filter data for the current community area
  community_data <- filter(temp, Community == community_area)

  # Create a plot for the current community area
  plot <- ggplot(community_data, aes(x = Year, y = rate)) +
    geom_line() +
    labs(x = "Years", y = "Crime rates", title = paste("Evolution of arrested crime rates in Community - ", community_area, "over years"))

  # Display the plot
  print(plot)
}

# Extract data
temp <- filter(TestDT, Arrest == T) %>%
  group_by(Year, Type) %>%
  summarise(Count = n())

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

# Compute the crime rates
temp$rate <- lapply(temp$Count, function(x) x / nrow(TestDT))
temp$rate <- as.numeric(temp$rate)

# Plot
ggplot(temp, aes(x = Year, y = rate, colour = Type)) + 
  geom_line()

There is a steady decrease in the number of crimes even in the most dangerous communities. But there was also significant reduction in the arrest rate. This shows the police inefficiency. In conclusion, the arrest rate is very low for the amount of crimes and this shows that the Chicago police work on it along with keeping the community safe.

Chicago Crime Data Analysis

Divya Kampalli , Bhavya Sirobhushanam , Venkateshwar Reddy Kasturi

2023-12-04

We note that since these three columns are significant, we do not need to repeat the function for each column after eliminating duplicates in the “Latitude” column. As a result, all missing values pertaining to the location description disappear.