Chicago Crime Data Analysis

#Loading Necessary Libraries

library(data.table)
library(ggplot2)
library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(ggpubr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(Metrics)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(forecast)

## 
## Attaching package: 'forecast'

## The following object is masked from 'package:Metrics':
## 
##     accuracy

## The following object is masked from 'package:ggpubr':
## 
##     gghistogram

library(padr)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(patchwork)

#Data Preparation

Import data

Dataset <- fread(file = "/Users/divyakampalli/Desktop/DPA/finalproject/crimeanalysis.csv", header = T, sep = ",", na.strings = "")

Top 5 rows of the Dataset

head(Dataset)

Structure of the Dataset

str(Dataset)

## Classes 'data.table' and 'data.frame':   7946816 obs. of  22 variables:
##  $ ID                  : int  5741943 25953 26038 13279676 13274752 1930689 13203321 13210088 13210004 13210062 ...
##  $ Case Number         : chr  "HN549294" "JE240540" "JE279849" "JG507211" ...
##  $ Date                : chr  "08/25/2007 09:22:18 AM" "05/24/2021 03:06:00 PM" "06/26/2021 09:24:00 AM" "11/09/2023 07:30:00 AM" ...
##  $ Block               : chr  "074XX N ROGERS AVE" "020XX N LARAMIE AVE" "062XX N MC CORMICK RD" "019XX W BYRON ST" ...
##  $ IUCR                : chr  "0560" "0110" "0110" "0620" ...
##  $ Primary Type        : chr  "ASSAULT" "HOMICIDE" "HOMICIDE" "BURGLARY" ...
##  $ Description         : chr  "SIMPLE" "FIRST DEGREE MURDER" "FIRST DEGREE MURDER" "UNLAWFUL ENTRY" ...
##  $ Location Description: chr  "OTHER" "STREET" "PARKING LOT" "APARTMENT" ...
##  $ Arrest              : logi  FALSE TRUE TRUE FALSE FALSE TRUE ...
##  $ Domestic            : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Beat                : int  2422 2515 1711 1922 632 512 122 1225 333 1732 ...
##  $ District            : int  24 25 17 19 6 5 1 12 3 17 ...
##  $ Ward                : int  49 36 50 47 6 NA 42 27 7 30 ...
##  $ Community Area      : int  1 19 13 5 44 NA 32 28 43 21 ...
##  $ FBI Code            : chr  "08A" "01A" "01A" "05" ...
##  $ X Coordinate        : int  NA 1141387 1152781 1162518 1183071 NA 1174694 1160870 1190812 1151117 ...
##  $ Y Coordinate        : int  NA 1913179 1941458 1925906 1847869 NA 1901831 1898642 1856743 1922554 ...
##  $ Year                : int  2007 2021 2021 2023 2023 2002 2023 2023 2023 2023 ...
##  $ Updated On          : chr  "08/17/2015 03:03:40 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" "11/18/2023 03:39:49 PM" ...
##  $ Latitude            : num  NA 41.9 42 42 41.7 ...
##  $ Longitude           : num  NA -87.8 -87.7 -87.7 -87.6 ...
##  $ Location            : chr  NA "(41.917838056, -87.755968972)" "(41.995219444, -87.713354912)" "(41.952345086, -87.677975059)" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Summary of Dataset

summary(Dataset)

##        ID           Case Number            Date              Block          
##  Min.   :     634   Length:7946816     Length:7946816     Length:7946816    
##  1st Qu.: 3852320   Class :character   Class :character   Class :character  
##  Median : 7149126   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 7150988                                                           
##  3rd Qu.:10335352                                                           
##  Max.   :13292996                                                           
##                                                                             
##      IUCR           Primary Type       Description        Location Description
##  Length:7946816     Length:7946816     Length:7946816     Length:7946816      
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##    Arrest         Domestic            Beat         District         Ward       
##  Mode :logical   Mode :logical   Min.   : 111   Min.   : 1.0   Min.   : 1.0    
##  FALSE:5892780   FALSE:6581484   1st Qu.: 621   1st Qu.: 6.0   1st Qu.:10.0    
##  TRUE :2054036   TRUE :1365332   Median :1034   Median :10.0   Median :23.0    
##                                  Mean   :1185   Mean   :11.3   Mean   :22.8    
##                                  3rd Qu.:1731   3rd Qu.:17.0   3rd Qu.:34.0    
##                                  Max.   :2535   Max.   :31.0   Max.   :50.0    
##                                                 NA's   :47     NA's   :614854  
##  Community Area     FBI Code          X Coordinate      Y Coordinate    
##  Min.   : 0.0     Length:7946816     Min.   :      0   Min.   :      0  
##  1st Qu.:23.0     Class :character   1st Qu.:1152998   1st Qu.:1859105  
##  Median :32.0     Mode  :character   Median :1166138   Median :1890771  
##  Mean   :37.5                        Mean   :1164616   Mean   :1885821  
##  3rd Qu.:57.0                        3rd Qu.:1176389   3rd Qu.:1909321  
##  Max.   :77.0                        Max.   :1205119   Max.   :1951622  
##  NA's   :613478                      NA's   :87614     NA's   :87614    
##       Year       Updated On           Latitude       Longitude     
##  Min.   :2001   Length:7946816     Min.   :36.62   Min.   :-91.69  
##  1st Qu.:2005   Class :character   1st Qu.:41.77   1st Qu.:-87.71  
##  Median :2009   Mode  :character   Median :41.86   Median :-87.67  
##  Mean   :2010                      Mean   :41.84   Mean   :-87.67  
##  3rd Qu.:2015                      3rd Qu.:41.91   3rd Qu.:-87.63  
##  Max.   :2023                      Max.   :42.02   Max.   :-87.52  
##                                    NA's   :87614   NA's   :87614   
##    Location        
##  Length:7946816    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

#Data Cleaning and Preprocessing

Extracting 5 past years’ data

CrimesDF <- Dataset[Year > 2018]

Renaming some of the variables

setnames(CrimesDF, c("Case Number", "Primary Type", "Location Description", "Community Area"), c("Case", "Type", "Locdescrip", "Community"))

Checking if there are any Duplicates

any(duplicated(CrimesDF[["Case"]]))

## [1] TRUE

Removing any duplicates in Case Number and testing again to check if there are any duplicates.

CrimesDF <- CrimesDF[!duplicated(CrimesDF[["Case"]])]
any(duplicated(CrimesDF[["Case"]]))

## [1] FALSE

Testing for missing values

any(is.na(CrimesDF))

## [1] TRUE

Finding the missing values in each coloumn.

colSums(is.na(CrimesDF))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         6051            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           48            2            0        16907        16907            0 
##   Updated On     Latitude    Longitude     Location 
##            0        16907        16907        16907

Replacing all NAs with similar values

CrimesDF$`Latitude` <- na.omit(CrimesDF$`Latitude`)[match(CrimesDF$`X Coordinate`, na.omit(CrimesDF$`X Coordinate`))]
colSums(is.na(CrimesDF))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         6051            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           48            2            0        16907        16907            0 
##   Updated On     Latitude    Longitude     Location 
##            0        16907        16907        16907

Removing NA in latitude, longitude, location, Case Number

CrimesDF <- CrimesDF[!is.na(CrimesDF[["Latitude"]])]
CrimesDF <- CrimesDF[!is.na(CrimesDF[["Case"]])]
colSums(is.na(CrimesDF))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0         4513            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##           47            1            0            0            0            0 
##   Updated On     Latitude    Longitude     Location 
##            0            0            0            0

Replacing all NAs with similar records

CrimesDF$`Locdescrip` <- na.omit(CrimesDF$`Locdescrip`)[match(CrimesDF$`Location`, na.omit(CrimesDF$`Location`))]
CrimesDF$`District` <- na.omit(CrimesDF$`District`)[match(CrimesDF$`Beat`, na.omit(CrimesDF$`Beat`))]
CrimesDF$`Ward` <- na.omit(CrimesDF$`Ward`)[match(CrimesDF$`Location`, na.omit(CrimesDF$`Location`))]
CrimesDF$`Community` <- na.omit(CrimesDF$`Community`)[match(CrimesDF$`Location`, na.omit(CrimesDF$`Location`))]
colSums(is.na(CrimesDF))

##           ID         Case         Date        Block         IUCR         Type 
##            0            0            0            0            0            0 
##  Description   Locdescrip       Arrest     Domestic         Beat     District 
##            0          637            0            0            0            0 
##         Ward    Community     FBI Code X Coordinate Y Coordinate         Year 
##            5            1            0            0            0            0 
##   Updated On     Latitude    Longitude     Location 
##            0            0            0            0

CrimesDF <- CrimesDF[!is.na(CrimesDF[["Locdescrip"]])]
any(is.na(CrimesDF))

## [1] FALSE

CrimesDF <- CrimesDF[which(Community != 0),]

CrimesDF <- CrimesDF[, !c("ID", "IUCR", "Description", "FBI Code", "Block", "Ward", "X Coordinate", "Y Coordinate", "Updated On")]
CrimesDF[["Date"]] <- parse_date_time(CrimesDF[["Date"]], orders = "mdY IMSp")

Create four time intervals and Extract hours

tint <- c("0", "5.9", "11.9", "17.9", "23.9")

hours <- hour(CrimesDF[["Date"]])

CrimesDF[["Tint"]] <- cut(hours, breaks = tint, labels = c("0-5H", "6-11H", "12-17H", "18-24H"), include.lowest = T)

Create the column Day showing the weekday, month, season when the incident occurred

CrimesDF[["Day"]] <- wday(CrimesDF[["Date"]], label = T)
CrimesDF[["Month"]] <- month(CrimesDF[["Date"]], label = T)
quarters <- quarter(CrimesDF$Date)
sint <- c("0.9", "1.9", "2.9", "3.9", "4.9")
CrimesDF[["Season"]] <- cut(quarters, breaks = sint, labels = c("SPRING", "SUMMER", "FALL", "WINTER"))

Regrouping similar crimes into one type

CrimesDF[["Type"]] <- ifelse(CrimesDF[["Type"]] %in% c("CRIMINAL DAMAGE"), "DAMAGE", 
                   ifelse(CrimesDF[["Type"]] %in% c("DECEPTIVE PRACTICE"), "DECEIVE",
                   ifelse(CrimesDF[["Type"]] %in% c("KIDNAPPING", "OFFENSE INVOLVING CHILDREN", "HUMAN TRAFFICKING"), "HUMANCHILD",
                   ifelse(CrimesDF[["Type"]] %in% c("NARCOTICS", "OTHER NARCOTIC VIOLATION"), "NARCOTICS", 
                   ifelse(CrimesDF[["Type"]] %in% c("MOTOR VEHICLE THEFT"), "MOTO", 
                   ifelse(CrimesDF[["Type"]] %in% c("OTHER OFFENSE"), "OTHER", 
                   ifelse(CrimesDF[["Type"]] %in% c("CRIM SEXUAL ASSAULT", "PROSTITUTION", "SEX OFFENSE"), "SEX", 
                   ifelse(CrimesDF[["Type"]] %in% c("GAMBLING", "INTERFERENCE WITH PUBLIC OFFICER", "INTIMIDATION", "LIQUOR LAW VIOLATION", "OBSCENITY", "PUBLIC INDECENCY", "PUBLIC PEACE VIOLATION", "STALKING", "NON-CRIMINAL", "NON-CRIMINAL (SUBJECT SPECIFIED)", "NON - CRIMINAL"), "SOCIETY", 
                   ifelse(CrimesDF[["Type"]] %in% c("CRIMINAL TRESPASS"), "TRESPASS", 
                   ifelse(CrimesDF[["Type"]] %in% c("CONCEALED CARRY LICENSE VIOLATION", "WEAPONS VIOLATION"), "WEAPONS", CrimesDF[["Type"]]))))))))))

CrimesDF[["Locdescrip"]] <- ifelse(CrimesDF[["Locdescrip"]] %in% c("VEHICLE-COMMERCIAL", "VEHICLE - DELIVERY TRUCK", "VEHICLE - OTHER RIDE SERVICE", "VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)", "VEHICLE NON-COMMERCIAL", "TRAILER", "TRUCK", "DELIVERY TRUCK", "TAXICAB", "OTHER COMMERCIAL TRANSPORTATION"), "VEHICLE", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("BAR OR TAVERN", "TAVERN", "TAVERN/LIQUOR STORE"), "TAVERN",
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("SCHOOL YARD", "SCHOOL, PRIVATE, BUILDING", "SCHOOL, PRIVATE, GROUNDS", "SCHOOL, PUBLIC, BUILDING", "SCHOOL, PUBLIC, GROUNDS", "COLLEGE/UNIVERSITY GROUNDS", "COLLEGE/UNIVERSITY RESIDENCE HALL"), "SCHOOL",
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("RESIDENCE", "RESIDENCE-GARAGE", "RESIDENCE PORCH/HALLWAY", "RESIDENTIAL YARD (FRONT/BACK)", "DRIVEWAY - RESIDENTIAL", "GARAGE", "HOUSE", "PORCH", "YARD"), "RESIDENCE", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("PARKING LOT", "PARKING LOT/GARAGE(NON.RESID.)", "POLICE FACILITY/VEH PARKING LOT"), "PARKING", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("OTHER", "OTHER RAILROAD PROP / TRAIN DEPOT", "ABANDONED BUILDING", "ANIMAL HOSPITAL", "ATHLETIC CLUB", "BASEMENT", "BOAT/WATERCRAFT", "CHURCH", "CHURCH/SYNAGOGUE/PLACE OF WORSHIP", "COIN OPERATED MACHINE", "CONSTRUCTION SITE", "SEWER", "STAIRWELL", "VACANT LOT", "VACANT LOT/LAND", "VESTIBULE", "WOODED AREA", "FARM", "FACTORY", "FACTORY/MANUFACTURING BUILDING", "FEDERAL BUILDING", "FIRE STATION", "FOREST PRESERVE", "GOVERNMENT BUILDING", "GOVERNMENT BUILDING/PROPERTY", "JAIL / LOCK-UP FACILITY", "LIBRARY", "MOVIE HOUSE/THEATER", "POOL ROOM", "SPORTS ARENA/STADIUM", "WAREHOUSE", "AUTO", "AUTO / BOAT / RV DEALERSHIP", "CEMETARY"), "OTHERS", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("COMMERCIAL / BUSINESS OFFICE"), "BIGBUSINESS", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("PARK PROPERTY"), "PARK", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("ATM (AUTOMATIC TELLER MACHINE)", "BANK", "CREDIT UNION", "CURRENCY EXCHANGE", "SAVINGS AND LOAN"), "BANK", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("HOTEL", "HOTEL/MOTEL"), "HOTEL", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("HOSPITAL", "HOSPITAL BUILDING/GROUNDS", "DAY CARE CENTER", "NURSING HOME", "NURSING HOME/RETIREMENT HOME", "MEDICAL/DENTAL OFFICE"), "HEALTH", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("ALLEY", "BOWLING ALLEY"), "ALLEY", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("CHA APARTMENT", "CHA HALLWAY/STAIRWELL/ELEVATOR", "CHA PARKING LOT", "CHA PARKING LOT/GROUNDS"), "CHA", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("CTA BUS", "CTA BUS STOP", "CTA GARAGE / OTHER PROPERTY", "CTA PLATFORM", "CTA STATION", "CTA TRACKS - RIGHT OF WAY", "CTA TRAIN", "CTA \"\"L\"\" TRAIN"), "CTA", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA", "AIRPORT BUILDING NON-TERMINAL - SECURE AREA", "AIRPORT EXTERIOR - NON-SECURE AREA", "AIRPORT EXTERIOR - SECURE AREA", "AIRPORT PARKING LOT", "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA", "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA", "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA", "AIRPORT TRANSPORTATION SYSTEM (ATS)", "AIRPORT VENDING ESTABLISHMENT", "AIRPORT/AIRCRAFT", "AIRCRAFT"), "AIRPORT", 
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("APPLIANCE STORE", "BARBERSHOP", "CAR WASH", "CLEANING STORE", "CONVENIENCE STORE", "DEPARTMENT STORE", "DRUG STORE", "GARAGE/AUTO REPAIR", "GAS STATION", "GAS STATION DRIVE/PROP.", "GROCERY FOOD STORE", "NEWSSTAND", "OFFICE", "PAWN SHOP", "RETAIL STORE", "SMALL RETAIL STORE"), "STORE",
                   ifelse(CrimesDF[["Locdescrip"]] %in% c("BRIDGE", "DRIVEWAY", "GANGWAY", "HIGHWAY/EXPRESSWAY", "LAKEFRONT/WATERFRONT/RIVERBANK", "SIDEWALK", "STREET", "HALLWAY"), "STREET",
                   CrimesDF[["Locdescrip"]])))))))))))))))))

Converting DataTable into Dataframe and normalising the values.

CrimesDF <- as.data.frame(CrimesDF)
CrimesDF <- CrimesDF[c("Case", "Date", "Year", "Month", "Day", "Season", "Tint", "Type", "Arrest", "Domestic", "Locdescrip", "Beat", "District", "Community", "Latitude", "Longitude", "Location")]
CrimesDF[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")] <- lapply(CrimesDF[, c("Beat", "Type", "District", "Community", "Month", "Day", "Locdescrip")], as.factor)

options(scipen=200)

crimes <- data.frame(Dataset) %>% 
  select(c(Date, Primary.Type)) %>% 
  mutate(Primary.Type = as.factor(Primary.Type),
         Date = mdy_hms(Date), 
         Date = floor_date(Date, unit = "hours")) %>% #takes a date-time object and rounds it down to hours unit
  arrange(Date)

# Types and number of crimes
p1 <- CrimesDF %>%
  group_by(Type) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Type, y = Count)) +
  geom_bar(aes(x = reorder(Type, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) + 
  coord_flip() + 
  labs(x = "Number of crimes", y = "Type", title = "Evolution of number of crimes for different types") +
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())
p1

crimes %>% 
  count(Primary.Type, sort = T) %>% 
  head(5) %>% 
  ggplot(aes(x = n, y = reorder(Primary.Type, n))) +
  geom_col()

  labs(title = 'Top 5 Crimes in Chicago', 
       x = 'Number of Crimes', 
       y = 'Crimes')

## $x
## [1] "Number of Crimes"
## 
## $y
## [1] "Crimes"
## 
## $title
## [1] "Top 5 Crimes in Chicago"
## 
## attr(,"class")
## [1] "labels"

From the above plot, we can see that “THEFT” is the highest occuring crime. Let’s do Time series Analysis on THEFT.

#Time series analysis for Theft Create Prediction Time Frame We are taking only 5 years data so that it will be accurate

theft_crime <- crimes %>% 
  filter(Primary.Type == 'THEFT') %>% 
  group_by(Date) %>% 
  summarise(Theft = n()) %>% 
  filter(Date >= '2018-01-01' & Date <= '2022-12-31')

Printing range of dates, head and tail

head(theft_crime, 5)

tail(theft_crime, 5)

range(theft_crime$Date)

## [1] "2018-01-01 06:00:00 UTC" "2022-12-31 05:00:00 UTC"

range(theft_crime$Date)

## [1] "2018-01-01 06:00:00 UTC" "2022-12-31 05:00:00 UTC"

theft_crime <- theft_crime %>% 
  pad(start_val = ymd_hms("2018-01-01 00:00:00"), end_val = ymd_hms("2021-12-31 23:00:00")) %>% 
  replace(., is.na(.), 0)

## pad applied on the interval: hour

To create a time-series model, we need to create a time-series object from our train data. Time-series object will be based on theft as it is the one that we are going to predict, we set the frequency to be 24 as it is total hour of reported crime for 1 day.

theft_ts <- ts(theft_crime$Theft, frequency = 24)

Theft_plot <- theft_crime %>%
   ggplot(aes(x = Date, y = Theft)) +
   geom_line(aes(color = "theft")) +
   scale_x_datetime(name = "Date", date_breaks = "1 year") +
   scale_y_continuous(breaks = seq(0, 400, 100)) + 
   theme_minimal() +
   labs(title = "Chicago Theft Crime", subtitle = "2018 - 2022")
 
 ggplotly(Theft_plot)

Now, we will use autoplot to see the trend and sesonality

theft_ts_ap <- theft_ts %>%
  tail(365) %>%
  decompose()
theft_ts_ap %>%
  autoplot()

Upon examining the plot, it becomes evident that the trend still reveals certain patterns, resembling a seasonal nature. This suggests the existence of additional seasonality patterns that haven’t been captured by the current visualization. To address this, we aim to construct a Multi-Seasonal Time Series Object.

Create and Decompose MSTS Object

theft_multi <- msts(theft_crime$Theft, seasonal.periods = c(24, # Daily
                                                            24*7, # Weekly
                                                            24*30)) # Monthly

theft_multi_dec <- theft_multi %>%
  mstl()
theft_multi_dec %>%
  tail(365) %>%
  autoplot()

From the plot above, we can see the trend of the Theft Crime is already going smooth. The Theft Crime trend itself is decreasing in the last years.

#Seasonality Analysis

# Decompose MSTS Object
#theft_multi_dec <- theft_multi %>%
#  mstl()

#theft_multi_dec %>%
#  tail(24*7*4*12) %>%
#  autoplot()
# Decompose MSTS Object
theft_multi_dec <- theft_multi %>% mstl()

# Convert the decomposed object to a data frame
df_theft_multi <- as.data.frame(theft_multi_dec)
df_theft_multi = as.data.frame(theft_multi_dec)

Hourly Seasonality

HourlyPlot <- df_theft_multi %>%
  mutate(day = theft_crime$Date) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  head(24*7) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_point(col = "maroon") + 
  geom_line(col = "blue") +
  theme_minimal()
HourlyPlot

From the above graph, we can see that thefts are occurring more during midday and falling during night.

Daily Seasonality

DailyPlot <- df_theft_multi %>%
  mutate(day = wday(theft_crime$Date, label = T)) %>%
  group_by(day) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  ggplot(aes(x = day, y = seasonal)) +
  geom_col() +
  theme_minimal()
DailyPlot

From the Daily Seasonality Graph, we can see that theft count increases from wednesday and reaches its peak on friday and will fall. The least number of thefts are on sunday.

Monthly seasonality

MonthlyPlot <- df_theft_multi %>%
  mutate(day = theft_crime$Date, month = month(theft_crime$Date, label = T)) %>%
  group_by(month) %>%
  summarise(seasonal = sum(Seasonal24 + Seasonal168 + Seasonal720)) %>%
  head(24*30) %>%
  ggplot(aes(x = month, y = seasonal)) +
  geom_point() + geom_col() +
  theme_minimal()
MonthlyPlot

From the above graph, we can see that the crimes increases from June and are on peak on August and will fall from then. The lowest crimes occur in March.

In summary We have successfully predicted the frequency of theft crimes based on our analysis. It is reasonable to conclude that theft crime will probably start to rise at 10 a.m., peak at 5 p.m. (after business hours), and then continue to rise until 12 a.m. More crimes occur on Fridays. The actual crime is more likely to occur between June and October.

Similarly, we can do Time series analysis for each crime.

#Analysis and Visualisation

Plotting Number of Crimes versus Year

# Detach plyr if it's loaded and not required
#if ("package:plyr" %in% search()) {
#  detach("package:plyr", unload=TRUE)
#}

CrimesDF %>%
  dplyr::group_by(Year) %>%
  dplyr::summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) +
  geom_line(colour = "grey") +
  geom_point(colour = "grey") +
  geom_bar(aes(x = Year, y = Count), stat = "identity", fill = "blue", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Year", y = "Number of Crimes", title = "Evolution of Number of Crimes") +
  geom_text(aes(x = Year, y = Count, label = Count), size = 3, vjust = -1, position = position_dodge(0.9)) +
  theme_minimal() +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank())

The number of cases decreased from 2019 to 2020 and the trend increased after 2021.

By time intervals

TimeIntervalsPlot <- CrimesDF %>%
  group_by(Tint) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Tint, y = Count)) +
  geom_bar(aes(x = Tint, y = Count), stat = "identity", fill = "blue", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Time intervals", y = "Number of crimes", title = "Evolution by time intervals") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) + 
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())
print(TimeIntervalsPlot)

From the above plot, we can see that the most number of crimes are happening from 12-5 and the trend decreases later.

By WeekDays

WeekDaysPlot <- CrimesDF %>%
  group_by(Day) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(Day, level = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")), y = Count)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Weekdays", y = "Number of crimes", title = "Evolution by weekdays") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
print(WeekDaysPlot)

By the above graph we can see that the trend is almost the same across all the days. There is a slight increase of crimes on fridays and saturdays.

By Months

MonthPlot <- CrimesDF %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = factor(Month, level = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")), y = Count)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Months", y = "Number of crimes", title = "Evolution by months") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
print(MonthPlot)

Crimes were more likely to happen in June to August and are less likely to happen in December, February.

# By seasons
p4 <- CrimesDF %>%
  group_by(Season) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Season, y = Count)) +
  geom_bar(aes(x = factor(Season, level = c("SPRING", "SUMMER", "FALL", "WINTER")), y = Count), stat = "identity", fill = "blue", width = 0.3, position=position_dodge(0.4)) +
  labs(x = "Seasons", y = "Number of crimes", title = "Evolution by seasons") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

# Combine plots into one plot
print(p4)

Crimes are more likely to happen in fall and less likely to happen in winter.

Top 5 most frequent Crime areas

top5_Places <- CrimesDF %>%
  group_by(Locdescrip) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(5)  

plot_top5_Places <- top5_Places %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Places", y = "Number of crimes", title = "Top 5 most frequent places") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
plot_top5_Places

print(top5_Places)

## # A tibble: 5 × 2
##   Locdescrip  Count
##   <fct>       <int>
## 1 STREET     367187
## 2 APARTMENT  192348
## 3 RESIDENCE  187694
## 4 STORE       94047
## 5 OTHERS      29915

Street is the top place where crime can happen. Dont think your apartment/residence is safe. The next place where crimes could take place is apartment followed by residence.

Bottom 5 most frequent Crime areas

bottom5_places <- CrimesDF %>%
  group_by(Locdescrip) %>%
  summarise(Count = n()) %>%
  arrange(Count) %>%
  head(5)  # Select the bottom 5 places

plot_bottom5_places <- bottom5_places %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = Count)) +
  geom_bar(stat = "identity", fill = "blue", width = 0.3, position = position_dodge(0.4)) +
  labs(x = "Places", y = "Number of crimes", title = "Bottom 5 most frequent places") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 75, vjust = 1, hjust = 1)) +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank())
plot_bottom5_places

print(bottom5_places)

## # A tibble: 5 × 2
##   Locdescrip                        Count
##   <fct>                             <int>
## 1 HORSE STABLE                          1
## 2 PUBLIC GRAMMAR SCHOOL                 1
## 3 RAILROAD PROPERTY                     1
## 4 VEHICLE - COMMERCIAL: TROLLEY BUS     1
## 5 CLUB                                  2

Least amount of crimes happen in the above places.

shapefile_path <- "/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp"
mapcomu <- st_read(shapefile_path)

## Reading layer `geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6' from data source `/Users/divyakampalli/Downloads/boundaries-communityareas/geo_export_e07c1c74-44b6-459c-98d9-e8c9587ea2b6.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -87.94011 ymin: 41.64454 xmax: -87.52414 ymax: 42.02304
## Geodetic CRS:  WGS84(DD)

names(mapcomu)

##  [1] "area"       "area_num_1" "area_numbe" "comarea"    "comarea_id"
##  [6] "community"  "perimeter"  "shape_area" "shape_len"  "geometry"

temp <- CrimesDF %>%
  group_by(Community) %>%
  summarise(Count = n())
temp2df <- left_join(st_as_sf(mapcomu), temp, by = c("area_numbe" = "Community"))
locplot <- ggplot(data = temp2df) +
  geom_sf(aes(fill = Count), color = "black", size = 0.25) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Number of crimes per community") +
  theme_void() +
  theme(legend.position = "bottom")

dfpolice <- fread(file = "/Users/divyakampalli/Downloads/Police_Stations_-_Map.csv", header = T, sep = ",", na.strings = "")
dfpolice$LOCATION <- gsub("[(*)]", "", dfpolice$LOCATION)
policeloc <- str_split_fixed(dfpolice$LOCATION, ", ", 2)
policeloc <- as.data.frame(policeloc)
colnames(policeloc) <- c("lat", "long")
policeloc$lat <- as.numeric(as.character(policeloc$lat))
policeloc$long <- as.numeric(as.character(policeloc$long))
policeloc$id <- dfpolice$DISTRICT

locplot <- locplot +
  geom_point(data = policeloc, aes(x = long, y = lat), size = 1, shape = 24, fill = "black")

Checking Number of Commuinities and count of crimes in each Community

length(unique(CrimesDF[["Community"]]))

## [1] 77

table(CrimesDF[["Community"]])

## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 18660 15983 17694  8474  5275 26294 16758 51818  1087  4851  4176  1973  3757 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
##  9835 12616 10923  6222  2532 19001  6076  8830 19511 33106 30946 63537 23829 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
## 19967 42053 36892 18467 11267 39542 10533  4779 13412  3577  3474 16155  7765 
##    40    41    42    43    44    45    46    47    48    49    50    51    52 
## 11257  9210 16225 41896 29853  5634 19785  1709  6342 28362  4851  8303  4852 
##    53    54    55    56    57    58    59    60    61    62    63    64    65 
## 17427  5943  2673  8032  3815  9291  3697  6162 18743  3734  9065  4130  6834 
##    66    67    68    69    70    71    72    73    74    75    76    77 
## 24192 26677 25579 30992  9085 32409  3776 13465  2106  8279  8036 12970

locplot

From the above plot, we can see that Austin has highest number of crimes(63537).

# Evolution over years
p2 <- CrimesDF %>% 
  group_by(Year, Type) %>% 
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count, fill = Type)) + 
  geom_area() +
  labs(x = "Years", y = "Number of crimes", title = "Evolution of crime types over years")

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

p2

# Evolution over years multiplots
CrimesDF %>% 
  group_by(Year, Type) %>% 
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) + 
  geom_smooth(method = "lm") + 
  geom_point()+
  facet_wrap(~Type, ncol = 4, scales = "free") + 
  labs(x = "Years", y = "Number of crimes", title = "Evolution of crime types over years")

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
## `geom_smooth()` using formula = 'y ~ x'

# Transform the type
CrimesDF[, c("Month", "Day", "Season", "Tint")] <- lapply(CrimesDF[, c("Month", "Day", "Season", "Tint")], as.character)

# By time intervals
p1 <- CrimesDF %>%
  group_by(Type, Tint) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Tint, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Time intervals", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by time intervals") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p1)

# By weekdays
p2 <- CrimesDF %>%
  group_by(Type, Day) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Day, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Weekdays", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by weekdays") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p2)

# By months
p3 <- CrimesDF %>%
  group_by(Type, Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Month, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Months", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by months") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p3)

# By seasons
p4 <- CrimesDF %>%
  group_by(Type, Season) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Season, y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Seasons", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by seasons") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

print(p4)

# Find top10 most frequent places
top10P <- head(names(sort(table(CrimesDF$Locdescrip), decreasing = TRUE)), 10)

# Find top10 most frequent crime types
top10T <- head(names(sort(table(CrimesDF$Type), decreasing = TRUE)), 10)

# Plot
filter(CrimesDF, Locdescrip %in% top10P) %>%
  filter(Type %in% top10T) %>%
  group_by(Type, Locdescrip) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Locdescrip, Count), y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Places", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by places") +
  theme_bw() +
  theme(
    panel.grid.major = element_line(colour = NA),
    panel.grid.minor = element_line(colour = NA),
    axis.text.x = element_text(angle = 45, vjust = 0.1, hjust = 0.1)  # Diagonal X-axis labels
  )

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

# Find top10 most dangerous community areas
top10C <- head(names((sort(table(CrimesDF$Community), decreasing = TRUE))), 10)

# Plot
filter(CrimesDF, Type %in% top10T) %>%
  filter(Community %in% top10C) %>%
  group_by(Type, Community) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = reorder(Community, Count), y = reorder(Type, Count))) +
  geom_tile(aes(fill = Count)) + 
  scale_x_discrete("Community areas", expand = c(0, 0), position = "top") +
  scale_y_discrete("Crime types", expand = c(0, -2)) +
  scale_fill_gradient("Number of crimes", low = "white", high = "red") +
  ggtitle("Evolution by areas") +
  theme_bw() +
  theme(panel.grid.major =element_line(colour = NA), panel.grid.minor = element_line(colour = NA))

## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.

# Locations
#CrimesDF %>%
#  filter(Domestic == T) %>%
#  group_by(Locdescrip) %>%
#  summarise(Count = n()) %>%
#  ggplot(aes(x = Locdescrip, y = Count)) +
#  geom_bar(aes(x = reorder(Locdescrip, Count), y = Count), stat = "identity", fill = "#6495ED", width = 0.3, position=position_dodge(0.4)) +
#  labs(x = "Places", y = "Number of crimes", title = "Evolution by places") +
 # theme_minimal() +
 # theme(axis.text.x = element_text(angle = 75,vjust = 1,hjust = 1)) +
#  theme(axis.title.x=element_blank()) +
 # theme(axis.title.y=element_blank())

#Analysis of Arrest Rate

# Extract data
temp <- CrimesDF %>%
  filter(Arrest == T) %>%
  group_by(Year) %>%
  summarise(Count = n())

# Compute the crime rates
temp$rate <- lapply(temp$Count, function(x) x / nrow(CrimesDF))
temp$rate <- as.numeric(temp$rate)

# Plot
ggplot(temp, aes(x = Year, y = rate)) + 
  geom_line() +  
  theme_minimal() +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank())

There is a steady decrease of arrest rate from 2019 to 2020 and after 2021 it slowly increased.

Number of arrests in top 5 dangerous areas and arrest rate in that area

# Find top 5 most dangerous community areas
top5C <- head(names(sort(table(CrimesDF$Community), decreasing = TRUE)), 5)


crime_plot <- filter(CrimesDF, Community %in% top5C) %>%
  group_by(Year, Community) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) +
  geom_smooth(method = "lm") + 
  geom_point() +
  facet_wrap(~Community, ncol = 2, scales = "free") + 
  labs(x = "Years", y = "Number of crimes", title = "Evolution of number of crimes in different community areas over years")

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

# Extract data and compute crime rates
arrest_data <- CrimesDF %>%
  filter(Arrest == TRUE, Community %in% top5C) %>%
  group_by(Year, Community) %>%
  summarise(Count = n()) %>%
  mutate(rate = Count / nrow(CrimesDF))

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

# Plot arrest rates
arrest_plot <- ggplot(arrest_data, aes(x = Year, y = rate)) + 
  geom_line() + 
  facet_wrap(~Community, ncol = 2, scales = "free") + 
  labs(x = "Years", y = "Crime rates", title = "Evolution of arrested crime rates in different community areas over years")

# Combine the two plots
combined_plot <- crime_plot + arrest_plot + plot_layout(ncol = 2)

# Display the combined plot
combined_plot

## `geom_smooth()` using formula = 'y ~ x'

Arrest trend per crime

temp <- filter(CrimesDF, Arrest == T) %>%
  group_by(Year, Type) %>%
  summarise(Count = n())

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

temp$rate <- lapply(temp$Count, function(x) x / nrow(CrimesDF))
temp$rate <- as.numeric(temp$rate)

ggplot(temp, aes(x = Year, y = rate, colour = Type)) + 
  geom_line()

There is a steady decrease in the number of crimes even in the most dangerous communities. But there was also significant reduction in the arrest rate. This shows the police inefficiency.

Top 5 areas with least crime rate and arrest rate

bottom5C <- tail(names(sort(table(CrimesDF$Community), decreasing = TRUE)), 5)

crime_plot_bottom5 <- filter(CrimesDF, Community %in% bottom5C) %>%
  group_by(Year, Community) %>%
  summarise(Count = n()) %>%
  ggplot(aes(x = Year, y = Count)) +
  geom_smooth(method = "lm") + 
  geom_point() +
  facet_wrap(~Community, ncol = 2, scales = "free") + 
  labs(x = "Years", y = "Number of crimes", title = "Evolution of number of crimes in different community areas over years")

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

arrest_data_bottom5 <- CrimesDF %>%
  filter(Arrest == TRUE, Community %in% bottom5C) %>%
  group_by(Year, Community) %>%
  summarise(Count = n()) %>%
  mutate(rate = Count / nrow(CrimesDF))

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

arrest_plot_bottom5 <- ggplot(arrest_data_bottom5, aes(x = Year, y = rate)) + 
  geom_line() + 
  facet_wrap(~Community, ncol = 2, scales = "free") + 
  labs(x = "Years", y = "Crime rates", title = "Evolution of arrested crime rates in different community areas over years")
combined_plot_bottom5 <- crime_plot_bottom5 + arrest_plot_bottom5 + plot_layout(ncol = 2)

combined_plot_bottom5

## `geom_smooth()` using formula = 'y ~ x'

From the above graph we can see that the crime rate is steadily decreasing in the top 5 safe areas. Except in 18 and 47 where we have steady decrease in the arrest rate, all the other areas have pretty good arrest rate.

From this we can conclude that The best areas to live in Chicago with good arrest rate 1. Edison Prak 2. Forest Glen 3. Mount Greenwood.

Chicago Crime Data Analysis

Divya Kampalli , Bhavya Sirobhushanam , Venkateshwar Reddy Kasturi

2023-12-06