The data is downloaded from Alameda County Open Data Hub and the address is:
https://hub.arcgis.com/datasets/9e459776d4c3463cad52fe6003ffc668_0/data

The data-cleaning steps are in the ‘cleaning_data.R’ script; the analysis below starts from the cleaned data stored in ‘alameda_crime_data.Rds’.

library(pacman)
p_load(tidyverse, DT, dygraphs, plotly, lubridate, xts, ggmap)

# Run the cleaning script, then load the crime records from the .Rds file.
source('cleaning_data.R')

alam <- read_rds('alameda_crime_data.Rds')

# Add a calendar-date column derived from the `time` timestamp.
# The date is taken from the timestamp as it is displayed (its own time zone)
# rather than by calling as.Date.POSIXct() directly: that method converts in
# UTC by default, which can move late-evening events onto the following day
# when `time` carries a non-UTC zone.
# NOTE(review): if `time` is already stored in UTC the result is unchanged.
alam <- alam %>%
  mutate(date = as.Date(format(time, '%Y-%m-%d'), format = '%Y-%m-%d'))
glimpse(alam)

## Rows: 173,636
## Columns: 10
## $ DateTime         <fct> 2012/01/01 00:17:59, 2012/01/01 00:33:59, 2012/01/01…
## $ City             <fct> CASTRO VALLEY, OAKLAND, OAKLAND, HAYWARD, SAN LEANDR…
## $ Longitude        <dbl> -122.0633, -122.2069, -122.2069, -122.1000, -122.109…
## $ Latitude         <dbl> 37.68589, 37.76260, 37.76260, 37.66655, 37.69520, 37…
## $ CrimeDescription <fct> DOMESTIC - DISTURB BY LOUD/UNREASONABLE NOISE, PERSO…
## $ CrimeCode        <fct> 90C, 90D, 999, 90D, 90C, 999, 90D, 999, 90D, 90D, 90…
## $ time             <dttm> 2012-01-01 00:17:59, 2012-01-01 00:33:59, 2012-01-0…
## $ year             <dbl> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012…
## $ month            <yearmon> Jan 2012, Jan 2012, Jan 2012, Jan 2012, Jan 2012…
## $ date             <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 201…

Trends and Patterns

Cities with 800 or fewer recorded crimes are grouped together under the label ‘OTHERS’.

# Build `ala`: every crime record plus two per-city columns:
#   n      - number of records for the record's city
#   city_n - the city name when n > 800, otherwise 'OTHERS' (factor)
# The per-city summary is joined back onto the records explicitly by 'City'.
# The previous `City = City` argument was not a join specification (it was
# swallowed by `...`), so the key was only picked up implicitly. City is
# converted to character on both sides so the key types match.
ala <- alam %>%
  mutate(City = as.character(City)) %>%
  group_by(City) %>%
  summarize(n = n(), .groups = 'drop') %>%
  mutate(city_n = as.factor(ifelse(n > 800, City, 'OTHERS'))) %>%
  left_join(mutate(alam, City = as.character(City)), by = 'City')

Time series of the total number of crimes for Alameda County and for each city.

options(dplyr.summarise.inform = FALSE)

# Monthly crime counts over all records, labelled 'TOTAL'.
alame <- ala %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(n_crime = n(), .groups = 'drop') %>%
  mutate(city_n = factor('TOTAL'))

# Monthly crime counts per city label, with the 'TOTAL' rows appended.
# Groups are dropped explicitly and the rows are combined with bind_rows(),
# so the result is a plain (ungrouped) tibble instead of one that is still
# grouped by `month` and appended through base rbind().
alamed <- ala %>%
  filter(!is.na(month)) %>%
  group_by(month, city_n) %>%
  summarise(n_crime = n(), .groups = 'drop') %>%
  mutate(city_n = as.factor(city_n)) %>%
  bind_rows(alame)

# Line chart of monthly crime counts, one coloured line per city label
# (including the 'TOTAL' series), rendered as an interactive plotly widget.
plot1 <- ggplot(alamed, aes(x = month, y = n_crime)) +
  geom_line(aes(colour = city_n)) +
  labs(colour = 'city')
ggplotly(plot1)

Marking the dates of the peaks on the plot.

# Peak annotations. Each row carries the date label, its position on the
# yearmon x-axis (`date`, numeric) and the height at which the label is drawn.
# The x positions are derived from the labels, so each date is written only
# once and the marker and its text cannot drift apart.
peak <- tibble(
  label = c('2012-12-17', '2013-12-18', '2014-12-20',
            '2018-12-22', '2019-12-20'),
  date = as.numeric(as.yearmon(label)),
  label_y = c(2360, 2500, 2340, 2270, 2020)
)

# Overlay a dotted vertical line and a text label for every peak on plot1.
plot2 <- plot1 +
  geom_vline(data = peak, aes(xintercept = date),
             linetype = 'dotted', col = 'blue4') +
  geom_text(data = peak, aes(x = date, y = label_y, label = label),
            col = 'blue4')
ggplotly(plot2)

Time series of different CrimeCode

options(dplyr.summarise.inform = FALSE)

# Monthly number of crimes per CrimeCode, drawn as one line per code.
a <- alam %>%
  group_by(month, CrimeCode) %>%
  tally(name = 'n_crime') %>%
  ggplot(aes(x = month, y = n_crime, colour = CrimeCode)) +
  geom_line()

# Convert to an interactive widget and display it without the legend.
a <- ggplotly(a)
hide_legend(a)

CrimeDescription

999: 72 HOUR MENTAL HEALTH / A PERSON WILLFULLY FLEES OR ATEMPTS TO ELUDE AN OFFICER

90D: DUI ALCOHOL/DRUGS

35A: USE/UNDER INFLUENCE OF CONTROLLED SUBSTANCE

Hypothesis Testing

Test 1

Impact of Lock Down on Alameda Crime Numbers.

My hypothesis is that the number of crimes during the lockdown is lower than during the same period in other years, as the plot below suggests.

# x-axis positions (numeric yearmon) of the 17 March and 22 May boundaries
# for 2020 and, for comparison, 2018 and 2019.
lockdown <- tibble(
  date = as.numeric(as.yearmon(c('2020-03-17', '2020-05-22',
                                 '2018-03-17', '2018-05-22',
                                 '2019-03-17', '2019-05-22')))
)

# Re-draw plot1 on the months after January 2018 and mark the boundaries.
# - The cut-off is written as a numeric 'YYYY-MM' yearmon, so it does not
#   depend on the session locale's month names the way 'Jan 2018' does.
# - The 'Lock down' label is added with annotate(), which draws it once.
#   geom_text() with constant aesthetics inherits the plot data and would
#   draw one overlapping copy of the label per data row.
plot1 %+% filter(alamed, month > as.yearmon('2018-01')) +
  geom_vline(data = lockdown, aes(xintercept = date),
             linetype = 'dotted', col = 'blue4') +
  annotate('text', x = as.numeric(as.yearmon('2020-04-22')), y = 2050,
           label = 'Lock down', colour = 'blue4', size = 3)

\(H_0 : \mu = 53.369\)
\(H_1 : \mu \neq 53.369\)

Population: the data from March 17 - May 22, 2012 - 2019.
\(\mu = 53.369\)

Sample : March 17 - May 22 (lock down), 2020.
\(\bar{x} = 39.119\)
Sample size: 67

Since the p-value is very small and every value in the 95% confidence interval is smaller than \(\mu\), I reject the null hypothesis and conclude that the daily number of crimes during the lockdown is significantly lower than in the same period of the preceding years.

options(dplyr.summarise.inform = FALSE)

# Reference data: crimes per day for 17 March - 22 May of every year from
# 2012 to 2019. The yearly windows are selected by combining the overall date
# range with a month-day window.
population_1 <- alam %>%
  filter(!is.na(date)) %>%
  filter(
    date >= as.Date('2012-03-17'),
    date <= as.Date('2019-05-22'),
    format(date, '%m-%d') >= '03-17',
    format(date, '%m-%d') <= '05-22'
  ) %>%
  count(date, name = 'n_crime')

# Mean daily count of the reference period; used as mu under H0.
mu_test1 <- population_1 %>%
  pull(n_crime) %>%
  mean()

# Sample: crimes per day for 17 March - 22 May 2020.
sample_1 <- alam %>%
  filter(!is.na(date)) %>%
  filter(date >= as.Date('2020-03-17'), date <= as.Date('2020-05-22')) %>%
  count(date, name = 'n_crime')

# Two-sided one-sample t-test of the 2020 daily counts against mu_test1.
t.test(sample_1$n_crime, mu = mu_test1)

## 
##  One Sample t-test
## 
## data:  sample_1$n_crime
## t = -13.791, df = 66, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 53.3694
## 95 percent confidence interval:
##  37.05643 41.18237
## sample estimates:
## mean of x 
##   39.1194

Test 2

\(H_0 : \mu = 54.348\)
\(H_1 : \mu \neq 54.348\)

Population: the data from 2012-01-01 to 2019-12-31.
\(\mu = 54.348\)

Sample : the data from 2020-01-01 to 2020-11-27.
\(\bar{x} = 44.377\)
Sample size: 334

Since the p-value is very small and every value in the 95% confidence interval is smaller than \(\mu\), I reject the null hypothesis and conclude that the daily number of crimes during the Covid-19 pandemic in 2020 is significantly lower than in the preceding years.

options(dplyr.summarise.inform = FALSE)

# Reference data: crimes per day for every dated record before 2020.
population_2 <- alam %>%
  filter(!is.na(date), date < as.Date('2020-01-01')) %>%
  count(date, name = 'n_crime')

# Mean daily count of the reference period; used as mu under H0.
mu_test2 <- population_2 %>%
  pull(n_crime) %>%
  mean()

# Sample: crimes per day from 1 January 2020 onwards.
sample_2 <- alam %>%
  filter(!is.na(date), date >= as.Date('2020-01-01')) %>%
  count(date, name = 'n_crime')

# Two-sided one-sample t-test of the 2020 daily counts against mu_test2.
t.test(sample_2$n_crime, mu = mu_test2)

## 
##  One Sample t-test
## 
## data:  sample_2$n_crime
## t = -13.885, df = 333, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 54.34839
## 95 percent confidence interval:
##  42.96465 45.78984
## sample estimates:
## mean of x 
##  44.37725

Mapping

All the crime events in different cities shown on the map

# Road-map background around 'Dublin, CA' at zoom level 10.
base <- get_map(location = 'Dublin, CA', zoom = 10, maptype = 'roadmap')

# One small point per crime record, coloured by city label.
ggmap(base) +
  geom_point(
    data = ala,
    mapping = aes(x = Longitude, y = Latitude, colour = city_n),
    size = 0.1
  ) +
  labs(title = 'Alameda County Crime Events 2012-2020')

Heatmap in Cities

options(dplyr.summarise.inform = FALSE)

# Number of records at each coordinate pair, sorted ascending so that the
# locations with the highest counts are drawn last (on top).
alameda <- ala %>%
  group_by(Longitude, Latitude) %>%
  tally(name = 'loc_n') %>%
  arrange(loc_n)

# One point per location on the county base map, shaded by its record count.
ggmap(base) +
  geom_point(
    data = alameda,
    mapping = aes(x = Longitude, y = Latitude, colour = loc_n)
  ) +
  scale_colour_gradient(low = "#F8766D", high = "#132B43") +
  labs(title = 'Alameda County Crime Events 2012-2020')

Heatmap in Hayward

options(dplyr.summarise.inform = FALSE)

# Number of 2020 Hayward records at each coordinate pair, sorted ascending so
# that the locations with the highest counts are drawn last (on top).
# `year` is numeric, so it is compared with the number 2020 rather than the
# string '2020', which only matched through implicit number-to-text coercion.
hayward <- ala %>%
  filter(City == 'HAYWARD', year == 2020) %>%
  group_by(Longitude, Latitude) %>%
  summarise(location_n = n()) %>%
  arrange(location_n)

# Road-map background around 'Hayward' at zoom level 13.
base_1 <- get_map('Hayward', zoom = 13, maptype = 'roadmap')

# One point per location, shaded by its record count.
b <- ggmap(base_1) +
  geom_point(data = hayward,
             aes(x = Longitude, y = Latitude, col = location_n)) +
  scale_colour_gradient(high = "black", low = "#00BFC4") +
  ggtitle('Hayward Crime Events 2020')

ggplotly(b)

Hayward-CrimeCode

options(dplyr.summarise.inform = FALSE)

# Hayward records for the months strictly between August and December 2020
# (September - November), counted per location and CrimeCode.
# The month bounds are written as numeric 'YYYY-MM' yearmon values, so the
# filter does not depend on the session locale's month names the way the
# strings 'Aug 2020' / 'Dec 2020' do.
hayward <- ala %>%
  filter(City == 'HAYWARD',
         month > as.yearmon('2020-08') & month < as.yearmon('2020-12')) %>%
  group_by(Longitude, Latitude, CrimeCode) %>%
  summarise(crime_number = n())

# Split so that code '999' is drawn first and every other code on top of it.
hayward_1 <- hayward %>% filter(CrimeCode == '999')
hayward_2 <- hayward %>% filter(CrimeCode != '999')

# Point size shows the number of records, colour shows the CrimeCode.
# NOTE(review): the variable name `c` shadows base::c as a variable; kept
# unchanged here in case other code refers to it.
c <- ggmap(base_1) +
  geom_point(data = hayward_1,
             aes(x = Longitude, y = Latitude, size = crime_number, col = CrimeCode)) +
  geom_point(data = hayward_2,
             aes(x = Longitude, y = Latitude, size = crime_number, col = CrimeCode)) +
  ggtitle('Hayward Crime Events (September - November, 2020)')

# Convert to an interactive widget and display it without the legend.
c <- ggplotly(c)
hide_legend(c)

The data table below helps us look up the relevant information for each CrimeCode.

# Show the full `alam` data frame as an interactive DT table widget.
datatable(alam)

Stat. 694 Project - Alameda Crime Data