The data is downloaded from Alameda County Open Data Hub and the address is:
https://hub.arcgis.com/datasets/9e459776d4c3463cad52fe6003ffc668_0/data
The data-cleaning steps live in the ‘cleaning_data.R’ script, which is sourced below; this is where I start to analyze the data.
library(pacman)
p_load(tidyverse, DT, dygraphs, plotly, lubridate, xts, ggmap)
source('cleaning_data.R')
# Load the crime records from disk and add a calendar-date column derived
# from the `time` timestamp, then print a compact overview of the columns.
alam <- read_rds('alameda_crime_data.Rds')
# `tz = "UTC"` is the default of the POSIXct method, spelled out here.
alam <- mutate(alam, date = as.Date(time, tz = "UTC"))
glimpse(alam)
## Rows: 173,636
## Columns: 10
## $ DateTime <fct> 2012/01/01 00:17:59, 2012/01/01 00:33:59, 2012/01/01…
## $ City <fct> CASTRO VALLEY, OAKLAND, OAKLAND, HAYWARD, SAN LEANDR…
## $ Longitude <dbl> -122.0633, -122.2069, -122.2069, -122.1000, -122.109…
## $ Latitude <dbl> 37.68589, 37.76260, 37.76260, 37.66655, 37.69520, 37…
## $ CrimeDescription <fct> DOMESTIC - DISTURB BY LOUD/UNREASONABLE NOISE, PERSO…
## $ CrimeCode <fct> 90C, 90D, 999, 90D, 90C, 999, 90D, 999, 90D, 90D, 90…
## $ time <dttm> 2012-01-01 00:17:59, 2012-01-01 00:33:59, 2012-01-0…
## $ year <dbl> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012…
## $ month <yearmon> Jan 2012, Jan 2012, Jan 2012, Jan 2012, Jan 2012…
## $ date <date> 2012-01-01, 2012-01-01, 2012-01-01, 2012-01-01, 201…
Relabel City as ‘OTHERS’ for every city with 800 or fewer recorded crimes.
# Build `ala`: every crime record from `alam`, plus
#   n      - the number of records for the record's city, and
#   city_n - the city name when n > 800, otherwise 'OTHERS' (as a factor).
ala <- alam %>%
  mutate(City = as.character(City)) %>%
  group_by(City) %>%
  summarize(n = n(), .groups = 'drop') %>%
  mutate(city_n = as.factor(ifelse(n > 800, City, 'OTHERS'))) %>%
  # The join key must be given through `by`; a bare `City = City` argument is
  # not a join specification. `City` is converted to character on the
  # right-hand side too, so both key columns have the same type.
  left_join(mutate(alam, City = as.character(City)), by = 'City')
Time series of the total number of crimes for Alameda County and for each city.
options(dplyr.summarise.inform = FALSE)

# Monthly crime counts for the whole data set, labelled 'TOTAL'.
alame <- ala %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(n_crime = n(), .groups = 'drop') %>%
  mutate(city_n = as.factor('TOTAL'))

# Monthly crime counts per city label, with the 'TOTAL' series appended.
# The grouping is dropped before the two tables are stacked so that the
# result is a plain tibble with no leftover grouping metadata; later
# filter() calls on `alamed` then see every row, including 'TOTAL'.
alamed <- ala %>%
  filter(!is.na(month)) %>%
  group_by(month, city_n) %>%
  summarise(n_crime = n(), .groups = 'drop') %>%
  mutate(city_n = as.factor(city_n)) %>%
  bind_rows(alame)

# One line per city label (and one for 'TOTAL'), coloured by label.
plot1 <- alamed %>%
  ggplot(aes(month, n_crime)) +
  geom_line(aes(col = city_n)) +
  labs(col = 'city')
ggplotly(plot1)
Indicating the exact date of the peaks.
# Positions of the peaks on the x axis. as.yearmon() is applied to the whole
# vector of date strings at once and the result is stored as plain numbers.
peak <- tibble(
  date = as.numeric(as.yearmon(c(
    '2012-12-17', '2013-12-18', '2014-12-20', '2018-12-22', '2019-12-20'
  )))
)

# Overlay a dotted vertical line and a date label for each peak on plot1.
plot2 <- plot1 +
  geom_vline(
    data = peak,
    mapping = aes(xintercept = date),
    col = 'blue4',
    linetype = 'dotted'
  ) +
  geom_text(
    data = peak,
    mapping = aes(
      x = date,
      y = c(2360, 2500, 2340, 2270, 2020),
      label = c('2012-12-17', '2013-12-18', '2014-12-20',
                '2018-12-22', '2019-12-20')
    ),
    col = 'blue4'
  )
ggplotly(plot2)
Time series of different CrimeCode
options(dplyr.summarise.inform = FALSE)

# Count crimes per month and CrimeCode, draw one line per code, convert the
# plot to an interactive plotly widget and display it without its legend.
a <- alam %>%
  group_by(month, CrimeCode) %>%
  summarise(n_crime = n())
a <- ggplot(a, aes(month, n_crime, col = CrimeCode)) +
  geom_line()
a <- ggplotly(a)
hide_legend(a)
999: 72 HOUR MENTAL HEALTH / A PERSON WILLFULLY FLEES OR ATEMPTS TO ELUDE AN OFFICER
90D: DUI ALCOHOL/DRUGS
35A: USE/UNDER INFLUENCE OF CONTROLLED SUBSTANCE
Impact of Lock Down on Alameda Crime Numbers.
Based on the plot below, I hypothesized that the number of crimes during the lockdown would be lower than during the same period in other years.
# x positions (numeric year-month values) of the 17 March and 22 May
# boundaries in 2020, 2018 and 2019.
lockdown <- tibble(
  date = as.numeric(as.yearmon(c(
    '2020-03-17', '2020-05-22',
    '2018-03-17', '2018-05-22',
    '2019-03-17', '2019-05-22'
  )))
)

# Redraw plot1 on the months after January 2018 and mark the boundaries.
# The cutoff is an explicit yearmon built from a numeric 'YYYY-MM' string, so
# the comparison does not depend on parsing an English month name.
plot1 %+% filter(alamed, month > as.yearmon('2018-01')) +
  geom_vline(data = lockdown, aes(xintercept = date),
             linetype = 'dotted', col = 'blue4') +
  # annotate() draws the label exactly once; a geom_text() layer with
  # constant aesthetics would inherit the plot data and draw it once per row.
  annotate('text',
           x = as.numeric(as.yearmon('2020-04-22')), y = 2050,
           label = 'Lock down', colour = 'blue4', size = 3)
\(H_0 : \mu = 53.369\)
\(H_1 : \mu \neq 53.369\)
Population: the data from March 17 - May 22, 2012 - 2019.
\(\mu = 53.369\)
Sample : March 17 - May 22 (lock down), 2020.
\(\bar{x} = 39.119\)
Sample size: 67
Since the p-value is very small and the entire 95% confidence interval lies below \(\mu\), I reject the null hypothesis and conclude that the daily number of crimes during the lockdown is significantly lower than during the same period in the preceding years.
options(dplyr.summarise.inform = FALSE)

# Daily crime counts for 17 March - 22 May (inclusive) in each of the years
# 2012 to 2019. The month/day pair is encoded as the number MMDD so that the
# same window can be tested for every year with one comparison.
population_1 <- alam %>%
  filter(
    !is.na(date),
    lubridate::year(date) %in% 2012:2019,
    between(100 * lubridate::month(date) + lubridate::day(date), 317, 522)
  ) %>%
  group_by(date) %>%
  summarise(n_crime = n())
mu_test1 <- mean(population_1$n_crime)

# Daily crime counts for the same window in 2020.
sample_1 <- alam %>%
  filter(
    !is.na(date),
    date >= as.Date('2020-03-17'),
    date <= as.Date('2020-05-22')
  ) %>%
  group_by(date) %>%
  summarise(n_crime = n())

# Two-sided one-sample t-test of the 2020 daily counts against the mean
# daily count of the earlier years.
t.test(sample_1$n_crime, mu = mu_test1)
##
## One Sample t-test
##
## data: sample_1$n_crime
## t = -13.791, df = 66, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 53.3694
## 95 percent confidence interval:
## 37.05643 41.18237
## sample estimates:
## mean of x
## 39.1194
\(H_0 : \mu = 54.348\)
\(H_1 : \mu \neq 54.348\)
Population: the data from 2012-01-01 to 2019-12-31.
\(\mu = 54.348\)
Sample : the data from 2020-01-01 to 2020-11-27.
\(\bar{x} = 44.377\)
Sample size: 334
Since the p-value is very small and the entire 95% confidence interval lies below \(\mu\), I reject the null hypothesis and conclude that the daily number of crimes during the Covid-19 pandemic in 2020 is significantly lower than in the other years.
options(dplyr.summarise.inform = FALSE)

# First day of 2020: records before it form the reference population,
# records on or after it form the sample.
# Daily crime counts before 2020.
population_2 <- alam %>%
  filter(!is.na(date), date < as.Date('2020-01-01')) %>%
  group_by(date) %>%
  summarise(n_crime = n())
mu_test2 <- mean(population_2$n_crime)

# Daily crime counts from 2020-01-01 onwards.
sample_2 <- alam %>%
  filter(!is.na(date), date >= as.Date('2020-01-01')) %>%
  group_by(date) %>%
  summarise(n_crime = n())

# Two-sided one-sample t-test of the 2020 daily counts against the mean
# daily count of the earlier years.
t.test(sample_2$n_crime, mu = mu_test2)
##
## One Sample t-test
##
## data: sample_2$n_crime
## t = -13.885, df = 333, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 54.34839
## 95 percent confidence interval:
## 42.96465 45.78984
## sample estimates:
## mean of x
## 44.37725
All the crime events in different cities shown on the map
# Fetch a road-map background for the location 'Dublin, CA' and plot every
# record in `ala` on it as a small point coloured by its city label.
base <- get_map('Dublin, CA', zoom = 10, maptype = 'roadmap')
ggmap(base) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude, col = city_n),
    data = ala,
    size = 0.1
  ) +
  labs(title = 'Alameda County Crime Events 2012-2020')
Heatmap in Cities
options(dplyr.summarise.inform = FALSE)

# Number of records at each (Longitude, Latitude) pair, sorted ascending so
# that the locations with the most records are drawn last (on top).
alameda <- ala %>%
  group_by(Longitude, Latitude) %>%
  summarise(loc_n = n())
alameda <- arrange(alameda, loc_n)

# Plot each location on the `base` map, coloured by its record count.
ggmap(base) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude, col = loc_n),
    data = alameda
  ) +
  scale_colour_gradient(low = "#F8766D", high = "#132B43") +
  labs(title = 'Alameda County Crime Events 2012-2020')
Heatmap in Hayward
options(dplyr.summarise.inform = FALSE)

# Number of 2020 records at each location within the city 'HAYWARD', sorted
# ascending so that the busiest locations are drawn last.
hayward <- ala %>%
  filter(City == 'HAYWARD', year == '2020') %>%
  group_by(Longitude, Latitude) %>%
  summarise(location_n = n())
hayward <- arrange(hayward, location_n)

# Road-map background for 'Hayward'; reused by the CrimeCode map further on.
base_1 <- get_map('Hayward', zoom = 13, maptype = 'roadmap')

# Points coloured by record count, shown as an interactive plotly widget.
b <- ggmap(base_1) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude, col = location_n),
    data = hayward
  ) +
  scale_colour_gradient(low = "#00BFC4", high = "black") +
  labs(title = 'Hayward Crime Events 2020')
ggplotly(b)
Hayward-CrimeCode
options(dplyr.summarise.inform = FALSE)

# Records for the city 'HAYWARD' in the months strictly between 'Aug 2020'
# and 'Dec 2020', counted per location and CrimeCode.
hayward <- ala %>%
  filter(City == 'HAYWARD', month > 'Aug 2020' & month < 'Dec 2020') %>%
  group_by(Longitude, Latitude, CrimeCode) %>%
  summarise(crime_number = n())

# Code '999' goes into its own layer, added first so that the points of all
# other codes are drawn on top of it.
hayward_1 <- filter(hayward, CrimeCode == '999')
hayward_2 <- filter(hayward, CrimeCode != '999')

c <- ggmap(base_1) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude,
                  size = crime_number, col = CrimeCode),
    data = hayward_1
  ) +
  geom_point(
    mapping = aes(x = Longitude, y = Latitude,
                  size = crime_number, col = CrimeCode),
    data = hayward_2
  ) +
  labs(title = 'Hayward Crime Events (September - November, 2020)')
c <- ggplotly(c)
hide_legend(c)
The data table helps us look up the relevant information about each CrimeCode.
# Display the full `alam` data set with datatable() so records can be looked
# up, e.g. by CrimeCode.
datatable(alam)