## Rows: 342,867
## Columns: 30
## $ report_dat <chr> "8/31/2008 8:47:00 PM", "9/1/2008 12:45:00 AM"...
## $ month <dbl> 8, 9, 9, 9, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9...
## $ day <dbl> 31, 1, 1, 9, 24, 24, 25, 1, 1, 1, 9, 9, 9, 9, ...
## $ year <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008...
## $ hour <dbl> 20, 0, 3, 7, 20, 21, 6, 13, 14, 17, 15, 16, 18...
## $ minute <dbl> 47, 45, 0, 46, 0, 40, 0, 30, 0, 10, 35, 15, 30...
## $ second <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ anc <chr> "2E", "2B", "2C", "2B", "2C", "2C", "2A", "2B"...
## $ block <chr> "3500 - 3599 BLOCK OF R STREET NW", "2000 - 20...
## $ block_group <chr> "000300 1", "005500 5", "005800 1", "005301 3"...
## $ crimetype <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date <chr> "8/31/2008 6:40:00 PM", "8/31/2008 11:30:00 PM...
## $ ew <chr> "West", "West", "East", "West", "East", "East"...
## $ method <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 4", "Cluster 6", "Cluster 8", "Cluste...
## $ ns <chr> "North", "North", "North", "North", "North", "...
## $ offense <chr> "THEFT/OTHER", "MOTOR VEHICLE THEFT", "THEFT/O...
## $ quad <chr> "Northwest", "Northwest", "Northeast", "Northw...
## $ shift <chr> "EVENING", "MIDNIGHT", "MIDNIGHT", "DAY", "EVE...
## $ start_date <chr> "8/30/2008 9:30:00 PM", "8/31/2008 7:30:00 PM"...
## $ voting_precinct <chr> "Precinct 6", "Precinct 14", "Precinct 129", "...
## $ ccn <dbl> 8123749, 8123824, 8123835, 8127848, 8120153, 8...
## $ census_tract <dbl> 300, 5500, 5800, 5301, 5900, 5800, 5600, 4202,...
## $ district <dbl> 2, 2, 1, 2, 1, 1, 2, 3, 3, 1, 3, 2, 1, 2, 1, 2...
## $ psa <dbl> 206, 208, 101, 208, 102, 105, 207, 301, 308, 1...
## $ ward <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ x <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ x1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ xblock <dbl> -77.07021, -77.04532, -77.02704, -77.04009, -7...
## $ yblock <dbl> 38.91363, 38.90856, 38.89906, 38.90964, 38.894...
#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
mutate(
report_dat = anytime::anydate(report_dat),
#start_date = anytime::anydate(start_date),
#end_date = anytime::anydate(end_date),
across(where(is.character), factor),
census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
ward = factor(ward, levels = a$ward %>% unique %>% sort),
psa = factor(psa, levels = a$psa %>% unique %>% sort),
year = factor(year, levels = a$year %>% unique %>% sort),
month = factor(month, levels = a$month %>% unique %>% sort),
day = factor(day, levels = a$day %>% unique %>% sort),
hour = factor(hour, levels = a$hour %>% unique %>% sort)
) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
month, day, year, hour,
where(is.character),
where(is.factor),
where(is.numeric)
) %>% arrange(report_dat, crimetype, offense)
#abak = a
#saveRDS(abak, 'cleaned_data.RDS')
#a = abak## Rows: 212,278
## Columns: 12
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour <fct> 3, 10, 22, 9, 3, 14, 3, 13, 15, 5, 22, 8, 10, 8, 14, 3...
## $ census_tract <fct> 1001, 8702, 9301, 8001, 7809, 7504, 7409, 9204, 9509, ...
## $ crimetype <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense <fct> BURGLARY, BURGLARY, BURGLARY, BURGLARY, BURGLARY, BURG...
## $ psa <fct> 202, 502, 504, 108, 608, 701, 704, 502, 405, 502, 108,...
## $ quad <fct> Northwest, Northeast, Northeast, Northeast, Northeast,...
## $ shift <fct> MIDNIGHT, DAY, EVENING, DAY, MIDNIGHT, DAY, MIDNIGHT, ...
## $ ward <fct> 3, 5, 5, 6, 7, 8, 8, 5, 5, 5, 6, 6, 7, 1, 2, 2, 3, 4, ...
#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)
a %>% select(where(is.factor)) %>% map_df(n_unique)## $month
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
##
## $day
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
##
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
##
## $hour
## [1] 3 10 22 9 14 13 15 5 8 17 1 16 2 20 11 0 19 12 4 6 23 18 21 7
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
##
## $census_tract
## [1] 1001 8702 9301 8001 7809 7504 7409 9204 9509 11100 6801 10600
## [13] 7803 4400 5201 1200 2502 1902 4600 8803 6804 4701 5900 6700
## [25] 6900 3200 3800 202 5800 10100 5301 1302 1100 8402 11000 7708
## [37] 7304 10700 7200 7707 7703 7502 9810 3100 2501 9504 6400 7601
## [49] 9700 7404 2801 100 8903 8804 7603 7804 7407 3000 2102 2400
## [61] 8904 3301 9508 8802 9503 6802 10400 9803 7403 4202 600 501
## [73] 1600 1702 8301 6600 8200 4902 4801 9902 9804 4001 2002 9102
## [85] 10200 7408 7807 502 2101 9302 7000 7709 1901 10900 9811 9400
## [97] 8701 8100 8002 7806 300 1500 4702 9603 9602 7604 7406 3500
## [109] 1804 9905 9601 2702 3400 9904 7808 9906 9801 4100 5002 701
## [121] 1402 3302 7401 4002 6202 5500 4802 2802 3900 8410 10500 9903
## [133] 7605 3700 9802 9505 1301 7100 7503 2701 201 5600 3600 9501
## [145] 8302 2900 10800 5001 4300 4201 2600 10300 9604 7903 4901 9907
## [157] 9201 6500 2202 9807 9901 400 1002 901 7901 2201 9507 902
## [169] 1401 2301 1803 801 9203 702 9000 2302 802 2001 7301
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
##
## $crimetype
## [1] Non-Violent Violent
## Levels: Non-Violent Violent
##
## $method
## [1] OTHERS KNIFE GUN
## Levels: GUN KNIFE OTHERS
##
## $offense
## [1] BURGLARY MOTOR VEHICLE THEFT
## [3] THEFT F/AUTO THEFT/OTHER
## [5] ASSAULT W/DANGEROUS WEAPON HOMICIDE
## [7] ROBBERY SEX ABUSE
## [9] ARSON
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
##
## $psa
## [1] 202 502 504 108 608 701 704 405 104 602 305 301 208 404 402 103 506 501 503
## [20] 102 107 302 303 304 409 206 101 307 207 203 105 603 705 505 106 604 702 708
## [39] 403 607 706 507 606 406 407 707 204 401 308 605 201 601 703 408 306 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
##
## $quad
## [1] Northwest Northeast Southeast Southwest
## Levels: Northeast Northwest Southeast Southwest
##
## $shift
## [1] MIDNIGHT DAY EVENING
## Levels: DAY EVENING MIDNIGHT
##
## $ward
## [1] 3 5 6 7 8 1 2 4
## Levels: 1 2 3 4 5 6 7 8
a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories
a %>% filter(year != 2017) %>% count(psa, sort = TRUE, name = 'count') %>%
head(10) %>%
mutate(psa = factor(psa)) %>%
mutate(psa = fct_reorder(psa, -count)) %>%
plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>%
add_bars() %>%
layout(
title = 'Police Service Areas with the Most Crime'
)Observations:
a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories
ipal = grDevices::colorRampPalette(brewer.pal(12,'Paired'))(56)
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~quad, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Quadrants')a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Wards')a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~psa, colors = ipal) %>% add_markers() %>% layout(title = 'WDC PSAs') %>% hide_legend()Observations relative to Historic
Recommendations for the year 2018 based on 2017 findings
ggplotly(
a %>% filter(year != 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations:
ggplotly(
a %>% filter(year == 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations relative to Historic
Recommendations for the year 2018 based on 2017 findings
# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
monthkey = lubridate::make_datetime(
as.numeric(as.character(year)),
as.numeric(as.character(month)),
1)
) %>% relocate(report_dat, monthkey, everything())#Total Offenses over time
a %>% group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
ungroup() %>%
plot_ly(x = ~monthkey, y =~total.offenses) %>%
add_lines(size = I(3)) %>% layout(
xaxis = list(title = ''),
yaxis = list(title = ''),
title = 'Total Offenses by Month'
)Observations
ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = offense)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = ward)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) ## Rows: 211,502
## Columns: 2
## $ xblock <dbl> -77.08439, -77.00204, -76.99310, -76.98720, -76.92713, -76.9...
## $ yblock <dbl> 38.95470, 38.91445, 38.92946, 38.89478, 38.90078, 38.86353, ...
## == Use anomalize to improve your Forecasts by 50%! =========================================================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
a.anomalize = a %>%
group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
time_recompose()## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
filter(monthkey < as.Date('2017-11-01')) %>%
plot_anomalies(
ncol = 1,
alpha_dots = 0.5,
alpha_circles = 0.5,
size_circles = 1.5,
time_recomposed = TRUE,
alpha_ribbon = 0.05
) + scale_y_continuous(labels = comma) +
labs(x = '', y = 'total.offenses', title = 'total.offenses')
)library(prophet)
#renaming cols to prophet's col conventions
a.agg = a %>%
#filter(crimetype == 'Violent') %>%
group_by(report_dat = round_date(report_dat,'month')) %>%
summarise(total.offenses = n()) %>%
select(ds = report_dat, y = total.offenses)
#creating model
a.agg.model = a.agg %>% prophet()
#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
periods = 3, #next 2 months
freq = 'month')
#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)
#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.