Objective

  • Understand crime in WDC by exploring data at both macro and micro levels, make recommendations for 2018, and then make future predictions.

Goals

  1. Find total offenses by each factor group
    • Historically
    • Latest Year 2017
  2. Find total offenses by each offense type/ward group
    • Historically
    • Latest Year 2017
  3. Trend over time (last 3 years)
    • total offenses
    • total offenses by type
  4. Predict total offenses for the next 3 months

Get Data

a = read_csv('raw.csv') %>% 
  clean_names() %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    report_dat,
    month, day, year, hour, minute, second,
    where(is.Date),
    where(is.character),
    where(is.factor),
    where(is.numeric)
  )

Resources

  1. WDC Wards
  2. psa = police service area

sample data

a %>% sample_n(10)

glimpse structure

a %>% glimpse
## Rows: 342,867
## Columns: 30
## $ report_dat           <chr> "8/31/2008 8:47:00 PM", "9/1/2008 12:45:00 AM"...
## $ month                <dbl> 8, 9, 9, 9, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9...
## $ day                  <dbl> 31, 1, 1, 9, 24, 24, 25, 1, 1, 1, 9, 9, 9, 9, ...
## $ year                 <dbl> 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008...
## $ hour                 <dbl> 20, 0, 3, 7, 20, 21, 6, 13, 14, 17, 15, 16, 18...
## $ minute               <dbl> 47, 45, 0, 46, 0, 40, 0, 30, 0, 10, 35, 15, 30...
## $ second               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ anc                  <chr> "2E", "2B", "2C", "2B", "2C", "2C", "2A", "2B"...
## $ block                <chr> "3500 - 3599 BLOCK OF R STREET NW", "2000 - 20...
## $ block_group          <chr> "000300 1", "005500 5", "005800 1", "005301 3"...
## $ crimetype            <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date             <chr> "8/31/2008 6:40:00 PM", "8/31/2008 11:30:00 PM...
## $ ew                   <chr> "West", "West", "East", "West", "East", "East"...
## $ method               <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 4", "Cluster 6", "Cluster 8", "Cluste...
## $ ns                   <chr> "North", "North", "North", "North", "North", "...
## $ offense              <chr> "THEFT/OTHER", "MOTOR VEHICLE THEFT", "THEFT/O...
## $ quad                 <chr> "Northwest", "Northwest", "Northeast", "Northw...
## $ shift                <chr> "EVENING", "MIDNIGHT", "MIDNIGHT", "DAY", "EVE...
## $ start_date           <chr> "8/30/2008 9:30:00 PM", "8/31/2008 7:30:00 PM"...
## $ voting_precinct      <chr> "Precinct 6", "Precinct 14", "Precinct 129", "...
## $ ccn                  <dbl> 8123749, 8123824, 8123835, 8127848, 8120153, 8...
## $ census_tract         <dbl> 300, 5500, 5800, 5301, 5900, 5800, 5600, 4202,...
## $ district             <dbl> 2, 2, 1, 2, 1, 1, 2, 3, 3, 1, 3, 2, 1, 2, 1, 2...
## $ psa                  <dbl> 206, 208, 101, 208, 102, 105, 207, 301, 308, 1...
## $ ward                 <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ x                    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ x1                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ xblock               <dbl> -77.07021, -77.04532, -77.02704, -77.04009, -7...
## $ yblock               <dbl> 38.91363, 38.90856, 38.89906, 38.90964, 38.894...

counts of unique levels

a %>% map_df(n_unique)

clean data

#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
  mutate(
    report_dat = anytime::anydate(report_dat),
    #start_date = anytime::anydate(start_date),
    #end_date = anytime::anydate(end_date),
    across(where(is.character), factor),
    census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
    ward = factor(ward, levels = a$ward %>% unique %>% sort),
    psa = factor(psa, levels = a$psa %>% unique %>% sort),
    year = factor(year, levels = a$year %>% unique %>% sort),
    month = factor(month, levels = a$month %>% unique %>% sort),
    day = factor(day, levels = a$day %>% unique %>% sort),
    hour = factor(hour, levels = a$hour %>% unique %>% sort)
    ) %>%
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    month, day, year, hour,
    where(is.character),
    where(is.factor),
    where(is.numeric) 
    ) %>% arrange(report_dat, crimetype, offense)
    
#abak = a
#saveRDS(abak, 'cleaned_data.RDS')
#a = abak

If we wanted to do more detailed geographic analysis, we might want to include block and block group

EDA: factor vars

sample data

a %>% select(where(is.factor)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.factor)) %>% glimpse
## Rows: 212,278
## Columns: 12
## $ month        <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day          <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year         <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour         <fct> 3, 10, 22, 9, 3, 14, 3, 13, 15, 5, 22, 8, 10, 8, 14, 3...
## $ census_tract <fct> 1001, 8702, 9301, 8001, 7809, 7504, 7409, 9204, 9509, ...
## $ crimetype    <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method       <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense      <fct> BURGLARY, BURGLARY, BURGLARY, BURGLARY, BURGLARY, BURG...
## $ psa          <fct> 202, 502, 504, 108, 608, 701, 704, 502, 405, 502, 108,...
## $ quad         <fct> Northwest, Northeast, Northeast, Northeast, Northeast,...
## $ shift        <fct> MIDNIGHT, DAY, EVENING, DAY, MIDNIGHT, DAY, MIDNIGHT, ...
## $ ward         <fct> 3, 5, 5, 6, 7, 8, 8, 5, 5, 5, 6, 6, 7, 1, 2, 2, 3, 4, ...

check missing values

a %>% select(where(is.factor)) %>% miss_var_summary

With so little data missing, I feel comfortable omitting rows with NAs

a = a %>% na.omit()

distribution of level counts per factor

#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)

a %>% select(where(is.factor)) %>% map_df(n_unique)
a %>% select(where(is.factor)) %>%
  map_df(n_unique) %>%
  pivot_longer(. , everything(), 'features') %>%
  plot_ly(x = ~features, y = ~value, color = ~features, colors = jpal) %>% 
  add_bars() %>%
  layout(title = 'Unique Level Counts per Factor')

reference: names of unique levels

a %>% select(where(is.factor)) %>% map(unique)
## $month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
## 
## $day
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 
## $hour
##  [1] 3  10 22 9  14 13 15 5  8  17 1  16 2  20 11 0  19 12 4  6  23 18 21 7 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 
## $census_tract
##   [1] 1001  8702  9301  8001  7809  7504  7409  9204  9509  11100 6801  10600
##  [13] 7803  4400  5201  1200  2502  1902  4600  8803  6804  4701  5900  6700 
##  [25] 6900  3200  3800  202   5800  10100 5301  1302  1100  8402  11000 7708 
##  [37] 7304  10700 7200  7707  7703  7502  9810  3100  2501  9504  6400  7601 
##  [49] 9700  7404  2801  100   8903  8804  7603  7804  7407  3000  2102  2400 
##  [61] 8904  3301  9508  8802  9503  6802  10400 9803  7403  4202  600   501  
##  [73] 1600  1702  8301  6600  8200  4902  4801  9902  9804  4001  2002  9102 
##  [85] 10200 7408  7807  502   2101  9302  7000  7709  1901  10900 9811  9400 
##  [97] 8701  8100  8002  7806  300   1500  4702  9603  9602  7604  7406  3500 
## [109] 1804  9905  9601  2702  3400  9904  7808  9906  9801  4100  5002  701  
## [121] 1402  3302  7401  4002  6202  5500  4802  2802  3900  8410  10500 9903 
## [133] 7605  3700  9802  9505  1301  7100  7503  2701  201   5600  3600  9501 
## [145] 8302  2900  10800 5001  4300  4201  2600  10300 9604  7903  4901  9907 
## [157] 9201  6500  2202  9807  9901  400   1002  901   7901  2201  9507  902  
## [169] 1401  2301  1803  801   9203  702   9000  2302  802   2001  7301 
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
## 
## $crimetype
## [1] Non-Violent Violent    
## Levels: Non-Violent Violent
## 
## $method
## [1] OTHERS KNIFE  GUN   
## Levels: GUN KNIFE OTHERS
## 
## $offense
## [1] BURGLARY                   MOTOR VEHICLE THEFT       
## [3] THEFT F/AUTO               THEFT/OTHER               
## [5] ASSAULT W/DANGEROUS WEAPON HOMICIDE                  
## [7] ROBBERY                    SEX ABUSE                 
## [9] ARSON                     
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
## 
## $psa
##  [1] 202 502 504 108 608 701 704 405 104 602 305 301 208 404 402 103 506 501 503
## [20] 102 107 302 303 304 409 206 101 307 207 203 105 603 705 505 106 604 702 708
## [39] 403 607 706 507 606 406 407 707 204 401 308 605 201 601 703 408 306 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
## 
## $quad
## [1] Northwest Northeast Southeast Southwest
## Levels: Northeast Northwest Southeast Southwest
## 
## $shift
## [1] MIDNIGHT DAY      EVENING 
## Levels: DAY EVENING MIDNIGHT
## 
## $ward
## [1] 3 5 6 7 8 1 2 4
## Levels: 1 2 3 4 5 6 7 8

Goal 1. Find total offenses by each factor group X

Historically (2012 - 2016)

a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')
## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories

a %>% filter(year != 2017) %>% count(psa, sort = TRUE, name = 'count') %>%
  head(10) %>%
  mutate(psa = factor(psa)) %>%
  mutate(psa = fct_reorder(psa, -count)) %>% 
  plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>% 
  add_bars() %>% 
  layout(
    title = 'Police Service Areas with the Most Crime'
  )

Observations:

  1. Most offenses occurs late summer to early fall (top 3 months: Oct, July, Aug)
  2. More offenses occurs towards the latter half the month (>= Day 16)
  3. Non-violent offenses occur ~5x as much as violent offenses
  4. Theft is, by far, the most common offense
    • auto-theft is so predominant, it has its own category
  5. The Northeast quad has significantly more crime
    • can partly be attributed to its slightly larger size as seen below in the quick map
  6. Wards 2 and 6 are the most dangerous wards by a good margin
  7. PSA service areas 207, 302, and 208 have the most crime by a good margin

Latest Year 2017

a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')
## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories

a %>% filter(year == 2017) %>% count(psa, sort = TRUE, name = 'count') %>%
  head(10) %>%
  mutate(psa = factor(psa)) %>%
  mutate(psa = fct_reorder(psa, -count)) %>% 
  plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>% 
  add_bars() %>% 
  layout(
    title = 'Police Service Areas with the Most Crime'
  )

quick map

ipal = grDevices::colorRampPalette(brewer.pal(12,'Paired'))(56)

a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~quad, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Quadrants')
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Wards')
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~psa, colors = ipal) %>% add_markers() %>% layout(title = 'WDC PSAs') %>% hide_legend()

Observations relative to Historic

  1. January and May 2017 were atypically more dangerous
  2. Non-violent offenses made up a greater percent of total offenses then they did historically (5x), occurring about ~6.5 x as much as violent offenses
  3. Wards 2 and 6 continue to be the top 2 dangerous wards
  4. PSA service areas 207, 302, and 208 continue to have the most crime, albeit by a smaller margin

Recommendations Batch 1

Recommendations for the year 2018 based on 2017 findings

  1. Staff More Policemen and women from 1 to 3 pm..
  2. Staff More Policemen and women in wards 2 and 6.
  3. Staff More Policemen and women in PSA 208.

Goal 2. Find total offenses by each offense type/ward group

Historically (2012 - 2016)

ggplotly(
  a %>% filter(year != 2017) %>%
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations:

  1. Ward 2 has the greatest number of cases as observed above, but this is disproportionately mostly composed of theft/other offenses
    • the offense count of its second largest offense category, theft/auto, is even slightly less than that of Ward 1’s’
  2. Ward 7 has a relatively high proportion of motor vehicle theft offenses
  3. Ward 8 has a relatively high proportion of burglary offenses

Latest Year 2017

ggplotly(
  a %>% filter(year == 2017) %>% 
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations relative to Historic

  1. Ward 2 continues to have the greatest number of cases as observed above, but this is disproportionately mostly composed of theft/other offenses
  2. Ward 6’s share of theft/other is higher
  3. Relative to other wards, in 2017, Wards 5 and 6 are more dangerous than they were historically
    • on an absolute basis, all wards are less dangerous
      • see time series graphs below

Recommendations Batch 2

Recommendations for the year 2018 based on 2017 findings

  1. Work on educating citizens on proper theft deterrent methods, especially in ward 2; perhaps install more cameras there.
  2. Looking at wards 7 and 8, it seems like motor vehicle theft may be correlated by assault with a dangerous weapon. + Check out if both offenses occurred simultaneously.

Goal 3. Trend over time

# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
  monthkey = lubridate::make_datetime(
    as.numeric(as.character(year)),
    as.numeric(as.character(month)),
    1)
  ) %>% relocate(report_dat, monthkey, everything())

#Total Offenses over time

Total Offenses by Month

a %>% group_by(monthkey) %>%
  summarise(total.offenses = n()) %>%
  ungroup() %>% 
  plot_ly(x = ~monthkey, y =~total.offenses) %>%
  add_lines(size = I(3)) %>% layout(
  xaxis = list(title = ''),
  yaxis = list(title = ''),
  title = 'Total Offenses by Month'
  )

Observations

  1. Crime follows clear seasonality
    • peaks occurring late summer/early fall
  2. Peaked in mid 2014
  3. Troughed in early 2015

Total Offenses by Month/Type

ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = offense)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)

Total Offenses by Month/Ward

ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = ward)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) 

EDA: num vars

sample data

a %>% select(where(is.numeric)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.numeric)) %>% glimpse
## Rows: 211,502
## Columns: 2
## $ xblock <dbl> -77.08439, -77.00204, -76.99310, -76.98720, -76.92713, -76.9...
## $ yblock <dbl> 38.95470, 38.91445, 38.92946, 38.89478, 38.90078, 38.86353, ...

check missing values

a %>% select(where(is.numeric)) %>% miss_var_summary

Anomaly Detection

Total Offenses by Month

library(anomalize)
## == Use anomalize to improve your Forecasts by 50%! =========================================================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)

# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.

# max_anoms: The maximum percent of anomalies permitted to be identified.

a.anomalize = a %>%
  group_by(monthkey) %>% 
  summarise(total.offenses = n()) %>%
  time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
  time_recompose()
## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
  filter(monthkey < as.Date('2017-11-01')) %>% 
  plot_anomalies(
    ncol = 1,
    alpha_dots = 0.5,
    alpha_circles = 0.5,
    size_circles = 1.5,
    time_recomposed = TRUE,
    alpha_ribbon = 0.05
    ) + scale_y_continuous(labels = comma) +
  labs(x = '', y = 'total.offenses', title = 'total.offenses')
)

There was statistically high crime activity in August 2014 There was statistically low crime activity in August 2017

Predict Total Offenses for the 3 Months

Create Model

library(prophet)

#renaming cols to prophet's col conventions
a.agg = a %>%
  #filter(crimetype == 'Violent') %>% 
  group_by(report_dat = round_date(report_dat,'month')) %>% 
  summarise(total.offenses = n()) %>% 
  select(ds = report_dat, y = total.offenses)

#creating model
a.agg.model = a.agg %>% prophet()

#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
  periods = 3, #next 2 months
  freq = 'month') 

#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)

#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot forecast components
a.agg.model %>% prophet_plot_components(a.agg.forecast)

Looking at bottom chart of the component graphs, we can see crime has seasonal monthly peaks

Prediction Summary

a.agg.forecast %>% tail(3) %>% select(ds, yhat) %>% 
  rename(month = ds, total_offenses_prediction = yhat)