A crime analysis of Washington, DC

The following is our analysis and conclusion:

library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.6     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()

library(stringr)
library(forcats)

DC_crimes <- read.csv(file = "dc-crimes-search-results.csv")
head(DC_crimes)

unique(DC_crimes$offense.text)

## [1] "theft/other"                "theft f/auto"              
## [3] "motor vehicle theft"        "robbery"                   
## [5] "burglary"                   "assault w/dangerous weapon"
## [7] "sex abuse"                  "homicide"                  
## [9] "arson"

First part:1. Basic information of DC

DC_crimes$offensegroup %>% 
  fct_count() %>% 
  mutate(pct = prop.table(n)) %>% 
  ggplot(aes(x = f, 
             y = pct, 
             fill = n, 
             label = scales::percent(pct)))+
  geom_col(position = "dodge")+
  geom_text(position = position_dodge(0.9),
            vjust = -0.5,
            size = 3)+
  scale_y_continuous(labels = scales::percent)+
  xlab("crime type")+
  ylab("percent of total")

DC_crimes$offensekey %>% 
  fct_count() %>% 
  mutate(nmid = mean(range(n))) %>% 
  ggplot(aes(x = f,
             y = n,
             fill = n))+
  geom_bar(stat = "identity")+
  geom_text(mapping = aes(label = n, y = n, 
            hjust = ifelse(n < nmid, -0.1, 1.1)), size = 3)+
  coord_flip()

DC_crimes %>% 
  ggplot(mapping = aes(x = WARD, 
                       y = offensegroup, 
                       color = offensegroup))+
  geom_col()+
  xlab(label = "the Ward")+
  ylab(label = "the offense group")+
  theme_bw()

Second part: 1. The relationship between crime ratio percent and ward.

library(dplyr)
library(readr)
library(ggplot2)

## read data
DC_crimes <- read_csv(file = "dc-crimes-search-results.csv",quote = "", col_types =
                        cols(WARD=col_character(), START_DATE = 
                             col_datetime(), START_DATE = col_datetime(), END_DATE = col_datetime()))

## Warning: 68317 parsing failures.
## row col   expected     actual                           file
##   1  -- 29 columns 30 columns 'dc-crimes-search-results.csv'
##   2  -- 29 columns 30 columns 'dc-crimes-search-results.csv'
##   3  -- 29 columns 30 columns 'dc-crimes-search-results.csv'
##   4  -- 29 columns 30 columns 'dc-crimes-search-results.csv'
##   5  -- 29 columns 30 columns 'dc-crimes-search-results.csv'
## ... ... .......... .......... ..............................
## See problems(...) for more details.

head(DC_crimes)

Ward <- read_csv(file = "Wards_from_2012.csv", col_types = cols(WARD = col_character()))
head(Ward)

ACS_population <- read_csv(file ="ACS_2017_Population_Variables_Tract.csv", col_types = 
                             cols(GEOID=col_character()))
head(ACS_population)

Census_Tracts <- read_csv(file = "Census_Tracts_in_2010.csv", col_types = 
                            cols(GEOID=col_character()))
head(Census_Tracts)

crime_by_ward <- group_by (DC_crimes, WARD) #group by ward
crime_by_ward

crime_by_ward_count <- summarise(crime_by_ward, count = n()) #ward count
crime_by_ward_count

Ward <- Ward[order(Ward$WARD),] #order by ward

#the crime ratio percent of each ward:
crime_by_ward_ratio <- crime_by_ward_count[,"count"]/Ward[,"POP_2011_2015"]
Ward_new <- data.frame(Crime_ratio = crime_by_ward_ratio[,1]*100,
                       Crime_count = crime_by_ward_count[,"count"], Ward, stringsAsFactors = FALSE)

#the crime ratio percent vs WARD
ggplot(data = Ward_new,aes(x = WARD,y = Crime_ratio)) +
  geom_bar(stat = "identity")

Conclusion: The crime ratio of ward 2 is the highest, and ward 3 is the lowest.

The relationship between crime ratio and Ward

#add seasons
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

DC_crimes %>% 
  mutate(Season = quarter(REPORT_DAT)) -> DC_crimes_new
crime_by_ward_season_count <- summarise(group_by(DC_crimes_new, WARD, Season), count=n())

## `summarise()` has grouped output by 'WARD'. You can override using the `.groups` argument.

crime_by_ward_season_count

Ward_new2 <- merge(crime_by_ward_season_count, Ward_new, by = "WARD")
Ward_new2 <- data.frame(Crime_season_ratio = Ward_new2[,"count.x"]/Ward_new2[,"POP_2011_2015"]*100, 
                       Ward_new2, stringsAsFactors = FALSE)

#4) season Crime ratio percent vs WARD
ggplot(data=Ward_new2,aes(x = Season, y = Crime_season_ratio)) + 
  geom_bar(stat="identity") + facet_grid(WARD~.)

ggplot(data=Ward_new2,aes(x = WARD, y = Crime_season_ratio)) + 
  geom_bar(stat="identity") + 
  facet_grid(Season~.)

ggplot(data = Ward_new2,aes(x = WARD, y = Crime_season_ratio,fill = factor(Season))) + 
  geom_bar(stat = "identity",position = "stack")

ggplot(data = Ward_new2,aes(x = Season, y = Crime_season_ratio, colour = factor(Season))) + 
  geom_boxplot()

Conclusion: From the season crime ratio percent vs WARD, we found that different wards are with different crime ratio in different seasons. For example, wards 2 is with the highest crime ratio in winter across 8 wards indicating that the police of wards 2 may be pay more attention at winter for the crime. Additionally, autumn is with the highest crime ration across 8 wards.

The relationship between season Crime ratio percent vs Unemployment Rate Percent

ggplot(data=Ward_new2,aes(x = Crime_season_ratio, y = UNEMPLOYMENT_RATE, colour=factor(WARD))) + 
  geom_point(size = 4) + 
  facet_grid(Season~.)

Conclusion: From the season crime ratio percent vs Unemployment Rate Percent, we found that wards with the highest crime ratio and the higher Unemployment Rate Percent. For example, the ward 3 is with the lowest people with Unemployment Rate Percent, and also showed lowest crime ratio percent in 4 seasons. The season crime ratio percent showed weak negative correlation with Unemployment Rate Percent.

The relationship between season Crime ratio percent vs MEDIAN_AGE

#6) season Crime ratio vs MEDIAN_AGE
ggplot(data=Ward_new2, aes(x = factor(MEDIAN_AGE), y = Crime_season_ratio, fill = WARD)) + 
  geom_bar(stat ="identity", position = "stack") + 
  facet_grid(Season~.)

Conclusion: From the season crime ratio percent vs MEDIAN_AGE, we found that ward 2 is with the highest crime ratio at average age of 30.9 group in 4 seasons. The youngest group of season crime rate is in ward 8 with average of age 29.3, and the average of season crime ratio of ward 4 is the largest (39.3). Ward 3 and 7 are both at 37 average age. The results suggest that the average age of crime ratio of different wards are different.

The relationship between season Crime ratio percent vs PCT_BELOW_POV

#7) season Crime ratio percent vs PCT_BELOW_POV
ggplot(data = Ward_new2, aes(x = Crime_season_ratio, y = PCT_BELOW_POV, colour = WARD)) + 
  geom_point(size=4) + 
  facet_grid(Season~.)

Conclusion: From the season crime ratio percent vs PCT_BELOW_POV plot, we found that the lower Percent below poverty for all people, the lower season crime ratio percent is. For example, the ward 3 is with the lowest crime ratio and also showed low Percent below poverty for all people in 4 seasons.

The relationship between seasons Crime ratio percent vs Per capita income in dollars/Median household income Dollars

#8) season Crime ratio percent vs Per capita income in dollars/Median household income Dollars
ggplot(data = Ward_new2, aes(x = Crime_season_ratio ,y = PER_CAPITA_INCOME, colour = WARD)) +
  geom_point(size = 4) + 
  facet_grid(Season~.)

ggplot(data = Ward_new2, aes(x = Crime_season_ratio ,y = MEDIAN_HH_INCOME,colour = WARD)) +
  geom_point(size = 4) + 
  facet_grid(Season~.)

Conclusion: From the season crime ratio percent vs Per capita income in dollars/Median household income Dollars plot, we found that the higher Per capita income in dollars or Median household income Dollars, the lower season crime ratio percent. For example, the ward 3 is with the highest income for family or person, and also showed lowest crime ratio percent in 4 seasons.

The relationship between seasons Crime ratio percent vs education degree

#9) season Crime ratio percent vs BACH_DEGREE25PLUS
ggplot(data = Ward_new2, aes(x = Crime_season_ratio ,y = BACH_DEGREE_25_PLUS, colour = WARD)) + 
  geom_point(size = 4) +
  facet_grid(Season~.)

Conclusion: From the season crime ratio percent vs education degree plot, we found that the more people Bachelor degree the lower season crime ratio percent. For example, the ward 3 is with the highest people with Bachelor degree, and also showed lowest crime ratio percent in 4 seasons.
In summary, through analysis the relationships pf crime ratio with Unemployment Rate Percent, Per capita income in dollars/Median household income Dollars, Percent below poverty for all people, Persons 25 years and over who are Bachelor degree and MEDIAN_AGE, the crime ratio of each ward showed that the higher Unemployment Rate Percent the higher ratio crime ratio, the higher Percent below poverty the higher crime ratio, the higher income the lower crime ratio, the more Persons 25 years and over who are Bachelor degree the lower crime ratio.

Third Part:

library(tidyverse)
library(tidycensus)
library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

## 
## Attaching package: 'tigris'

## The following object is masked from 'package:tidycensus':
## 
##     fips_codes

library(sf)

## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1

library(viridis) #a color palette

## Loading required package: viridisLite

options(tigris_use_cache = TRUE)
library(ggplot2)
library(magrittr)
library(lubridate)
library(usmap)

crime = read_csv(file = "dc-crimes-search-results.csv", 
                    col_types = cols(
                      offensegroup = col_factor(),
                      'offense-text' = col_factor(), 
                      DISTRICT = col_factor(), 
                      WARD = col_factor(), 
                      SHIFT = col_factor(),
                      METHOD= col_factor()
                    ))
head(crime)

crime %>% 
  mutate(cmonth = month(START_DATE, label=TRUE, abbr = TRUE),
         cmonthday = day(START_DATE),
         cweekday = wday(START_DATE,label = TRUE, abbr = TRUE),
         WARD = fct_inseq(WARD )
         )-> crime

crime %>%
  count(WARD,METHOD) %>%
  pivot_wider(names_from = METHOD, values_from = n)%>%
  ggplot(aes(x = WARD, y = (others+knife+gun)))+
  geom_point()+
  xlab("WARD")+
  ylab('total crime')+
  theme_bw()

Summarize by day for each ward and method

crime %>% 
  group_by(WARD,YEAR, cmonth, cmonthday, cweekday, METHOD) %>%
      summarize(tot_daily_crimes = n()) -> sumcrime

## `summarise()` has grouped output by 'WARD', 'YEAR', 'cmonth', 'cmonthday', 'cweekday'. You can override using the `.groups` argument.

  sumcrime

Look at total by month by WARD

sumcrime %>% 
group_by(YEAR, cmonth, WARD, METHOD) %>% 
    summarize(tot_crimes = sum(tot_daily_crimes)) ->
      sumWARD

## `summarise()` has grouped output by 'YEAR', 'cmonth', 'WARD'. You can override using the `.groups` argument.

sumWARD %>% 
  ggplot(aes(x= cmonth, y=tot_crimes)) + 
    geom_boxplot() +
    facet_wrap(~METHOD, scales = "free")

  aout <-  aov(tot_crimes ~  cmonth + WARD + METHOD, data = sumWARD)
  summary(aout)

##              Df   Sum Sq Mean Sq F value   Pr(>F)    
## cmonth       11   203637   18512   1.986   0.0276 *  
## WARD          7   689178   98454  10.560 1.54e-12 ***
## METHOD        2 10276793 5138397 551.161  < 2e-16 ***
## Residuals   585  5453876    9323                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Look at by population

DC_ward = read_csv('./Wards_from_2012.csv', col_types = cols(
                    WARD = col_factor(),
                    NAME = col_factor(), 
                    REP_NAME = col_factor(), 
                    WEB_URL = col_character(),
                    REP_PHONE = col_factor(), 
                    REP_EMAIL = col_factor(), 
                    REP_OFFICE = col_character(), 
                    LABEL = col_factor(), 
                    POP_25_PLUS = col_factor(), 
                    MARRIED_COUPLE_FAMILY = col_factor()))

ACSpop = read_csv('./ACS_2017_Population_Variables_Tract.csv', col_types = cols(
  .default = col_factor(),
  NAME = col_factor(),
  State = col_factor(),
  County = col_factor()
))

DC_ward %>% 
  mutate(WARD = fct_inseq(WARD))->
  DC_ward
sumWARD %>% 
  filter(YEAR == 2019) %>% 
  group_by(WARD, METHOD) %>% 
  summarize(n = sum(tot_crimes)) %>% 
  pivot_wider(names_from = METHOD, values_from = n) %>% 
  right_join(DC_ward) %>% 
  mutate(crimes_per_pop = (others + gun + knife)/POP_2011_2015) -> 
  DC_ward

## `summarise()` has grouped output by 'WARD'. You can override using the `.groups` argument.

## Joining, by = "WARD"

DC_ward %>% 
  ggplot(aes(WARD, crimes_per_pop)) +
  geom_point()

crime%>%
  count(WARD,METHOD)%>%
  pivot_wider(names_from = METHOD, values_from = n)%>%
  mutate(total_crime = (others+knife+gun)) ->crime1
crime1

DC_ward%>%
  select(WARD, MEDIAN_AGE, UNEMPLOYMENT_RATE,crimes_per_pop)%>%
  arrange(WARD)->ward1
ward1

left_join(crime1, ward1, by = 'WARD')

left_join(crime1, ward1, by = 'WARD')%>%
  ggplot(aes(x = MEDIAN_AGE, y = crimes_per_pop, color = WARD))+
  geom_point()+
  facet_wrap(~MEDIAN_AGE)

left_join(crime1, ward1, by = 'WARD')%>%
  ggplot(aes(x = WARD, y = crimes_per_pop, color = WARD))+
  geom_point()+
  facet_wrap(~UNEMPLOYMENT_RATE)

DC_ward %>% 
  pivot_longer(cols = others:knife, names_to = "Method", values_to = "crimes" ) %>% 
  group_by(WARD, Method) %>% 
  summarize(crimes_per_popM = sum(crimes)/POP_2011_2015, 
            MEDIAN_AGE = mean(MEDIAN_AGE), 
            MEDIAN_HH_INCOME = mean(MEDIAN_HH_INCOME)) ->
  DC_ward_sum

## `summarise()` has grouped output by 'WARD'. You can override using the `.groups` argument.

DC_ward_sum %>% 
  filter(WARD != "3") %>% 
  ggplot(aes(MEDIAN_HH_INCOME, crimes_per_popM, color = WARD))+
  geom_point()+
  geom_smooth(aes(color = "blue"), se = FALSE, method = "lm")+
facet_wrap(~Method, scales ="free")

## `geom_smooth()` using formula 'y ~ x'

DC_ward%>%
  filter(WARD != 7)%>%
  mutate(pop_increase_rate = (POP_2011_2015 - POP_2000)/15)%>%
  ggplot(aes(x = pop_increase_rate, y = crimes_per_pop, color = WARD))+
  geom_point()+
  xlab('population increase rate')+
  ylab('crime rate')+
  geom_smooth(aes(color = "blue"), se = FALSE, method = "lm")+
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

DC_ward%>%
  mutate(pop_increase_rate = (POP_2011_2015 - POP_2000)/15)%>%
  mutate(pop_density = (POP_2011_2015/AREASQMI))%>%
  ggplot(aes(x = pop_density, y = crimes_per_pop, color = WARD))+
  geom_point()+
  theme_bw()

v17 <- load_variables(2017, "acs5", cache = TRUE)
categories <- enframe(sort(unique(str_extract(v17$concept,"^.{35}"))))

dc_median <- get_acs(state = "DC", county = "District of Columbia", geography = "tract", 
                  variables = "B19013_001", geometry = TRUE, key = Sys.getenv("CENSUS_API_KEY"))

## Getting data from the 2015-2019 5-year ACS

dc_median %>% 
  mutate(CENSUS_TRACT = str_sub(GEOID,6,11)) ->
  dc_median
head(dc_median)

# Get a new set of shape files
    ward_geom <- read_sf("DC_Ward_Tracts_in_2010_Shapefiles 2")
    ward_geom %>% 
      mutate(WARD = parse_factor(as.character(WARD)), 
             WARD = fct_inseq(WARD )) ->
      ward_geom
ward_geom <- cbind(ward_geom, st_coordinates(st_centroid(ward_geom$geometry)))

dc_median %>% 
ggplot(aes(fill = estimate)) +
    geom_sf(aes(geometry = geometry),color = NA) +
    coord_sf(crs = 26915) + 
    scale_fill_viridis_c()

crime %>% 
  filter(METHOD != "others") %>% 
  group_by(CENSUS_TRACT, METHOD) %>%
  count() %>% 
  left_join(dc_median) %>% 
  ggplot(aes(fill = n)) +
    geom_sf(aes(geometry = geometry),color = NA) +
    coord_sf(crs = 26915) + 
    scale_fill_viridis_c() +
    facet_wrap(~METHOD)

## Joining, by = "CENSUS_TRACT"

wards <- read_sf("./Wards_from_2012.csv")
wards %>% 
  mutate(WARD = parse_factor(as.character(WARD))) ->
  wards

crime %>% 
  filter(METHOD == "gun") %>% 
  group_by(WARD, METHOD) %>%
  count() %>% 
  left_join(ward_geom) %>% 
  ggplot(aes(fill = n)) +
    geom_sf(aes(geometry = geometry),color = NA) +
    coord_sf(crs = 26915) + 
    scale_fill_viridis_c() +
    #facet_wrap(~METHOD, ncol = 2) +
  geom_text(aes(X, Y, label = WARD), size = 5, color = 'red')

## Joining, by = "WARD"

Conclusion We have found out the crime rate and compare with different virable. First, there is no significant relation between unemployment rate and crime rate and also age is not a useful observation. But we have figured out some relation for house hold income’s relationship with crime rate. The higher income that people have the lower gun and knife crime rate. It shows that for higher income place, it may has an higher security level. Population increase also come with higher crime rate that has shown in the graph. The higher increase rate, the higher crime rate and is no relate with total population.

References:
Jr. Martinez, R., & Jr. Valenzuela, A. (2006).Immigration and crime : Race, Ethnicity, and Violence. New York University Press.

Savage, J. (2006). Interpreting “Percent Black”: An analysis of race and violent crime in Washington D.C. Journal of Ethnicity in Criminal Justice, 4(1-2), 29–63. https://doi.org/10.1300/J222v04n01_02

A crime analysis of Washington, DC

5/1/2020

Team members:

Introduction:

The following is our analysis and conclusion: