DATA 607 - Project 2.2

# install.packages("tidyverse")
library(readxl)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)

Data Posted by Brian Liles
NYC COMPSTAT Data
http://www1.nyc.gov/site/nypd/stats/crime-statistics/historical.page

Downloaded XLS file for “Citywide Non-Seven Major Felony Offenses 2000-2017”

Here’s how the data looks initially: image:

Read XLS file, rows 5 through 13

# Get path [in platform independent way using file.path() function]
path <- file.path("~", "R", "Project 2", "non-seven-major-felony-offenses-2000-2017.xls")
df <-  read_xls(path, sheet = 1, cell_rows(5:13)) %>% tbl_df()
df

## # A tibble: 8 x 19
##   OFFENSE   `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007` `2008`
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 FELONY P~  2194.  2114.  1976.  1545.  1358.  1250.  1175.  1073.  1016.
## 2 FORGERY/~  5714.  5594.  5859.  8112. 12984. 12896. 12579. 12075. 12216.
## 3 ARSON      3189.  2681.  1902.  1492.  1789.  1831.  1679.  1527.  1896.
## 4 FELONY S~  1839.  1831.  1513.  1295.  1263.  1162.  1096.  1031.   909.
## 5 FELONY D~ 38088. 30743. 29692. 24076. 25824. 27391. 29516. 31459. 28936.
## 6 FELONY D~  2958.  3030.  2804.  2511.  2987.  3748.  4315.  5004.  5263.
## 7 FEL. CRI~ 12934. 16026. 13291.  8400.  8074.  7776.  7605.  7090.  7003.
## 8 OTHER FE~ 13945. 13059. 13956. 13786. 12454. 11800. 11063. 11699. 11719.
## # ... with 9 more variables: `2009` <dbl>, `2010` <dbl>, `2011` <dbl>,
## #   `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>, `2016` <dbl>,
## #   `2017` <dbl>

Perform data transformation, suggested by Brian: “At first glance I would use the gather function and condense the table using the following headers: offense, year, and count instead of the wide format.”

dFelon <- gather(df, "YEAR", "COUNT", 2:(2+(2017-2000)), convert = T) %>% tbl_df()
head(dFelon)

## # A tibble: 6 x 3
##   OFFENSE                               YEAR  COUNT
##   <chr>                                <int>  <dbl>
## 1 FELONY POSSESSION OF STOLEN PROPERTY  2000  2194.
## 2 FORGERY/THEFT_FRAUD/IDENTITY THEFT    2000  5714.
## 3 ARSON                                 2000  3189.
## 4 FELONY SEX CRIMES (3)                 2000  1839.
## 5 FELONY DANGEROUS DRUGS (1)            2000 38088.
## 6 FELONY DANGEROUS WEAPONS (2)          2000  2958.

ggplot(data = dFelon, mapping = aes(x = YEAR, y = COUNT, label = OFFENSE, color = OFFENSE, shape = OFFENSE)) + 
  labs(x = "Years", y ="Counts") +
  geom_point() +
  geom_line()

## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 8.
## Consider specifying shapes manually if you must have them.

## Warning: Removed 36 rows containing missing values (geom_point).

  #geom_smooth()
  #facet_wrap(~ OFFENSE)
  #geom_line(mapping = aes(color = OFFENSE))
  #facet_grid(OFFENSE ~ .)
  #geom_text(aes(label = COUNT), vjust = 0, nudge_y = 1, color = "red") +

dFelon %>% group_by(OFFENSE) %>% summarize(MEAN = mean(COUNT)) %>% arrange(desc(MEAN)) %>% mutate(RANK = row_number())

## # A tibble: 8 x 3
##   OFFENSE                                     MEAN  RANK
##   <chr>                                      <dbl> <int>
## 1 FELONY DANGEROUS DRUGS (1)                24204.     1
## 2 OTHER FELONIES (4)                        12703.     2
## 3 FORGERY/THEFT_FRAUD/IDENTITY THEFT         9943.     3
## 4 FEL. CRIMINAL MISCHIEF & RELATED OFFENSES  8830.     4
## 5 FELONY DANGEROUS WEAPONS (2)               4384.     5
## 6 ARSON                                      1575.     6
## 7 FELONY SEX CRIMES (3)                      1243.     7
## 8 FELONY POSSESSION OF STOLEN PROPERTY       1188.     8

dFelon <- left_join(dFelon, dFelon %>% group_by(OFFENSE) %>% summarize(MEAN = mean(COUNT)) %>% arrange(desc(MEAN)) %>% mutate(RANK = row_number()), by = "OFFENSE") %>% arrange(YEAR, RANK)
head(dFelon, 10)

## # A tibble: 10 x 5
##    OFFENSE                                    YEAR  COUNT   MEAN  RANK
##    <chr>                                     <int>  <dbl>  <dbl> <int>
##  1 FELONY DANGEROUS DRUGS (1)                 2000 38088. 24204.     1
##  2 OTHER FELONIES (4)                         2000 13945. 12703.     2
##  3 FORGERY/THEFT_FRAUD/IDENTITY THEFT         2000  5714.  9943.     3
##  4 FEL. CRIMINAL MISCHIEF & RELATED OFFENSES  2000 12934.  8830.     4
##  5 FELONY DANGEROUS WEAPONS (2)               2000  2958.  4384.     5
##  6 ARSON                                      2000  3189.  1575.     6
##  7 FELONY SEX CRIMES (3)                      2000  1839.  1243.     7
##  8 FELONY POSSESSION OF STOLEN PROPERTY       2000  2194.  1188.     8
##  9 FELONY DANGEROUS DRUGS (1)                 2001 30743. 24204.     1
## 10 OTHER FELONIES (4)                         2001 13059. 12703.     2

ggplot(data = filter(dFelon, between(RANK, 1, 2)), mapping = aes(x = YEAR, y = COUNT, label = OFFENSE, color = OFFENSE, shape = OFFENSE)) + 
  labs(x = "Years", y ="Counts") +
  geom_point() +
  #geom_line() +
  geom_smooth()

## `geom_smooth()` using method = 'loess'

ggplot(data = filter(dFelon, between(RANK, 3, 5)), mapping = aes(x = YEAR, y = COUNT, label = OFFENSE, color = OFFENSE, shape = OFFENSE)) + 
  labs(x = "Years", y ="Counts") +
  geom_point() +
  #geom_line() +
  geom_smooth()

## `geom_smooth()` using method = 'loess'

ggplot(data = filter(dFelon, between(RANK, 6, 8)), mapping = aes(x = YEAR, y = COUNT, label = OFFENSE, color = OFFENSE, shape = OFFENSE)) + 
  labs(x = "Years", y ="Counts") +
  geom_point() +
  #geom_line() +
  geom_smooth()

## `geom_smooth()` using method = 'loess'

DATA 607 - Project 2.2

Simon63

March 12, 2018