library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Move the session into the course data folder, then load the hate-crimes CSV.
# NOTE(review): setwd() ties this script to one machine's folder layout;
# consider a project-relative path if the script has to run elsewhere.
setwd(dir = "~/Documents/0 - Montgomery College/0 - DATA 110/Datasets")
hatecrimes <- read_csv(file = "hateCrimes2010.csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   County = col_character(),
##   `Crime Type` = col_character()
## )

## See spec(...) for full column specifications.

# Inspect the raw column names and types before any clean-up

hatecrimes %>%
  str()

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 423 obs. of  44 variables:
##  $ County                                      : chr  "Albany" "Albany" "Allegany" "Bronx" ...
##  $ Year                                        : num  2016 2016 2016 2016 2016 ...
##  $ Crime Type                                  : chr  "Crimes Against Persons" "Property Crimes" "Property Crimes" "Crimes Against Persons" ...
##  $ Anti-Male                                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Female                                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Transgender                            : num  0 0 0 4 0 0 0 0 0 0 ...
##  $ Anti-Gender Identity Expression             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Age*                                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-White                                  : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ Anti-Black                                  : num  1 2 1 0 0 1 0 1 0 2 ...
##  $ Anti-American Indian/Alaskan Native         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Asian                                  : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Anti-Native Hawaiian/Pacific Islander       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Multi-Racial Groups                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Other Race                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Jewish                                 : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ Anti-Catholic                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Protestant                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Islamic (Muslim)                       : num  1 0 0 6 0 0 0 0 1 0 ...
##  $ Anti-Multi-Religious Groups                 : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Anti-Atheism/Agnosticism                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Religious Practice Generally           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Other Religion                         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Buddhist                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Eastern Orthodox (Greek, Russian, etc.): num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Hindu                                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Jehovahs Witness                       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Mormon                                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Other Christian                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Sikh                                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Hispanic                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Arab                                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Other Ethnicity/National Origin        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Non-Hispanic*                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Gay Male                               : num  1 0 0 8 0 1 0 0 0 0 ...
##  $ Anti-Gay Female                             : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Anti-Gay (Male and Female)                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Heterosexual                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Bisexual                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Physical Disability                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Anti-Mental Disability                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total Incidents                             : num  3 3 1 20 2 3 1 1 1 2 ...
##  $ Total Victims                               : num  4 3 1 20 2 3 1 1 1 2 ...
##  $ Total Offenders                             : num  3 3 1 25 2 3 1 1 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   County = col_character(),
##   ..   Year = col_double(),
##   ..   `Crime Type` = col_character(),
##   ..   `Anti-Male` = col_double(),
##   ..   `Anti-Female` = col_double(),
##   ..   `Anti-Transgender` = col_double(),
##   ..   `Anti-Gender Identity Expression` = col_double(),
##   ..   `Anti-Age*` = col_double(),
##   ..   `Anti-White` = col_double(),
##   ..   `Anti-Black` = col_double(),
##   ..   `Anti-American Indian/Alaskan Native` = col_double(),
##   ..   `Anti-Asian` = col_double(),
##   ..   `Anti-Native Hawaiian/Pacific Islander` = col_double(),
##   ..   `Anti-Multi-Racial Groups` = col_double(),
##   ..   `Anti-Other Race` = col_double(),
##   ..   `Anti-Jewish` = col_double(),
##   ..   `Anti-Catholic` = col_double(),
##   ..   `Anti-Protestant` = col_double(),
##   ..   `Anti-Islamic (Muslim)` = col_double(),
##   ..   `Anti-Multi-Religious Groups` = col_double(),
##   ..   `Anti-Atheism/Agnosticism` = col_double(),
##   ..   `Anti-Religious Practice Generally` = col_double(),
##   ..   `Anti-Other Religion` = col_double(),
##   ..   `Anti-Buddhist` = col_double(),
##   ..   `Anti-Eastern Orthodox (Greek, Russian, etc.)` = col_double(),
##   ..   `Anti-Hindu` = col_double(),
##   ..   `Anti-Jehovahs Witness` = col_double(),
##   ..   `Anti-Mormon` = col_double(),
##   ..   `Anti-Other Christian` = col_double(),
##   ..   `Anti-Sikh` = col_double(),
##   ..   `Anti-Hispanic` = col_double(),
##   ..   `Anti-Arab` = col_double(),
##   ..   `Anti-Other Ethnicity/National Origin` = col_double(),
##   ..   `Anti-Non-Hispanic*` = col_double(),
##   ..   `Anti-Gay Male` = col_double(),
##   ..   `Anti-Gay Female` = col_double(),
##   ..   `Anti-Gay (Male and Female)` = col_double(),
##   ..   `Anti-Heterosexual` = col_double(),
##   ..   `Anti-Bisexual` = col_double(),
##   ..   `Anti-Physical Disability` = col_double(),
##   ..   `Anti-Mental Disability` = col_double(),
##   ..   `Total Incidents` = col_double(),
##   ..   `Total Victims` = col_double(),
##   ..   `Total Offenders` = col_double()
##   .. )

# Notice that all the variables are capitalized. The "anti-" variables have dashes, and some have asterisks, parentheses, slashes, periods, commas, and spaces too.

# Normalise the column names in a single pass: lower-case them first, then
# strip every space (fixed = TRUE matches the literal space character).

names(hatecrimes) <- gsub(" ", "", tolower(names(hatecrimes)), fixed = TRUE)
hatecrimes %>%
  str()

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 423 obs. of  44 variables:
##  $ county                                  : chr  "Albany" "Albany" "Allegany" "Bronx" ...
##  $ year                                    : num  2016 2016 2016 2016 2016 ...
##  $ crimetype                               : chr  "Crimes Against Persons" "Property Crimes" "Property Crimes" "Crimes Against Persons" ...
##  $ anti-male                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-female                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-transgender                        : num  0 0 0 4 0 0 0 0 0 0 ...
##  $ anti-genderidentityexpression           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-age*                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-white                              : num  0 0 0 1 1 0 0 0 0 0 ...
##  $ anti-black                              : num  1 2 1 0 0 1 0 1 0 2 ...
##  $ anti-americanindian/alaskannative       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-asian                              : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ anti-nativehawaiian/pacificislander     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-multi-racialgroups                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-otherrace                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-jewish                             : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ anti-catholic                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-protestant                         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-islamic(muslim)                    : num  1 0 0 6 0 0 0 0 1 0 ...
##  $ anti-multi-religiousgroups              : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ anti-atheism/agnosticism                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-religiouspracticegenerally         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-otherreligion                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-buddhist                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-easternorthodox(greek,russian,etc.): num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-hindu                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-jehovahswitness                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-mormon                             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-otherchristian                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-sikh                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-hispanic                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-arab                               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-otherethnicity/nationalorigin      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-non-hispanic*                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-gaymale                            : num  1 0 0 8 0 1 0 0 0 0 ...
##  $ anti-gayfemale                          : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ anti-gay(maleandfemale)                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-heterosexual                       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-bisexual                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-physicaldisability                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anti-mentaldisability                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ totalincidents                          : num  3 3 1 20 2 3 1 1 1 2 ...
##  $ totalvictims                            : num  4 3 1 20 2 3 1 1 1 2 ...
##  $ totaloffenders                          : num  3 3 1 25 2 3 1 1 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   County = col_character(),
##   ..   Year = col_double(),
##   ..   `Crime Type` = col_character(),
##   ..   `Anti-Male` = col_double(),
##   ..   `Anti-Female` = col_double(),
##   ..   `Anti-Transgender` = col_double(),
##   ..   `Anti-Gender Identity Expression` = col_double(),
##   ..   `Anti-Age*` = col_double(),
##   ..   `Anti-White` = col_double(),
##   ..   `Anti-Black` = col_double(),
##   ..   `Anti-American Indian/Alaskan Native` = col_double(),
##   ..   `Anti-Asian` = col_double(),
##   ..   `Anti-Native Hawaiian/Pacific Islander` = col_double(),
##   ..   `Anti-Multi-Racial Groups` = col_double(),
##   ..   `Anti-Other Race` = col_double(),
##   ..   `Anti-Jewish` = col_double(),
##   ..   `Anti-Catholic` = col_double(),
##   ..   `Anti-Protestant` = col_double(),
##   ..   `Anti-Islamic (Muslim)` = col_double(),
##   ..   `Anti-Multi-Religious Groups` = col_double(),
##   ..   `Anti-Atheism/Agnosticism` = col_double(),
##   ..   `Anti-Religious Practice Generally` = col_double(),
##   ..   `Anti-Other Religion` = col_double(),
##   ..   `Anti-Buddhist` = col_double(),
##   ..   `Anti-Eastern Orthodox (Greek, Russian, etc.)` = col_double(),
##   ..   `Anti-Hindu` = col_double(),
##   ..   `Anti-Jehovahs Witness` = col_double(),
##   ..   `Anti-Mormon` = col_double(),
##   ..   `Anti-Other Christian` = col_double(),
##   ..   `Anti-Sikh` = col_double(),
##   ..   `Anti-Hispanic` = col_double(),
##   ..   `Anti-Arab` = col_double(),
##   ..   `Anti-Other Ethnicity/National Origin` = col_double(),
##   ..   `Anti-Non-Hispanic*` = col_double(),
##   ..   `Anti-Gay Male` = col_double(),
##   ..   `Anti-Gay Female` = col_double(),
##   ..   `Anti-Gay (Male and Female)` = col_double(),
##   ..   `Anti-Heterosexual` = col_double(),
##   ..   `Anti-Bisexual` = col_double(),
##   ..   `Anti-Physical Disability` = col_double(),
##   ..   `Anti-Mental Disability` = col_double(),
##   ..   `Total Incidents` = col_double(),
##   ..   `Total Victims` = col_double(),
##   ..   `Total Offenders` = col_double()
##   .. )

# Per-column summary statistics (min, quartiles, median, mean, max)
hatecrimes %>%
  summary()

##     county               year       crimetype           anti-male       
##  Length:423         Min.   :2010   Length:423         Min.   :0.000000  
##  Class :character   1st Qu.:2011   Class :character   1st Qu.:0.000000  
##  Mode  :character   Median :2013   Mode  :character   Median :0.000000  
##                     Mean   :2013                      Mean   :0.007092  
##                     3rd Qu.:2015                      3rd Qu.:0.000000  
##                     Max.   :2016                      Max.   :1.000000  
##   anti-female      anti-transgender  anti-genderidentityexpression
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000              
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000              
##  Median :0.00000   Median :0.00000   Median :0.00000              
##  Mean   :0.01655   Mean   :0.04728   Mean   :0.05674              
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000              
##  Max.   :1.00000   Max.   :5.00000   Max.   :3.00000              
##    anti-age*         anti-white        anti-black    
##  Min.   :0.00000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median :0.00000   Median : 0.0000   Median : 1.000  
##  Mean   :0.05201   Mean   : 0.3357   Mean   : 1.761  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  Max.   :9.00000   Max.   :11.0000   Max.   :18.000  
##  anti-americanindian/alaskannative   anti-asian    
##  Min.   :0.000000                  Min.   :0.0000  
##  1st Qu.:0.000000                  1st Qu.:0.0000  
##  Median :0.000000                  Median :0.0000  
##  Mean   :0.007092                  Mean   :0.1773  
##  3rd Qu.:0.000000                  3rd Qu.:0.0000  
##  Max.   :1.000000                  Max.   :8.0000  
##  anti-nativehawaiian/pacificislander anti-multi-racialgroups
##  Min.   :0                           Min.   :0.00000        
##  1st Qu.:0                           1st Qu.:0.00000        
##  Median :0                           Median :0.00000        
##  Mean   :0                           Mean   :0.08511        
##  3rd Qu.:0                           3rd Qu.:0.00000        
##  Max.   :0                           Max.   :3.00000        
##  anti-otherrace  anti-jewish     anti-catholic     anti-protestant  
##  Min.   :0      Min.   : 0.000   Min.   : 0.0000   Min.   :0.00000  
##  1st Qu.:0      1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:0.00000  
##  Median :0      Median : 0.000   Median : 0.0000   Median :0.00000  
##  Mean   :0      Mean   : 3.981   Mean   : 0.2695   Mean   :0.02364  
##  3rd Qu.:0      3rd Qu.: 3.000   3rd Qu.: 0.0000   3rd Qu.:0.00000  
##  Max.   :0      Max.   :82.000   Max.   :12.0000   Max.   :1.00000  
##  anti-islamic(muslim) anti-multi-religiousgroups anti-atheism/agnosticism
##  Min.   : 0.0000      Min.   : 0.00000           Min.   :0               
##  1st Qu.: 0.0000      1st Qu.: 0.00000           1st Qu.:0               
##  Median : 0.0000      Median : 0.00000           Median :0               
##  Mean   : 0.4704      Mean   : 0.07565           Mean   :0               
##  3rd Qu.: 0.0000      3rd Qu.: 0.00000           3rd Qu.:0               
##  Max.   :10.0000      Max.   :10.00000           Max.   :0               
##  anti-religiouspracticegenerally anti-otherreligion anti-buddhist
##  Min.   :0.000000                Min.   :0.000      Min.   :0    
##  1st Qu.:0.000000                1st Qu.:0.000      1st Qu.:0    
##  Median :0.000000                Median :0.000      Median :0    
##  Mean   :0.007092                Mean   :0.104      Mean   :0    
##  3rd Qu.:0.000000                3rd Qu.:0.000      3rd Qu.:0    
##  Max.   :2.000000                Max.   :4.000      Max.   :0    
##  anti-easternorthodox(greek,russian,etc.)   anti-hindu      
##  Min.   :0.000000                         Min.   :0.000000  
##  1st Qu.:0.000000                         1st Qu.:0.000000  
##  Median :0.000000                         Median :0.000000  
##  Mean   :0.002364                         Mean   :0.002364  
##  3rd Qu.:0.000000                         3rd Qu.:0.000000  
##  Max.   :1.000000                         Max.   :1.000000  
##  anti-jehovahswitness  anti-mormon anti-otherchristian   anti-sikh
##  Min.   :0            Min.   :0    Min.   :0.00000     Min.   :0  
##  1st Qu.:0            1st Qu.:0    1st Qu.:0.00000     1st Qu.:0  
##  Median :0            Median :0    Median :0.00000     Median :0  
##  Mean   :0            Mean   :0    Mean   :0.01655     Mean   :0  
##  3rd Qu.:0            3rd Qu.:0    3rd Qu.:0.00000     3rd Qu.:0  
##  Max.   :0            Max.   :0    Max.   :3.00000     Max.   :0  
##  anti-hispanic       anti-arab       anti-otherethnicity/nationalorigin
##  Min.   : 0.0000   Min.   :0.00000   Min.   : 0.0000                   
##  1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.0000                   
##  Median : 0.0000   Median :0.00000   Median : 0.0000                   
##  Mean   : 0.3735   Mean   :0.06619   Mean   : 0.2837                   
##  3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 0.0000                   
##  Max.   :17.0000   Max.   :2.00000   Max.   :19.0000                   
##  anti-non-hispanic*  anti-gaymale    anti-gayfemale  
##  Min.   :0          Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0          1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0          Median : 0.000   Median :0.0000  
##  Mean   :0          Mean   : 1.499   Mean   :0.2411  
##  3rd Qu.:0          3rd Qu.: 1.000   3rd Qu.:0.0000  
##  Max.   :0          Max.   :36.000   Max.   :8.0000  
##  anti-gay(maleandfemale) anti-heterosexual  anti-bisexual     
##  Min.   :0.0000          Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.0000          1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.0000          Median :0.000000   Median :0.000000  
##  Mean   :0.1017          Mean   :0.002364   Mean   :0.004728  
##  3rd Qu.:0.0000          3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :4.0000          Max.   :1.000000   Max.   :1.000000  
##  anti-physicaldisability anti-mentaldisability totalincidents  
##  Min.   :0.00000         Min.   :0.000000      Min.   :  1.00  
##  1st Qu.:0.00000         1st Qu.:0.000000      1st Qu.:  1.00  
##  Median :0.00000         Median :0.000000      Median :  3.00  
##  Mean   :0.01182         Mean   :0.009456      Mean   : 10.09  
##  3rd Qu.:0.00000         3rd Qu.:0.000000      3rd Qu.: 10.00  
##  Max.   :1.00000         Max.   :1.000000      Max.   :101.00  
##   totalvictims    totaloffenders  
##  Min.   :  1.00   Min.   :  1.00  
##  1st Qu.:  1.00   1st Qu.:  1.00  
##  Median :  3.00   Median :  3.00  
##  Mean   : 10.48   Mean   : 11.77  
##  3rd Qu.: 10.00   3rd Qu.: 11.00  
##  Max.   :106.00   Max.   :113.00

Select only certain hate crimes

# Keep county, year, eight of the bias categories and the three totals.
# Names that are not syntactic R identifiers are quoted with backticks.

hatecrimes2 <- select(
  hatecrimes,
  county, year,
  `anti-black`, `anti-white`, `anti-jewish`, `anti-catholic`,
  `anti-age*`, `anti-islamic(muslim)`, `anti-gaymale`, `anti-hispanic`,
  totalincidents, totalvictims, totaloffenders
)

head(hatecrimes2)

## # A tibble: 6 x 13
##   county  year `anti-black` `anti-white` `anti-jewish` `anti-catholic`
##   <chr>  <dbl>        <dbl>        <dbl>         <dbl>           <dbl>
## 1 Albany  2016            1            0             0               0
## 2 Albany  2016            2            0             0               0
## 3 Alleg…  2016            1            0             0               0
## 4 Bronx   2016            0            1             0               0
## 5 Bronx   2016            0            1             1               0
## 6 Broome  2016            1            0             0               0
## # … with 7 more variables: `anti-age*` <dbl>,
## #   `anti-islamic(muslim)` <dbl>, `anti-gaymale` <dbl>,
## #   `anti-hispanic` <dbl>, totalincidents <dbl>, totalvictims <dbl>,
## #   totaloffenders <dbl>

# In the head data frame, Bronx county had the highest number of incidents.

# QUESTION - How is the select function different from the filter function?

Check Summary to make sure no missing values

# Count the rows and columns that remain after select()

c(nrow(hatecrimes2), ncol(hatecrimes2))

## [1] 423  13

# The 44 original variables are down to 13; all 423 rows are still present.

hatecrimes2 %>%
  summary()

##     county               year        anti-black       anti-white     
##  Length:423         Min.   :2010   Min.   : 0.000   Min.   : 0.0000  
##  Class :character   1st Qu.:2011   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Mode  :character   Median :2013   Median : 1.000   Median : 0.0000  
##                     Mean   :2013   Mean   : 1.761   Mean   : 0.3357  
##                     3rd Qu.:2015   3rd Qu.: 2.000   3rd Qu.: 0.0000  
##                     Max.   :2016   Max.   :18.000   Max.   :11.0000  
##   anti-jewish     anti-catholic       anti-age*       anti-islamic(muslim)
##  Min.   : 0.000   Min.   : 0.0000   Min.   :0.00000   Min.   : 0.0000     
##  1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.0000     
##  Median : 0.000   Median : 0.0000   Median :0.00000   Median : 0.0000     
##  Mean   : 3.981   Mean   : 0.2695   Mean   :0.05201   Mean   : 0.4704     
##  3rd Qu.: 3.000   3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 0.0000     
##  Max.   :82.000   Max.   :12.0000   Max.   :9.00000   Max.   :10.0000     
##   anti-gaymale    anti-hispanic     totalincidents    totalvictims   
##  Min.   : 0.000   Min.   : 0.0000   Min.   :  1.00   Min.   :  1.00  
##  1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:  1.00   1st Qu.:  1.00  
##  Median : 0.000   Median : 0.0000   Median :  3.00   Median :  3.00  
##  Mean   : 1.499   Mean   : 0.3735   Mean   : 10.09   Mean   : 10.48  
##  3rd Qu.: 1.000   3rd Qu.: 0.0000   3rd Qu.: 10.00   3rd Qu.: 10.00  
##  Max.   :36.000   Max.   :17.0000   Max.   :101.00   Max.   :106.00  
##  totaloffenders  
##  Min.   :  1.00  
##  1st Qu.:  1.00  
##  Median :  3.00  
##  Mean   : 11.77  
##  3rd Qu.: 11.00  
##  Max.   :113.00

Order the crimes in descending order

Order the data, first by total incidents, then by total offenders, then by total victims. It will be interesting to see if counties and years correlate with certain types of crimes.

# Sort with arrange(): most incidents first, ties broken by offenders and then
# by victims. desc() accepts a single vector, so every sort key needs its own
# desc() call; wrapping all three columns in one desc() does not request a
# multi-key descending sort.

ordered <- hatecrimes2 %>%
  arrange(desc(totalincidents), desc(totaloffenders), desc(totalvictims))

head(ordered)

## # A tibble: 6 x 13
##   county  year `anti-black` `anti-white` `anti-jewish` `anti-catholic`
##   <chr>  <dbl>        <dbl>        <dbl>         <dbl>           <dbl>
## 1 Kings   2012            4            1            82               6
## 2 Suffo…  2012           18            0            48               7
## 3 Kings   2010           10            3            34               0
## 4 New Y…  2016            6            5             9               0
## 5 Kings   2015            6            3            35               0
## 6 Kings   2016            4            6            26               1
## # … with 7 more variables: `anti-age*` <dbl>,
## #   `anti-islamic(muslim)` <dbl>, `anti-gaymale` <dbl>,
## #   `anti-hispanic` <dbl>, totalincidents <dbl>, totalvictims <dbl>,
## #   totaloffenders <dbl>

# Kings and Suffolk counties seem to have the most hate crimes.

Use Facet_Wrap

Look at each set of hate crimes for each type for each year. Use the package “tidyr” to convert the dataset from wide to long with the command “gather”. It will take each column’s hate-crime type and combine them all into one column called “id”. Then each cell count will go into the new column, “crimecount”. Finally, we are only doing this for the quantitative variables, which are in columns 3 - 13. Note the command facet_wrap requires (~) before “id”.

# install.packages("reshape2") - NOTE: facet_wrap() comes from ggplot2 (already loaded with tidyverse); reshape2 is not needed for it
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Reshape wide -> long with gather(): the names of columns 3-13 (the eight
# bias categories plus the three totals) are stacked into a new "id" column,
# and the matching cell values go into "crimecount".
# county and year (columns 1-2) are repeated for every stacked row.

hatecrimeslong <- tidyr::gather(ordered, key = "id", value = "crimecount", 3:13)

# Scatterplot of the long data: year on the x-axis, crimecount on the y-axis.
# Points are coloured by "id", and facet_wrap(~id) draws one panel per "id".
hatecrimesplot <- ggplot(
  data = hatecrimeslong,
  mapping = aes(x = year, y = crimecount, color = id)
) +
  geom_point() +
  facet_wrap(~id)
hatecrimesplot

# Notice that a separate plot is created for each variable, and each plot has its own color. These plots are arranged side by side. This was accomplished by facet_wrap.

Look deeper into crimes against blacks, gay males, and jews

From the facet_wrap plot above, anti-black, anti-gay males, and anti-jewish categories seem to have highest rates of offenses reported.

# Keep only the rows whose "id" is one of the three categories of interest.
# %in% tests membership in a set; it is equivalent here to chaining
# id == "..." comparisons with |.
hatenew <- hatecrimeslong %>%
  filter(id %in% c("anti-black", "anti-jewish", "anti-gaymale"))

# hatenew now contains just those 3 crime categories.

Plot these three types of hate crimes together

Use the following commands to finalize your barplot: - position = “dodge” makes side-by-side bars, rather than stacked bars - stat = “identity” allows you to plot each set of bars for each year between 2010 and 2016 - ggtitle gives the plot a title - labs gives a title to the legend

# Bar chart of the three categories by year.
# position = "dodge" places the bars side by side instead of stacking them.
# stat = "identity" uses crimecount itself as the bar height (no row counting).

plot2 <- ggplot(hatenew, aes(x = year, y = crimecount, fill = id)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot2

# The same plot WITHOUT dodging. position = "stack" is geom_bar()'s default;
# it is spelled out here rather than left as an empty argument.

plot2 <- hatenew %>%
  ggplot() +
  geom_bar(aes(x = year, y = crimecount, fill = id),
    position = "stack", stat = "identity") +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot2

# You get stacked bars. stat = "identity" is still needed: it tells geom_bar()
# to use crimecount as the bar height. The default stat = "count" counts rows
# and does not accept a y aesthetic.

What about the counties?

The next place to explore. Make bar graphs by county instead of by year.

# Same dodged bar chart, but with county (instead of year) on the x-axis

plot3 <- ggplot(hatenew, aes(x = county, y = crimecount, fill = id)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot3

# With every county on one axis the graph is too crowded to be useful.

So many counties

There are too many counties for this plot to make sense, but maybe we can just look at the 5 counties with the highest number of incidents.

# group_by() groups the rows by county
# summarize() adds up crimecount within each county (column "sum")
# arrange(desc()) sorts those county totals from largest to smallest
# top_n() keeps the 5 counties with the largest totals. wt = sum names the
# ranking column explicitly instead of relying on top_n()'s default of the
# last column (which also triggers a "Selecting by sum" message).
# Note that top_n() keeps ties, so it can return more than 5 rows.

counties <- hatenew %>%
  group_by(county) %>%
  summarize(sum = sum(crimecount)) %>%
  arrange(desc(sum)) %>%
  top_n(n = 5, wt = sum)

counties

## # A tibble: 5 x 2
##   county     sum
##   <chr>    <dbl>
## 1 Kings      713
## 2 New York   459
## 3 Suffolk    360
## 4 Nassau     298
## 5 Queens     235

# Now we have only 5 rows (the top 5 counties)
# The list is arranged from highest to lowest crimecount (represented by sum)

Finally, create the barplot above, but only for the 5 counties with the highest incidents of hate-crimes. The command “labs” is nice, because you can get a title, subtitle, y-axis label, and legend title, all in one command.

# Dodged bar chart restricted to the top 5 counties.
# filter() with %in% keeps the rows whose county is in the listed set.
# labs() sets the y-axis label, title, subtitle and legend title in one call;
# its arguments are named after aesthetics, so the y-axis label is `y =`.

plot4 <- hatenew %>%
  filter(county %in% c("Kings", "New York", "Suffolk", "Nassau", "Queens")) %>%
  ggplot() +
  geom_bar(aes(x = county, y = crimecount, fill = id),
    position = "dodge", stat = "identity") +
  labs(y = "Number of Hate Crime Incidents",
    title = "5 Counties in NY with Highest Incidents of Hate Crimes",
    subtitle = "Between 2010-2016",
    fill = "Hate Crime Type")
plot4

# Notice that fill= is used twice here.
# The first fill appears in aes(): it maps id to the bar colour.
# The second fill appears in labs(): it sets the title of that colour legend.
# subtitle= in labs() adds a subtitle below the main title.

HOMEWORK - MY PLOT

Hate crimes are comprised of Crimes Against Persons and Property Crimes. How do these two types of hate crimes break down in NY? Are there any interesting patterns?

For example, is one type more prevalent than the other? Where? Are there any upward or downward trends for the number of crimes over time?

I intend to create a barplot showing the number of Crimes Against Person compared with the number of Property Crimes over the period 2010-2016.

To get there, I have created some smaller plots to analyze the data.

The final plot appears at the end of this file.

Let’s first look at just Property Crimes.

PROPERTY CRIMES ANALYSIS

# Subset to the rows whose crimetype is "Property Crimes"

hateProp <- filter(hatecrimes, crimetype == "Property Crimes")

# The new data subset is 190 rows x 44 variables. Note that we are looking at ALL the hate groups now, not just the top 8 types from the tutorial above.

# Narrow the property-crime subset to a single year (2016 here).
# Both conditions must hold for a row to be kept.

hatePropByYear <- filter(
  hatecrimes,
  crimetype == "Property Crimes" & year == 2016
)

# The number of counties that show up in the filter results will vary by year.

For instance, in 2010, 26 counties have property hate crime incidents. In 2016, 32 counties do.

If the number of counties was higher, did the number of property hate crimes go up over this period too? Let’s create a plot to find out.

# Stacked bars of property-crime incidents per year. hateProp has one row per
# county/year, and each row contributes its own segment to that year's bar.
plotPropTrend <- hateProp %>%
  ggplot() +
  geom_bar(aes(x = year, y = totalincidents),
    position = "stack", stat = "identity") +
  labs(title = "Property Hate Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       y = "Number of Property Hate Crime Incidents")

plotPropTrend

# The y-axis label is set with y = in labs(): labs() names its arguments after
# aesthetics, so an argument called ylab = is not matched to the y-axis and the
# default label is shown instead.
# fill is not needed here: nothing is mapped to bar colour, so there is no legend to title.
# The bars are subdivided (light gray lines) because each county's row is stacked as a separate segment.

# Total property-crime incidents per year: keep the property-crime rows, sum
# totalincidents within each year, then sort from the busiest year down.
# One row per year is expected (7 rows for 2010-2016), so the whole tibble is
# printed; head() stops at 6 rows by default and would hide the last year.

county_sum1 <- hatecrimes %>%
  filter(crimetype == "Property Crimes") %>%
  group_by(year) %>%
  summarize(year_sum = sum(totalincidents)) %>%
  arrange(desc(year_sum))

county_sum1

## # A tibble: 6 x 2
##    year year_sum
##   <dbl>    <dbl>
## 1  2012      419
## 2  2016      318
## 3  2013      306
## 4  2014      301
## 5  2011      277
## 6  2010      239

Whew! I held my breath to see if the above snippet of code would work (lifted right off the Pfizer Tutorial). Looks like it did. This should make the remaining graphs far easier.

# Re-plot the property crimes from 2010-2016 using the yearly totals in
# county_sum1 instead of the county-level rows.

plotPropTrend2 <- county_sum1 %>%
  ggplot() +
  # A single bar colour is a constant, so it is set OUTSIDE aes().
  # Inside aes() a value such as "blue" is treated as data and mapped
  # through the default colour scale instead of being used literally.
  geom_col(aes(x = year, y = year_sum), fill = "steelblue") +
  labs(title = "Property Hate Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       # labs() needs `y =` for the y-axis title; `ylab =` is not an
       # aesthetic name and has no visible effect.
       y = "Number of Property Hate Crime Incidents")

plotPropTrend2

# The bars are now a solid color. No longer subdivided by lines, because county_sum1 has exactly one row per year.
# Bar color: `fill =` sets the inside of the bar and `color =` sets its outline. To use one fixed color, pass it to the geom outside aes(), as done above with fill = "steelblue".

The above barplot shows there were indeed more property crimes in 2016 than in 2010. But more interesting is 2012, which had the highest number of property crimes.

A TEMPORARY SEGUE - LOOK AT 2012 DATA

Let’s look at how many counties reported property hate crimes in 2012.

# Property-crime rows for 2012 only (both conditions must hold).
hatePropByYear <- filter(
  hatecrimes,
  crimetype == "Property Crimes" & year == 2012
)

The dataset shows only 23 counties reported property crimes in 2012. That means a small number of counties with many incidents drove the overall number up.

# Same 2012 property-crime subset, ordered from the most incidents to the
# fewest so the counties with the highest counts come first.

hatePropByYear <- arrange(
  filter(hatecrimes, crimetype == "Property Crimes", year == 2012),
  desc(totalincidents)
)

# Plot the number of property hate crimes in 2012 for all counties.
# hatePropByYear holds the 2012 property-crime rows at this point.

plotProp2012 <- hatePropByYear %>%
  ggplot() +
  # geom_col() is geom_bar() with stat = "identity".
  geom_col(aes(x = county, y = totalincidents), position = "dodge") +
  # labs() takes aesthetic names, so the y-axis title is `y =`, not `ylab =`.
  # The former `fill = ""` entry was dropped: no fill aesthetic is mapped,
  # so there is no fill legend to rename.
  labs(y = "Number of Property Hate Crime Incidents",
       title = "Total Property Hate Crimes for All NY Counties",
       subtitle = "2012 only")

plotProp2012

# NOTE - `ylab =` inside labs() does not set the y-axis label; `y =` does (or add ylab("...") as a separate layer).

While this is a messy graph, you can see the counties appear dichotomous - they have either a significant number of property crimes or very few.

Getting back to Crimes Against Persons.

CRIMES AGAINST PERSONS ANALYSIS

# Keep only the rows whose crime type is "Crimes Against Persons".

hatePerson <- filter(hatecrimes, crimetype == "Crimes Against Persons")

# The new data subset is 233 rows x 44 variables.

# Yearly totals of Crimes Against Persons: sum totalincidents over all
# counties within each year, then sort the years by that total, largest first.

county_sum2 <- filter(hatecrimes, crimetype == "Crimes Against Persons") %>%
  group_by(year) %>%
  summarise(year_sum = sum(totalincidents)) %>%
  arrange(desc(year_sum))

# head() prints the first 6 rows only.
head(county_sum2, n = 6)

## # A tibble: 6 x 2
##    year year_sum
##   <dbl>    <dbl>
## 1  2010      464
## 2  2012      315
## 3  2013      311
## 4  2016      280
## 5  2011      279
## 6  2015      275

# Plot the number of person crimes over 2010-2016, one bar per year from
# the yearly totals in county_sum2.

plotPersonTrend <- county_sum2 %>%
  ggplot() +
  # One fixed bar colour: set fill as a constant OUTSIDE aes().
  # Mapping fill = year inside aes() colours the bars by the numeric year,
  # which gives a continuous blue gradient and a legend instead.
  geom_col(aes(x = year, y = year_sum), fill = "steelblue") +
  labs(title = "Hate Crimes Against Persons in All NY Counties",
       subtitle = "Between 2010-2016",
       # labs() needs `y =` for the y-axis title; `ylab =` has no visible effect.
       y = "Yearly Total of Person Hate Crime Incidents")

plotPersonTrend

# To make the bars one color that is not the default, pass fill = "<color>" to the geom outside aes(), as above. Anything inside aes() is treated as data and mapped through a color scale.

# NOTE(review): the earlier "black bars with red borders" was presumably color = "blue" placed inside aes(). There "blue" is a data value, which is given the first color of the default palette (a red), and color controls the bar outline rather than the fill.

# Plot the total number of Person crimes vs Property crimes over 2010-2016.
# Every county row is stacked, so each bar's full height is the sum of all
# incidents in that year and the fill color splits it by crime type.

plotComp <- hatecrimes %>%
  ggplot() +
  # geom_col() is geom_bar() with stat = "identity"; its default position
  # is "stack".
  geom_col(aes(x = year, y = totalincidents, fill = crimetype)) +
  labs(title = "Crimes Against Persons vs Property Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       # labs() takes aesthetic names: `y =` sets the y-axis title, and
       # `fill =` sets the legend title. `ylab =` has no visible effect.
       y = "Number of Hate Crime Incidents",
       fill = "Hate Crime Type")

plotComp

# Notice that this is a stacked barplot because position = "dodge" was omitted.

# It is hard to make comparisons on a stacked barplot. Let's look at it unstacked.
#
# The rows are first summed by year and crime type. With position = "dodge"
# and un-aggregated rows, all county bars for the same year and crime type
# are drawn on top of one another, so each bar would only reach the largest
# single row instead of the yearly total.

plotComp1 <- hatecrimes %>%
  group_by(year, crimetype) %>%
  summarize(year_sum = sum(totalincidents)) %>%
  ungroup() %>%
  ggplot() +
  geom_col(aes(x = year, y = year_sum, fill = crimetype),
           position = "dodge") +
  ylab("Number of Hate Crime Incidents") +
  xlab("Year") +
  # ggtitle() accepts a subtitle argument, so labs() is not needed for it.
  ggtitle("Crimes Against Person vs Property Crimes in All NY Counties",
          subtitle = "Between 2010-2016")

plotComp1

# FOLLOW-UP answered - a subtitle can be added without labs() by passing subtitle = "..." to ggtitle().

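Something looked wrong when the unstacked chart was first drawn straight from the county-level rows: its y-axis totals did not match those on the stacked chart. The reason is that with stat = "identity" and position = "dodge", all the county bars for the same year and crime type are drawn on top of one another, so each bar only reaches the largest single-county value instead of the yearly total. The stacked chart adds every row together, so its totals are the correct ones (they agree with county_sum1 and county_sum2 above). The unstacked chart shows the right totals only when the data is first summed by year and crime type.

So, for the purpose of this exercise, the yearly sums are the y-axis totals to rely on.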

Showing Crimes Against Persons next to Property Crimes on the same chart is helpful.

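The yearly totals in county_sum2 show that person crimes peaked in 2010 (464), dropped sharply in 2011 (279), rose again in 2012 (315) and 2013 (311), and were lower again by 2015 (275) and 2016 (280). Why was 2010 higher than the following years? A lingering effect from the 2008 financial crisis? What caused the number to drop significantly in 2011?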

Property crimes also show a general upward trend, from 2011-2014, with an anomaly in 2012. This peak in 2012 is more evident now than it was in the barplot for just property crimes. What happened in 2012? And what caused it to drop significantly in 2013, and then again in 2015?

The initial analysis invites more analysis!

Week 5 Homework - Hate Crimes

R.Lee

October 1, 2019