library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# NOTE(review): setwd() with a user-specific absolute path makes this script
# non-portable; the relative file name below is resolved against this folder.
setwd("~/Documents/0 - Montgomery College/0 - DATA 110/Datasets")
# Read the hate-crime CSV into a tibble; readr guesses the column types.
hatecrimes <- read_csv("hateCrimes2010.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## County = col_character(),
## `Crime Type` = col_character()
## )
## See spec(...) for full column specifications.
# Look at the variables before you clean up the data
str(hatecrimes)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 423 obs. of 44 variables:
## $ County : chr "Albany" "Albany" "Allegany" "Bronx" ...
## $ Year : num 2016 2016 2016 2016 2016 ...
## $ Crime Type : chr "Crimes Against Persons" "Property Crimes" "Property Crimes" "Crimes Against Persons" ...
## $ Anti-Male : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Female : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Transgender : num 0 0 0 4 0 0 0 0 0 0 ...
## $ Anti-Gender Identity Expression : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Age* : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-White : num 0 0 0 1 1 0 0 0 0 0 ...
## $ Anti-Black : num 1 2 1 0 0 1 0 1 0 2 ...
## $ Anti-American Indian/Alaskan Native : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Asian : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Anti-Native Hawaiian/Pacific Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Multi-Racial Groups : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Other Race : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Jewish : num 0 0 0 0 1 0 1 0 0 0 ...
## $ Anti-Catholic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Protestant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Islamic (Muslim) : num 1 0 0 6 0 0 0 0 1 0 ...
## $ Anti-Multi-Religious Groups : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Anti-Atheism/Agnosticism : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Religious Practice Generally : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Other Religion : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Buddhist : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Eastern Orthodox (Greek, Russian, etc.): num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Hindu : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Jehovahs Witness : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Mormon : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Other Christian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Sikh : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Hispanic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Arab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Other Ethnicity/National Origin : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Non-Hispanic* : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Gay Male : num 1 0 0 8 0 1 0 0 0 0 ...
## $ Anti-Gay Female : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Anti-Gay (Male and Female) : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Heterosexual : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Bisexual : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Physical Disability : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Anti-Mental Disability : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total Incidents : num 3 3 1 20 2 3 1 1 1 2 ...
## $ Total Victims : num 4 3 1 20 2 3 1 1 1 2 ...
## $ Total Offenders : num 3 3 1 25 2 3 1 1 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. County = col_character(),
## .. Year = col_double(),
## .. `Crime Type` = col_character(),
## .. `Anti-Male` = col_double(),
## .. `Anti-Female` = col_double(),
## .. `Anti-Transgender` = col_double(),
## .. `Anti-Gender Identity Expression` = col_double(),
## .. `Anti-Age*` = col_double(),
## .. `Anti-White` = col_double(),
## .. `Anti-Black` = col_double(),
## .. `Anti-American Indian/Alaskan Native` = col_double(),
## .. `Anti-Asian` = col_double(),
## .. `Anti-Native Hawaiian/Pacific Islander` = col_double(),
## .. `Anti-Multi-Racial Groups` = col_double(),
## .. `Anti-Other Race` = col_double(),
## .. `Anti-Jewish` = col_double(),
## .. `Anti-Catholic` = col_double(),
## .. `Anti-Protestant` = col_double(),
## .. `Anti-Islamic (Muslim)` = col_double(),
## .. `Anti-Multi-Religious Groups` = col_double(),
## .. `Anti-Atheism/Agnosticism` = col_double(),
## .. `Anti-Religious Practice Generally` = col_double(),
## .. `Anti-Other Religion` = col_double(),
## .. `Anti-Buddhist` = col_double(),
## .. `Anti-Eastern Orthodox (Greek, Russian, etc.)` = col_double(),
## .. `Anti-Hindu` = col_double(),
## .. `Anti-Jehovahs Witness` = col_double(),
## .. `Anti-Mormon` = col_double(),
## .. `Anti-Other Christian` = col_double(),
## .. `Anti-Sikh` = col_double(),
## .. `Anti-Hispanic` = col_double(),
## .. `Anti-Arab` = col_double(),
## .. `Anti-Other Ethnicity/National Origin` = col_double(),
## .. `Anti-Non-Hispanic*` = col_double(),
## .. `Anti-Gay Male` = col_double(),
## .. `Anti-Gay Female` = col_double(),
## .. `Anti-Gay (Male and Female)` = col_double(),
## .. `Anti-Heterosexual` = col_double(),
## .. `Anti-Bisexual` = col_double(),
## .. `Anti-Physical Disability` = col_double(),
## .. `Anti-Mental Disability` = col_double(),
## .. `Total Incidents` = col_double(),
## .. `Total Victims` = col_double(),
## .. `Total Offenders` = col_double()
## .. )
# Notice that all the variables are capitalized. The "anti-" variables have dashes, and some have asterisks, parentheses, slashes, periods, commas, and spaces too.
# Clean up the variables. Make them all lowercase and remove the spaces.
# Lowercase every column name and strip the spaces in a single pass.
# Dashes, asterisks, slashes and parentheses are left as they are.
names(hatecrimes) <- gsub(" ", "", tolower(names(hatecrimes)), fixed = TRUE)
# Inspect the structure again to confirm the new names.
str(hatecrimes)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 423 obs. of 44 variables:
## $ county : chr "Albany" "Albany" "Allegany" "Bronx" ...
## $ year : num 2016 2016 2016 2016 2016 ...
## $ crimetype : chr "Crimes Against Persons" "Property Crimes" "Property Crimes" "Crimes Against Persons" ...
## $ anti-male : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-female : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-transgender : num 0 0 0 4 0 0 0 0 0 0 ...
## $ anti-genderidentityexpression : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-age* : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-white : num 0 0 0 1 1 0 0 0 0 0 ...
## $ anti-black : num 1 2 1 0 0 1 0 1 0 2 ...
## $ anti-americanindian/alaskannative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-asian : num 0 0 0 0 0 1 0 0 0 0 ...
## $ anti-nativehawaiian/pacificislander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-multi-racialgroups : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-otherrace : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-jewish : num 0 0 0 0 1 0 1 0 0 0 ...
## $ anti-catholic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-protestant : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-islamic(muslim) : num 1 0 0 6 0 0 0 0 1 0 ...
## $ anti-multi-religiousgroups : num 0 1 0 0 0 0 0 0 0 0 ...
## $ anti-atheism/agnosticism : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-religiouspracticegenerally : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-otherreligion : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-buddhist : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-easternorthodox(greek,russian,etc.): num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-hindu : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-jehovahswitness : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-mormon : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-otherchristian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-sikh : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-hispanic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-arab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-otherethnicity/nationalorigin : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-non-hispanic* : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-gaymale : num 1 0 0 8 0 1 0 0 0 0 ...
## $ anti-gayfemale : num 0 0 0 1 0 0 0 0 0 0 ...
## $ anti-gay(maleandfemale) : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-heterosexual : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-bisexual : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-physicaldisability : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anti-mentaldisability : num 0 0 0 0 0 0 0 0 0 0 ...
## $ totalincidents : num 3 3 1 20 2 3 1 1 1 2 ...
## $ totalvictims : num 4 3 1 20 2 3 1 1 1 2 ...
## $ totaloffenders : num 3 3 1 25 2 3 1 1 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. County = col_character(),
## .. Year = col_double(),
## .. `Crime Type` = col_character(),
## .. `Anti-Male` = col_double(),
## .. `Anti-Female` = col_double(),
## .. `Anti-Transgender` = col_double(),
## .. `Anti-Gender Identity Expression` = col_double(),
## .. `Anti-Age*` = col_double(),
## .. `Anti-White` = col_double(),
## .. `Anti-Black` = col_double(),
## .. `Anti-American Indian/Alaskan Native` = col_double(),
## .. `Anti-Asian` = col_double(),
## .. `Anti-Native Hawaiian/Pacific Islander` = col_double(),
## .. `Anti-Multi-Racial Groups` = col_double(),
## .. `Anti-Other Race` = col_double(),
## .. `Anti-Jewish` = col_double(),
## .. `Anti-Catholic` = col_double(),
## .. `Anti-Protestant` = col_double(),
## .. `Anti-Islamic (Muslim)` = col_double(),
## .. `Anti-Multi-Religious Groups` = col_double(),
## .. `Anti-Atheism/Agnosticism` = col_double(),
## .. `Anti-Religious Practice Generally` = col_double(),
## .. `Anti-Other Religion` = col_double(),
## .. `Anti-Buddhist` = col_double(),
## .. `Anti-Eastern Orthodox (Greek, Russian, etc.)` = col_double(),
## .. `Anti-Hindu` = col_double(),
## .. `Anti-Jehovahs Witness` = col_double(),
## .. `Anti-Mormon` = col_double(),
## .. `Anti-Other Christian` = col_double(),
## .. `Anti-Sikh` = col_double(),
## .. `Anti-Hispanic` = col_double(),
## .. `Anti-Arab` = col_double(),
## .. `Anti-Other Ethnicity/National Origin` = col_double(),
## .. `Anti-Non-Hispanic*` = col_double(),
## .. `Anti-Gay Male` = col_double(),
## .. `Anti-Gay Female` = col_double(),
## .. `Anti-Gay (Male and Female)` = col_double(),
## .. `Anti-Heterosexual` = col_double(),
## .. `Anti-Bisexual` = col_double(),
## .. `Anti-Physical Disability` = col_double(),
## .. `Anti-Mental Disability` = col_double(),
## .. `Total Incidents` = col_double(),
## .. `Total Victims` = col_double(),
## .. `Total Offenders` = col_double()
## .. )
# Look at the summary statistics (min, quartiles, median, mean, max) for these variables
summary(hatecrimes)
## county year crimetype anti-male
## Length:423 Min. :2010 Length:423 Min. :0.000000
## Class :character 1st Qu.:2011 Class :character 1st Qu.:0.000000
## Mode :character Median :2013 Mode :character Median :0.000000
## Mean :2013 Mean :0.007092
## 3rd Qu.:2015 3rd Qu.:0.000000
## Max. :2016 Max. :1.000000
## anti-female anti-transgender anti-genderidentityexpression
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.01655 Mean :0.04728 Mean :0.05674
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :5.00000 Max. :3.00000
## anti-age* anti-white anti-black
## Min. :0.00000 Min. : 0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median : 1.000
## Mean :0.05201 Mean : 0.3357 Mean : 1.761
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.: 2.000
## Max. :9.00000 Max. :11.0000 Max. :18.000
## anti-americanindian/alaskannative anti-asian
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.0000
## Median :0.000000 Median :0.0000
## Mean :0.007092 Mean :0.1773
## 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1.000000 Max. :8.0000
## anti-nativehawaiian/pacificislander anti-multi-racialgroups
## Min. :0 Min. :0.00000
## 1st Qu.:0 1st Qu.:0.00000
## Median :0 Median :0.00000
## Mean :0 Mean :0.08511
## 3rd Qu.:0 3rd Qu.:0.00000
## Max. :0 Max. :3.00000
## anti-otherrace anti-jewish anti-catholic anti-protestant
## Min. :0 Min. : 0.000 Min. : 0.0000 Min. :0.00000
## 1st Qu.:0 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:0.00000
## Median :0 Median : 0.000 Median : 0.0000 Median :0.00000
## Mean :0 Mean : 3.981 Mean : 0.2695 Mean :0.02364
## 3rd Qu.:0 3rd Qu.: 3.000 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :0 Max. :82.000 Max. :12.0000 Max. :1.00000
## anti-islamic(muslim) anti-multi-religiousgroups anti-atheism/agnosticism
## Min. : 0.0000 Min. : 0.00000 Min. :0
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.:0
## Median : 0.0000 Median : 0.00000 Median :0
## Mean : 0.4704 Mean : 0.07565 Mean :0
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.:0
## Max. :10.0000 Max. :10.00000 Max. :0
## anti-religiouspracticegenerally anti-otherreligion anti-buddhist
## Min. :0.000000 Min. :0.000 Min. :0
## 1st Qu.:0.000000 1st Qu.:0.000 1st Qu.:0
## Median :0.000000 Median :0.000 Median :0
## Mean :0.007092 Mean :0.104 Mean :0
## 3rd Qu.:0.000000 3rd Qu.:0.000 3rd Qu.:0
## Max. :2.000000 Max. :4.000 Max. :0
## anti-easternorthodox(greek,russian,etc.) anti-hindu
## Min. :0.000000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.000000 Median :0.000000
## Mean :0.002364 Mean :0.002364
## 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.000000
## anti-jehovahswitness anti-mormon anti-otherchristian anti-sikh
## Min. :0 Min. :0 Min. :0.00000 Min. :0
## 1st Qu.:0 1st Qu.:0 1st Qu.:0.00000 1st Qu.:0
## Median :0 Median :0 Median :0.00000 Median :0
## Mean :0 Mean :0 Mean :0.01655 Mean :0
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0.00000 3rd Qu.:0
## Max. :0 Max. :0 Max. :3.00000 Max. :0
## anti-hispanic anti-arab anti-otherethnicity/nationalorigin
## Min. : 0.0000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 0.0000 Median :0.00000 Median : 0.0000
## Mean : 0.3735 Mean :0.06619 Mean : 0.2837
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.0000
## Max. :17.0000 Max. :2.00000 Max. :19.0000
## anti-non-hispanic* anti-gaymale anti-gayfemale
## Min. :0 Min. : 0.000 Min. :0.0000
## 1st Qu.:0 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0 Median : 0.000 Median :0.0000
## Mean :0 Mean : 1.499 Mean :0.2411
## 3rd Qu.:0 3rd Qu.: 1.000 3rd Qu.:0.0000
## Max. :0 Max. :36.000 Max. :8.0000
## anti-gay(maleandfemale) anti-heterosexual anti-bisexual
## Min. :0.0000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.0000 Median :0.000000 Median :0.000000
## Mean :0.1017 Mean :0.002364 Mean :0.004728
## 3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :4.0000 Max. :1.000000 Max. :1.000000
## anti-physicaldisability anti-mentaldisability totalincidents
## Min. :0.00000 Min. :0.000000 Min. : 1.00
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.: 1.00
## Median :0.00000 Median :0.000000 Median : 3.00
## Mean :0.01182 Mean :0.009456 Mean : 10.09
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.: 10.00
## Max. :1.00000 Max. :1.000000 Max. :101.00
## totalvictims totaloffenders
## Min. : 1.00 Min. : 1.00
## 1st Qu.: 1.00 1st Qu.: 1.00
## Median : 3.00 Median : 3.00
## Mean : 10.48 Mean : 11.77
## 3rd Qu.: 10.00 3rd Qu.: 11.00
## Max. :106.00 Max. :113.00
# Look at the most prominent types of hate crimes (8 groups of people).
# Keep the two identifying columns, eight of the "anti-" categories and the
# three totals. Non-syntactic column names are quoted with backticks.
hatecrimes2 <- select(
  hatecrimes,
  county, year,
  `anti-black`, `anti-white`, `anti-jewish`, `anti-catholic`,
  `anti-age*`, `anti-islamic(muslim)`, `anti-gaymale`, `anti-hispanic`,
  totalincidents, totalvictims, totaloffenders
)
# Preview the first rows of the reduced data set.
head(hatecrimes2)
## # A tibble: 6 x 13
## county year `anti-black` `anti-white` `anti-jewish` `anti-catholic`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Albany 2016 1 0 0 0
## 2 Albany 2016 2 0 0 0
## 3 Alleg… 2016 1 0 0 0
## 4 Bronx 2016 0 1 0 0
## 5 Bronx 2016 0 1 1 0
## 6 Broome 2016 1 0 0 0
## # … with 7 more variables: `anti-age*` <dbl>,
## # `anti-islamic(muslim)` <dbl>, `anti-gaymale` <dbl>,
## # `anti-hispanic` <dbl>, totalincidents <dbl>, totalvictims <dbl>,
## # totaloffenders <dbl>
# In the head data frame, Bronx county had the highest number of incidents.
# QUESTION - How is the select function different from the filter function?
# Check dimensions to count how many variables remain
dim(hatecrimes2)
## [1] 423 13
# We started off with 44 variables, and now have only 13. The number of rows (423) remains the same.
summary(hatecrimes2)
## county year anti-black anti-white
## Length:423 Min. :2010 Min. : 0.000 Min. : 0.0000
## Class :character 1st Qu.:2011 1st Qu.: 0.000 1st Qu.: 0.0000
## Mode :character Median :2013 Median : 1.000 Median : 0.0000
## Mean :2013 Mean : 1.761 Mean : 0.3357
## 3rd Qu.:2015 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :2016 Max. :18.000 Max. :11.0000
## anti-jewish anti-catholic anti-age* anti-islamic(muslim)
## Min. : 0.000 Min. : 0.0000 Min. :0.00000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median : 0.000 Median : 0.0000 Median :0.00000 Median : 0.0000
## Mean : 3.981 Mean : 0.2695 Mean :0.05201 Mean : 0.4704
## 3rd Qu.: 3.000 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.0000
## Max. :82.000 Max. :12.0000 Max. :9.00000 Max. :10.0000
## anti-gaymale anti-hispanic totalincidents totalvictims
## Min. : 0.000 Min. : 0.0000 Min. : 1.00 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 1.00 1st Qu.: 1.00
## Median : 0.000 Median : 0.0000 Median : 3.00 Median : 3.00
## Mean : 1.499 Mean : 0.3735 Mean : 10.09 Mean : 10.48
## 3rd Qu.: 1.000 3rd Qu.: 0.0000 3rd Qu.: 10.00 3rd Qu.: 10.00
## Max. :36.000 Max. :17.0000 Max. :101.00 Max. :106.00
## totaloffenders
## Min. : 1.00
## 1st Qu.: 1.00
## Median : 3.00
## Mean : 11.77
## 3rd Qu.: 11.00
## Max. :113.00
Order the data, first by total incidents, then total offenders, then by total victims. It will be interesting to see if counties and years correlate with certain types of crimes.
# Use the arrange function to order, and desc to specify how it is ordered
# Sort from highest to lowest: first by total incidents, then (to break ties)
# by total offenders, then by total victims.
# desc() takes a single vector, so each sort key needs its own desc() call;
# passing all three columns to one desc() does not sort by the later keys.
ordered <- hatecrimes2 %>%
  arrange(desc(totalincidents), desc(totaloffenders), desc(totalvictims))
head(ordered)
## # A tibble: 6 x 13
## county year `anti-black` `anti-white` `anti-jewish` `anti-catholic`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Kings 2012 4 1 82 6
## 2 Suffo… 2012 18 0 48 7
## 3 Kings 2010 10 3 34 0
## 4 New Y… 2016 6 5 9 0
## 5 Kings 2015 6 3 35 0
## 6 Kings 2016 4 6 26 1
## # … with 7 more variables: `anti-age*` <dbl>,
## # `anti-islamic(muslim)` <dbl>, `anti-gaymale` <dbl>,
## # `anti-hispanic` <dbl>, totalincidents <dbl>, totalvictims <dbl>,
## # totaloffenders <dbl>
# Kings and Suffolk counties seem to have the most hate crimes.
Look at each set of hate-crimes for each type for each year. Use the package “tidyr” to convert the dataset from wide to long with the command “gather”. It will take each column’s hate-crime type and combine them all into one column called “id”. Then each cell count will go into the new column, “crimecount”. Finally, we are only doing this for the quantitative variables, which are in columns 3 - 13. Note the command facet_wrap requires (~) before “id”.
# install.packages("reshape2") - note: facet_wrap() is provided by ggplot2, so reshape2 is not required for it
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Reshape from wide to long with gather():
#   - the former column names of columns 3-13 go into a new column "id"
#   - the cell counts of those columns go into a new column "crimecount"
# Only the quantitative columns (3-13) are gathered.
hatecrimeslong <- tidyr::gather(
  ordered,
  key = "id", value = "crimecount",
  3:13
)
# Scatterplot of the long data: year on the x-axis, crimecount on the y-axis.
# Points are colour-coded by "id", and facet_wrap() draws one panel per "id".
hatecrimesplot <- ggplot(
  hatecrimeslong,
  aes(x = year, y = crimecount, color = id)
) +
  geom_point() +
  facet_wrap(~id)
hatecrimesplot
# Notice that a separate plot is created for each variable, and each plot has its own color. These plots are arranged side by side. This was accomplished by facet_wrap.
From the facet_wrap plot above, anti-black, anti-gay males, and anti-jewish categories seem to have highest rates of offenses reported.
# Keep only the rows whose "id" is one of those 3 crime categories.
hatenew <- filter(
  hatecrimeslong,
  id == "anti-black" | id == "anti-jewish" | id == "anti-gaymale"
)
# After using filter, the new data subset contains just those 3 crimes.
# Notice the == operator.
Use the following commands to finalize your barplot: - position = “dodge” makes side-by-side bars, rather than stacked bars - stat = “identity” allows you to plot each set of bars for each year between 2010 and 2016 - ggtitle gives the plot a title - labs gives a title to the legend
# hatenew has one row per county, year and crime type. Sum the counts first so
# there is exactly one bar per year and crime type; otherwise position = "dodge"
# draws the bars of all counties on top of each other and only the tallest
# single-county value is visible.
# position = "dodge" makes side-by-side bars instead of stacked bars
# stat = "identity" uses the crimecount values themselves as the bar heights
plot2 <- hatenew %>%
  group_by(year, id) %>%
  summarize(crimecount = sum(crimecount)) %>%
  ungroup() %>%
  ggplot() +
  geom_bar(aes(x = year, y = crimecount, fill = id),
           position = "dodge", stat = "identity") +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot2
# Let's see the same plot WITHOUT position = "dodge".
# The default position, "stack", is written out here instead of leaving the
# position argument empty.
plot2 <- ggplot(hatenew) +
  geom_bar(
    aes(x = year, y = crimecount, fill = id),
    position = "stack",
    stat = "identity"
  ) +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot2
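# You get stacked bars, because "stack" is the default position. stat = "identity" is still needed: it tells geom_bar() to use the crimecount values as bar heights instead of counting rows.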
The next place to explore. Make bar graphs by county instead of by year.
# To show the bar graphs by county, change the x variable in the aes function.
# The counts are summed over all years first, so each county has one bar per
# crime type; with position = "dodge" the yearly rows would otherwise be drawn
# on top of each other and only the largest single-year value would show.
plot3 <- hatenew %>%
  group_by(county, id) %>%
  summarize(crimecount = sum(crimecount)) %>%
  ungroup() %>%
  ggplot() +
  geom_bar(aes(x = county, y = crimecount, fill = id),
           position = "dodge", stat = "identity") +
  ggtitle("Hate Crime Type in NY Counties Between 2010-2016") +
  ylab("Number of Hate Crime Incidents") +
  labs(fill = "Hate Crime Type")
plot3
# The resulting graph is very messy and not useful.
There are too many counties for this plot to make sense, but maybe we can just look at the 5 counties with the highest number of incidents.
# Steps:
#   1. group_by()      - group the rows by county
#   2. summarise()     - add up crimecount within each county (column "sum")
#   3. arrange(desc()) - order the counties from highest to lowest sum
#   4. top_n()         - keep the 5 counties with the highest sums
counties <- hatenew %>%
  group_by(county) %>%
  summarise(sum = sum(crimecount)) %>%
  arrange(desc(sum)) %>%
  top_n(n = 5)
## Selecting by sum
counties
## # A tibble: 5 x 2
## county sum
## <chr> <dbl>
## 1 Kings 713
## 2 New York 459
## 3 Suffolk 360
## 4 Nassau 298
## 5 Queens 235
# Now we have only 5 rows (the top 5 counties)
# The list is arranged from highest to lowest crimecount (represented by sum)
Finally, create the barplot above, but only for the 5 counties with the highest incidents of hate-crimes. The command “labs” is nice, because you can get a title, subtitle, y-axis label, and legend title, all in one command.
# Create a barplot of those top 5 counties.
# Use filter to select those top 5 counties, then sum the counts over all
# years so each county has one bar per crime type (with position = "dodge",
# un-summed yearly rows would be drawn on top of each other).
# Use labs to label the plot in one command. labs() names axis labels by
# aesthetic, so the y-axis label is passed as y = (not ylab =).
plot4 <- hatenew %>%
  filter(county %in% c("Kings", "New York", "Suffolk", "Nassau", "Queens")) %>%
  group_by(county, id) %>%
  summarize(crimecount = sum(crimecount)) %>%
  ungroup() %>%
  ggplot() +
  geom_bar(aes(x = county, y = crimecount, fill = id),
           position = "dodge", stat = "identity") +
  labs(y = "Number of Hate Crime Incidents",
       title = "5 Counties in NY with Highest Incidents of Hate Crimes",
       subtitle = "Between 2010-2016",
       fill = "Hate Crime Type")
plot4
# Notice that fill= is used twice here. This is tricky.
# CONFIRM THIS IS CORRECT - The first fill appears in aes(). This specifies how the bars are color-coded. The second fill appears in labs(). This specifies the legend title.
# Notice subtitle= in labs. Subtitle appears below the main title.
Hate crimes are comprised of Crimes Against Persons and Property Crimes. How do these two types of hate crimes break down in NY? Are there any interesting patterns?
For example, is one type more prevalent than the other? Where? Are there any upward or downward trends for the number of crimes over time?
I intend to create a barplot showing the number of Crimes Against Person compared with the number of Property Crimes over the period 2010-2016.
To get there, I have created some smaller plots to analyze the data.
The final plot appears at the end of this file.
Let’s first look at just Property Crimes.
# Keep only the rows whose crime type is "Property Crimes"
hateProp <- filter(hatecrimes, crimetype == "Property Crimes")
# The new data subset is 190 rows x 44 variables. Note that we are looking at ALL the hate groups now, not just the top 8 types from the tutorial above.
# Narrow the property crimes down to a single year
hatePropByYear <- filter(
  hatecrimes,
  crimetype == "Property Crimes" & year == 2016
)
# The number of counties that show up in the filter results will vary by year.
For instance, in 2010, 26 counties have property hate crime incidents. In 2016, 32 counties do.
If the number of counties was higher, did the number of property hate crimes go up over this period too? Let’s create a plot to find out.
# Stacked bars of property hate crime incidents per year; each county row
# contributes one segment to its year's bar.
# labs() names axis labels by aesthetic, so the y-axis label is set with y =.
plotPropTrend <- hateProp %>%
  ggplot() +
  geom_bar(aes(x = year, y = totalincidents),
           position = "stack", stat = "identity") +
  labs(title = "Property Hate Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       y = "Number of Property Hate Crime Incidents")
plotPropTrend
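# The y-axis label only shows up when labs() is given y = ...; labs() expects aesthetic names, so an argument named ylab = is silently ignored (ylab() is a separate function).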
# There is no fill in aes() or labs( ). Adding it in makes no difference. Need to understand fill better.
# The bars are further subdivided (light gray lines), showing the share (proportion) of each county.
# Find the total number of property crimes for each year. To do this, sum up the totals for all the counties for each year. There should be 7 rows (corresponding to a year), arranged in descending order.
# Yearly totals of property hate crime incidents, largest total first.
property_rows <- hatecrimes$crimetype == "Property Crimes"
county_sum1 <- hatecrimes %>%
  filter(property_rows) %>%
  group_by(year) %>%
  summarise(year_sum = sum(totalincidents)) %>%
  arrange(desc(year_sum))
rm(property_rows)
# head() shows the first six of the yearly rows.
head(county_sum1)
## # A tibble: 6 x 2
## year year_sum
## <dbl> <dbl>
## 1 2012 419
## 2 2016 318
## 3 2013 306
## 4 2014 301
## 5 2011 277
## 6 2010 239
Whew! I held my breath to see if the above snippet of code would work (lifted right off the Pfizer Tutorial). Looks like it did. This should make the remaining graphs far easier.
# Re-plot the property crimes from 2010-2016 using county_sum1, which holds
# one total per year, so each bar is a single solid block.
# labs() names axis labels by aesthetic, so the y-axis label is set with y =.
plotPropTrend2 <- county_sum1 %>%
  ggplot() +
  geom_bar(aes(x = year, y = year_sum),
           position = "stack", stat = "identity") +
  labs(title = "Property Hate Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       y = "Number of Property Hate Crime Incidents")
plotPropTrend2
# The bars are now a solid color. No longer subdivided by lines.
# FOLLOW-UP - Tried different ways to change the bar color: fill=, color=, scale_fill_discrete, etc. None are working. The bars keep showing up black, red, or black with a red border.
The above barplot shows there were indeed more property crimes in 2016 than in 2010. But more interesting is 2012, which had the highest number of property crimes.
Let’s look at how many counties reported property hate crimes in 2012.
# Keep only the property crime rows recorded for 2012
hatePropByYear <- filter(
  hatecrimes,
  crimetype == "Property Crimes" & year == 2012
)
The dataset shows only 23 counties reported property crimes in 2012. That means, there were more hate crimes in particular counties driving the overall number up.
# Same 2012 property crime subset, ordered so the counties with the most
# incidents come first.
hatePropByYear <- arrange(
  filter(hatecrimes, crimetype == "Property Crimes" & year == 2012),
  desc(totalincidents)
)
# Plot the number of property hate crimes in 2012 for all counties.
# labs() names axis labels by aesthetic, so the y-axis label is set with y =.
# No fill aesthetic is mapped, so no legend title is needed.
plotProp2012 <- hatePropByYear %>%
  ggplot() +
  geom_bar(aes(x = county, y = totalincidents),
           position = "dodge", stat = "identity") +
  labs(y = "Number of Property Hate Crime Incidents",
       title = "Total Property Hate Crimes for All NY Counties",
       subtitle = "2012 only")
plotProp2012
# NOTE - labs(ylab = ...) does not set the y-axis label; labs() needs y = ... for that.
While this is a messy graph, you can see the counties appear dichotomous - they have either a significant number of property crimes or very few.
Getting back to Crimes Against Persons.
# Keep only the rows whose crime type is "Crimes Against Persons"
hatePerson <- filter(hatecrimes, crimetype == "Crimes Against Persons")
# The new data subset is 233 rows x 44 variables.
# Roll-up the dataset for Crimes Against Persons. Show just the yearly totals for 2010-2016.
# Yearly totals of incidents for crimes against persons, largest total first.
person_rows <- hatecrimes$crimetype == "Crimes Against Persons"
county_sum2 <- hatecrimes %>%
  filter(person_rows) %>%
  group_by(year) %>%
  summarise(year_sum = sum(totalincidents)) %>%
  arrange(desc(year_sum))
rm(person_rows)
# head() shows the first six of the yearly rows.
head(county_sum2)
## # A tibble: 6 x 2
## year year_sum
## <dbl> <dbl>
## 1 2010 464
## 2 2012 315
## 3 2013 311
## 4 2016 280
## 5 2011 279
## 6 2015 275
# Plot the number of person crimes over 2010-2016 from the yearly totals.
# fill = year maps the numeric year to the bar colour.
# labs() names axis labels by aesthetic, so the y-axis label is set with y =.
plotPersonTrend <- county_sum2 %>%
  ggplot() +
  geom_bar(aes(x = year, y = year_sum, fill = year),
           position = "stack", stat = "identity") +
  labs(title = "Hate Crimes Against Persons in All NY Counties",
       subtitle = "Between 2010-2016",
       y = "Yearly Total of Person Hate Crime Incidents")
plotPersonTrend
# Added fill = year in aes(). It puts the bars into a scaling blue color scheme like the side-by-side boxplots in Week 2 Air Quality Tutorial. How to make it one color that is not black?
# Adding color="blue" does something VERY WEIRD -- black bars with red borders. Never mind about the color. Moving on..
# Plot the total number of Person crimes vs Property crimes over 2010-2016.
# Stacking adds up every county row, so each bar's height is the yearly total.
# labs() names axis labels by aesthetic, so the y-axis label is set with y =.
plotComp <- hatecrimes %>%
  ggplot() +
  geom_bar(aes(x = year, y = totalincidents, fill = crimetype),
           position = "stack", stat = "identity") +
  labs(title = "Crimes Against Persons vs Property Crimes in All NY Counties",
       subtitle = "Between 2010-2016",
       y = "Number of Hate Crime Incidents",
       fill = "Hate Crime Type")
plotComp
# Notice that this is a stacked barplot because position = "dodge" was omitted.
# It is hard to make comparisons on a stacked barplot. Let's look at it unstacked.
# Side-by-side version of the comparison. The incidents are summed per year
# and crime type first: with position = "dodge" and stat = "identity", the
# un-summed county rows would be drawn on top of each other, so each bar would
# only reach the largest single-county value instead of the yearly total.
plotComp1 <- hatecrimes %>%
  group_by(year, crimetype) %>%
  summarize(totalincidents = sum(totalincidents)) %>%
  ungroup() %>%
  ggplot() +
  geom_bar(aes(x = year, y = totalincidents, fill = crimetype),
           position = "dodge", stat = "identity") +
  ylab("Number of Hate Crime Incidents") +
  xlab("Year") +
  ggtitle("Crimes Against Person vs Property Crimes in All NY Counties") +
  labs(subtitle = "Between 2010-2016")
plotComp1
# FOLLOW-UP - is there a way to add a subtitle without labs()?
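Something looks wrong when the per-county rows are plotted directly with position = "dodge": the person and property bar heights do not match the stacked chart, and the y-axis totals are different. The stacked chart has the correct totals, because stacking adds up every county's row for a year (consistent with the yearly sums computed above, e.g. 419 property crimes in 2012). With position = "dodge" and stat = "identity", the county bars for a year are drawn on top of one another, so only the tallest single-county value is visible. Summing totalincidents by year and crime type before plotting makes the two charts agree.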
I’ve been looking at this too long and don’t want to go down this wormhole any longer. So, for the purpose of this exercise, let’s assume that the y-axis total are correct..
Showing Crimes Against Persons next to Property Crimes on the same chart is helpful.
Person crimes increased each year from 2011-2016. Why was 2010 higher than the following years? A lingering effect from the 2008 financial crisis? What caused the number to drop significantly in 2011?
Property crimes also show a general upward trend, from 2011-2014, with an anomaly in 2012. This peak in 2012 is more evident now than it was in the barplot for just property crimes. What happened in 2012? And what caused it to drop significantly in 2013, and then again in 2015?
The initial analysis invites more analysis!