library(readr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ dplyr 1.0.1
## ✓ tibble 3.0.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(RColorBrewer)
library(stringr)
library(ggthemes)
library(leaflet)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(acs)
## Loading required package: XML
##
## Attaching package: 'acs'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:base':
##
## apply
library(blorr)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.5-18, (SVN revision 1082)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.1.1, released 2020/06/22
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/sf/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/proj
## Linking to sp version:1.4-4
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.1.1, PROJ 6.3.1
library(tmap)
library(tmaptools)
library(rio)
library(sp)
library(leaflet.extras)
setwd("~/Desktop/R Datasets/Kaggle Datasets/")
crime <- read_csv("Crime_Data_2010_2017.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `DR Number` = col_double(),
## `Time Occurred` = col_double(),
## `Area ID` = col_double(),
## `Reporting District` = col_double(),
## `Crime Code` = col_double(),
## `Victim Age` = col_double(),
## `Premise Code` = col_double(),
## `Weapon Used Code` = col_double(),
## `Crime Code 1` = col_double(),
## `Crime Code 2` = col_double(),
## `Crime Code 3` = col_logical(),
## `Crime Code 4` = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 2252 parsing failures.
## row col expected actual file
## 2259 Crime Code 3 1/0/T/F/TRUE/FALSE 998.0 'Crime_Data_2010_2017.csv'
## 3506 Crime Code 3 1/0/T/F/TRUE/FALSE 998.0 'Crime_Data_2010_2017.csv'
## 3745 Crime Code 3 1/0/T/F/TRUE/FALSE 998.0 'Crime_Data_2010_2017.csv'
## 6442 Crime Code 3 1/0/T/F/TRUE/FALSE 998.0 'Crime_Data_2010_2017.csv'
## 8297 Crime Code 3 1/0/T/F/TRUE/FALSE 998.0 'Crime_Data_2010_2017.csv'
## .... ............ .................. ...... ..........................
## See problems(...) for more details.
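# Note: the 2,252 parsing failures above are harmless here, since the affected
# columns are dropped below; they could be avoided at read time by overriding
# readr's type guess for the two mostly-empty columns (not run here):
# crime <- read_csv("Crime_Data_2010_2017.csv",
#                   col_types = cols(`Crime Code 3` = col_double(),
#                                    `Crime Code 4` = col_double()))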
mo_codes <- read_csv("MO_Codes.csv")
## Parsed with column specification:
## cols(
## MO_Code = col_double(),
## Description = col_character()
## )
str(crime)
## tibble [1,584,316 × 26] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DR Number : num [1:1584316] 1.21e+06 1.02e+08 4.18e+02 1.02e+08 4.21e+07 ...
## $ Date Reported : chr [1:1584316] "03/14/2013" "01/25/2010" "03/19/2013" "11/11/2010" ...
## $ Date Occurred : chr [1:1584316] "03/11/2013" "01/22/2010" "03/18/2013" "11/10/2010" ...
## $ Time Occurred : num [1:1584316] 1800 2300 2030 1800 2300 1400 2230 1600 1600 800 ...
## $ Area ID : num [1:1584316] 12 20 18 18 21 1 11 16 19 9 ...
## $ Area Name : chr [1:1584316] "77th Street" "Olympic" "Southeast" "Southeast" ...
## $ Reporting District : num [1:1584316] 1241 2071 1823 1803 2133 ...
## $ Crime Code : num [1:1584316] 626 510 510 510 745 110 510 510 510 668 ...
## $ Crime Code Description: chr [1:1584316] "INTIMATE PARTNER - SIMPLE ASSAULT" "VEHICLE - STOLEN" "VEHICLE - STOLEN" "VEHICLE - STOLEN" ...
## $ MO Codes : chr [1:1584316] "0416 0446 1243 2000" NA NA NA ...
## $ Victim Age : num [1:1584316] 30 NA 12 NA 84 49 NA NA NA 27 ...
## $ Victim Sex : chr [1:1584316] "F" NA NA NA ...
## $ Victim Descent : chr [1:1584316] "W" NA NA NA ...
## $ Premise Code : num [1:1584316] 502 101 101 101 501 501 108 101 101 203 ...
## $ Premise Description : chr [1:1584316] "MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)" "STREET" "STREET" "STREET" ...
## $ Weapon Used Code : num [1:1584316] 400 NA NA NA NA 400 NA NA NA NA ...
## $ Weapon Description : chr [1:1584316] "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)" NA NA NA ...
## $ Status Code : chr [1:1584316] "AO" "IC" "IC" "IC" ...
## $ Status Description : chr [1:1584316] "Adult Other" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ Crime Code 1 : num [1:1584316] 626 510 510 510 745 110 510 510 510 668 ...
## $ Crime Code 2 : num [1:1584316] NA NA NA NA NA NA NA NA NA NA ...
## $ Crime Code 3 : logi [1:1584316] NA NA NA NA NA NA ...
## $ Crime Code 4 : logi [1:1584316] NA NA NA NA NA NA ...
## $ Address : chr [1:1584316] "6300 BRYNHURST AV" "VAN NESS" "200 E 104TH ST" "88TH" ...
## $ Cross Street : chr [1:1584316] NA "15TH" NA "WALL" ...
## $ Location : chr [1:1584316] "(33.9829, -118.3338)" "(34.0454, -118.3157)" "(33.942, -118.2717)" "(33.9572, -118.2717)" ...
## - attr(*, "problems")= tibble [2,252 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:2252] 2259 3506 3745 6442 8297 8508 8532 8543 9116 10069 ...
## ..$ col : chr [1:2252] "Crime Code 3" "Crime Code 3" "Crime Code 3" "Crime Code 3" ...
## ..$ expected: chr [1:2252] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
## ..$ actual : chr [1:2252] "998.0" "998.0" "998.0" "998.0" ...
## ..$ file : chr [1:2252] "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. `DR Number` = col_double(),
## .. `Date Reported` = col_character(),
## .. `Date Occurred` = col_character(),
## .. `Time Occurred` = col_double(),
## .. `Area ID` = col_double(),
## .. `Area Name` = col_character(),
## .. `Reporting District` = col_double(),
## .. `Crime Code` = col_double(),
## .. `Crime Code Description` = col_character(),
## .. `MO Codes` = col_character(),
## .. `Victim Age` = col_double(),
## .. `Victim Sex` = col_character(),
## .. `Victim Descent` = col_character(),
## .. `Premise Code` = col_double(),
## .. `Premise Description` = col_character(),
## .. `Weapon Used Code` = col_double(),
## .. `Weapon Description` = col_character(),
## .. `Status Code` = col_character(),
## .. `Status Description` = col_character(),
## .. `Crime Code 1` = col_double(),
## .. `Crime Code 2` = col_double(),
## .. `Crime Code 3` = col_logical(),
## .. `Crime Code 4` = col_logical(),
## .. Address = col_character(),
## .. `Cross Street` = col_character(),
## .. Location = col_character()
## .. )
head(crime)
## # A tibble: 6 x 26
## `DR Number` `Date Reported` `Date Occurred` `Time Occurred` `Area ID`
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1208575 03/14/2013 03/11/2013 1800 12
## 2 102005556 01/25/2010 01/22/2010 2300 20
## 3 418 03/19/2013 03/18/2013 2030 18
## 4 101822289 11/11/2010 11/10/2010 1800 18
## 5 42104479 01/11/2014 01/04/2014 2300 21
## 6 120125367 01/08/2013 01/08/2013 1400 1
## # … with 21 more variables: `Area Name` <chr>, `Reporting District` <dbl>,
## # `Crime Code` <dbl>, `Crime Code Description` <chr>, `MO Codes` <chr>,
## # `Victim Age` <dbl>, `Victim Sex` <chr>, `Victim Descent` <chr>, `Premise
## # Code` <dbl>, `Premise Description` <chr>, `Weapon Used Code` <dbl>, `Weapon
## # Description` <chr>, `Status Code` <chr>, `Status Description` <chr>, `Crime
## # Code 1` <dbl>, `Crime Code 2` <dbl>, `Crime Code 3` <lgl>, `Crime Code
## # 4` <lgl>, Address <chr>, `Cross Street` <chr>, Location <chr>
crime <- crime %>% dplyr::select(-(`Crime Code 2`:`Crime Code 4`))
head(crime)
## # A tibble: 6 x 23
## `DR Number` `Date Reported` `Date Occurred` `Time Occurred` `Area ID`
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1208575 03/14/2013 03/11/2013 1800 12
## 2 102005556 01/25/2010 01/22/2010 2300 20
## 3 418 03/19/2013 03/18/2013 2030 18
## 4 101822289 11/11/2010 11/10/2010 1800 18
## 5 42104479 01/11/2014 01/04/2014 2300 21
## 6 120125367 01/08/2013 01/08/2013 1400 1
## # … with 18 more variables: `Area Name` <chr>, `Reporting District` <dbl>,
## # `Crime Code` <dbl>, `Crime Code Description` <chr>, `MO Codes` <chr>,
## # `Victim Age` <dbl>, `Victim Sex` <chr>, `Victim Descent` <chr>, `Premise
## # Code` <dbl>, `Premise Description` <chr>, `Weapon Used Code` <dbl>, `Weapon
## # Description` <chr>, `Status Code` <chr>, `Status Description` <chr>, `Crime
## # Code 1` <dbl>, Address <chr>, `Cross Street` <chr>, Location <chr>
crime1 <- na.omit(crime)
str(crime1)
## tibble [129,430 × 23] (S3: tbl_df/tbl/data.frame)
## $ DR Number : num [1:129430] 1.3e+08 1.3e+08 1.3e+08 1.3e+08 1.3e+08 ...
## $ Date Reported : chr [1:129430] "01/02/2013" "01/05/2013" "01/12/2013" "01/13/2013" ...
## $ Date Occurred : chr [1:129430] "01/02/2013" "01/05/2013" "01/12/2013" "01/10/2013" ...
## $ Time Occurred : num [1:129430] 1430 1600 1815 300 1225 ...
## $ Area ID : num [1:129430] 1 1 1 1 1 1 1 1 1 1 ...
## $ Area Name : chr [1:129430] "Central" "Central" "Central" "Central" ...
## $ Reporting District : num [1:129430] 134 156 105 166 134 147 166 145 176 152 ...
## $ Crime Code : num [1:129430] 626 626 210 821 220 230 230 930 210 210 ...
## $ Crime Code Description: chr [1:129430] "INTIMATE PARTNER - SIMPLE ASSAULT" "INTIMATE PARTNER - SIMPLE ASSAULT" "ROBBERY" "SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH 0007=02" ...
## $ MO Codes : chr [1:129430] "2004 1218 1813 1814 2002 0444 2000" "0305 0400 0429 2000" "1218 1310 0344 0400 0352" "0507 2004 0503 0519 0516 0501 0506 0400" ...
## $ Victim Age : num [1:129430] 39 50 24 18 45 46 49 36 56 26 ...
## $ Victim Sex : chr [1:129430] "F" "F" "M" "M" ...
## $ Victim Descent : chr [1:129430] "H" "B" "H" "W" ...
## $ Premise Code : num [1:129430] 101 102 101 102 102 101 203 102 203 406 ...
## $ Premise Description : chr [1:129430] "STREET" "SIDEWALK" "STREET" "SIDEWALK" ...
## $ Weapon Used Code : num [1:129430] 400 400 400 400 400 500 500 207 400 400 ...
## $ Weapon Description : chr [1:129430] "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)" "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)" "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)" "STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)" ...
## $ Status Code : chr [1:129430] "IC" "AO" "IC" "IC" ...
## $ Status Description : chr [1:129430] "Invest Cont" "Adult Other" "Invest Cont" "Invest Cont" ...
## $ Crime Code 1 : num [1:129430] 626 626 210 821 220 230 230 930 210 210 ...
## $ Address : chr [1:129430] "4TH" "6TH ST" "BISHOPS" "6TH" ...
## $ Cross Street : chr [1:129430] "HILL" "SAN JULIAN ST" "BROADWAY" "SAN JULIAN" ...
## $ Location : chr [1:129430] "(34.0503, -118.2504)" "(34.0423, -118.2452)" "(34.0688, -118.2333)" "(34.0423, -118.2452)" ...
## - attr(*, "problems")= tibble [2,252 × 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:2252] 2259 3506 3745 6442 8297 8508 8532 8543 9116 10069 ...
## ..$ col : chr [1:2252] "Crime Code 3" "Crime Code 3" "Crime Code 3" "Crime Code 3" ...
## ..$ expected: chr [1:2252] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
## ..$ actual : chr [1:2252] "998.0" "998.0" "998.0" "998.0" ...
## ..$ file : chr [1:2252] "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" "'Crime_Data_2010_2017.csv'" ...
## - attr(*, "na.action")= 'omit' Named int [1:1454886] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:1454886] "1" "2" "3" "4" ...
crime1 <- crime1 %>% mutate(`Date Reported` = mdy(`Date Reported`),
                            `Date Occurred` = mdy(`Date Occurred`),
                            `Area Name` = as.factor(`Area Name`),
                            `Crime Code Description` = as.factor(`Crime Code Description`),
                            `Victim Sex` = as.factor(`Victim Sex`),
                            `Victim Descent` = as.factor(`Victim Descent`),
                            `Premise Description` = as.factor(`Premise Description`),
                            `Weapon Description` = as.factor(`Weapon Description`),
                            `Status Description` = as.factor(`Status Description`))
crime1 <- crime1 %>% dplyr::select(-c(`Crime Code`, `Weapon Used Code`, `Premise Code`, `Status Code`, `Crime Code 1`))
crime2 <- crime1 %>% extract(Location,c("Latitude","Longitude"), "\\(([^,]+), ([^)]+)\\)")
crime2 <- crime2 %>% mutate(Latitude = as.numeric(Latitude), Longitude = as.numeric(Longitude))
crime2 <- crime2 %>% dplyr::filter(Latitude != 0)
nrow(crime2)
## [1] 128951
mo_codes<- mo_codes %>% mutate(Description = as.factor(Description))
head(mo_codes)
## # A tibble: 6 x 2
## MO_Code Description
## <dbl> <fct>
## 1 100 Suspect Impersonate
## 2 101 Aid victim
## 3 102 Blind
## 4 103 Crippled
## 5 104 Customer
## 6 105 Delivery
crime3 <- crime2 %>% separate(`MO Codes`,c('MO Code1','MO Code2','MO Code3','MO Code4','MO Code5',
'MO Code6','MO Code7','MO Code8'), sep = " ")
## Warning: Expected 8 pieces. Additional pieces discarded in 9955 rows [90, 227,
## 295, 330, 395, 805, 1184, 1198, 1201, 1212, 1229, 1230, 1232, 1235, 1242, 1261,
## 1262, 1309, 1316, 1321, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 113153 rows [1,
## 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
crime3 <- crime3 %>% mutate(across(`MO Code1`:`MO Code8`, as.numeric))
crime4 <- crime3 %>% select(-(`MO Code2`:`MO Code8`))
crime5 <- left_join(crime4, mo_codes, by=c('MO Code1' = "MO_Code"))
crime5 <- crime5 %>% filter(`Victim Sex` != "H") %>%
mutate(`Victim Descent`= case_when(`Victim Descent` == "A"~"Other Asian", `Victim Descent` == "B"~"Black",
`Victim Descent` == "C"~"Chinese", `Victim Descent` == "F"~"Filipino",
`Victim Descent` == "G"~"Guamanian",`Victim Descent` == "H"~"Hispanic",
`Victim Descent` == "I"~"Native American", `Victim Descent` == "J"~"Japanese",
`Victim Descent` == "K"~"Korean",`Victim Descent` == "O"~"Other", `Victim Descent` == "P"~"Pacific Islander",
`Victim Descent` == "S"~"Samoan",`Victim Descent` == "U"~"Hawaian",
`Victim Descent` == "V"~"Vietnamese", `Victim Descent` == "X"~"Unknown",
`Victim Descent` == "W"~"White", `Victim Descent` == "Z"~"Indian"))
crime6 <- crime5
crime6 <- crime6 %>% mutate(`Crime Code Description 2` = case_when(str_detect(`Crime Code Description`, "CHILD|CHLD")~"CHILD ABUSE",str_detect(`Crime Code Description`, "THEFT|BURGLARY|ROBBERY|STOLEN")~"THEFT",str_detect(`Crime Code Description`, "RAPE|SEX|SEXUAL")~"RAPE",str_detect(`Crime Code Description`, "ASSAULT|BATTERY")~"ASSAULT",str_detect(`Crime Code Description`, "VANDALISM")~"VANDALISM",str_detect(`Crime Code Description`, "DRIVING")~"RECKLESS DRIVING",str_detect(`Crime Code Description`, "VIOLATION")~"VIOLATION",str_detect(`Crime Code Description`, "SHOTS|WEAPON|WEAPONS|BOMB")~"ATTEMPTED HOMICIDE",str_detect(`Crime Code Description`, "KIDNAPPING")~"KIDNAPPING"))
crime7 <- crime6
# Fill the NAs left by case_when() with the original, more specific description
for (i in 1:nrow(crime7)){
if(is.na(crime7[i,"Crime Code Description 2"])){
crime7[i,"Crime Code Description 2"] = crime7[i,"Crime Code Description"]
}
}
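# A vectorized equivalent of the loop above, using coalesce() to fill the NAs
# left by case_when() with the original, more specific description:
crime7 <- crime6 %>%
  mutate(`Crime Code Description 2` = coalesce(`Crime Code Description 2`,
                                               as.character(`Crime Code Description`)))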
crime7$`Crime Code Description 2` <- as.factor(crime7$`Crime Code Description 2`)
#EDA
crime7 %>% ggplot() + geom_boxplot(mapping = aes(`Victim Age`)) + facet_wrap(~`Victim Sex`)
crime7 %>% ggplot() + geom_boxplot(mapping = aes(`Victim Age`)) + facet_wrap(~`Victim Descent`)
crime7 %>% ggplot() + geom_boxplot(mapping = aes(`Victim Age`)) + facet_wrap(~`Area Name`)
crime7 %>% ggplot() + geom_boxplot(mapping = aes(`Victim Age`)) + facet_wrap(~`Crime Code Description 2`)
crime7 %>% group_by(`Crime Code Description`,`Victim Sex`) %>% count() %>% filter(n>5000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description`,n, fill = `Victim Sex`)) + (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% group_by(`Crime Code Description 2`,`Victim Sex`) %>% count() %>% filter(n>2000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description 2`,n, fill = `Victim Sex`))
crime7 %>% group_by(`Crime Code Description`,`Victim Descent`) %>% count() %>% filter(n>5000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description`,n, fill = `Victim Descent`))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% group_by(`Crime Code Description 2`,`Victim Descent`) %>% count() %>% filter(n>5000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description 2`,n, fill = `Victim Descent`))
crime7 %>% group_by(`Crime Code Description`,`Area Name`) %>% count() %>% filter(n>2000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description`,n, fill = `Area Name`))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% group_by(`Crime Code Description 2`,`Area Name`) %>% count() %>% filter(n>2000) %>% ggplot() + geom_col(mapping = aes(`Crime Code Description 2`,n, fill = `Area Name`))
crime7 %>% count(`Victim Descent`,`Victim Sex`) %>% ggplot(mapping = aes(`Victim Descent`, `Victim Sex`)) + geom_tile(mapping = aes(fill = n))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% count(`Victim Descent`,`Area Name`) %>% ggplot(mapping = aes(`Victim Descent`, `Area Name`)) + geom_tile(mapping = aes(fill = n))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% count(`Victim Descent`,`Crime Code Description 2`) %>% ggplot(mapping = aes(`Victim Descent`, `Crime Code Description 2`)) + geom_tile(mapping = aes(fill = n))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% count(`Victim Sex`,`Crime Code Description 2`) %>% ggplot(mapping = aes(`Victim Sex`, `Crime Code Description 2`)) + geom_tile(mapping = aes(fill = n))
crime7 %>% count(`Victim Descent`,`Crime Code Description`) %>% ggplot(mapping = aes(`Victim Descent`, `Crime Code Description`)) + geom_tile(mapping = aes(fill = n))+ (theme(axis.text.x = element_text(angle = 90)))
crime7 %>% ggplot(mapping = aes(`Date Reported`, `Date Occurred`)) + geom_point()
locations <- crime2 %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(lng = ~Longitude, lat = ~Latitude,
             popup = ~as.character(`DR Number`), label = ~as.character(`DR Number`),
             clusterOptions = markerClusterOptions())
locations
library(raster)
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:tidyr':
##
## extract
library(rgdal)
library(sf)
setwd("~/Desktop/R Datasets/Kaggle Datasets/Shapefile/")
la_geo <- shapefile("LAPD_Reporting_Districts.shp")
qtm(la_geo)
### Plot of LA Shapefile
plot(la_geo)
geo.lookup("CA","Los Angeles")
## state state.name county county.name
## 1 6 California NA <NA>
## 2 6 California 37 Los Angeles County
la_tracts <- tracts(state = '6', county = 37, cb = TRUE)
api.key.install(key = "<YOUR_CENSUS_API_KEY>")  # your Census API key goes here
mytable <- acs.lookup(endyear = 2017, table.number = "B19013")
## Warning in acs.lookup(endyear = 2017, table.number = "B19013"): acs.lookup for endyear>2015: using 2015 variable codes to access 2017 data.
## (See ?acs.lookup for details)
## Warning in acs.lookup(endyear = 2017, table.number = "B19013"): temporarily downloading and using archived XML variable lookup files;
## since this is *much* slower, recommend running
## acs.tables.install()
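# As the warning above suggests, running acs.tables.install() once caches the
# variable lookup tables locally, which makes later acs.lookup() calls faster:
acs.tables.install()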
myvar <- mytable[1]
mygeo <- geo.make(state = "CA", county = "Los Angeles County", tract = "*")
mydata <- acs.fetch(endyear = 2017,geography = mygeo,span = 5,variable = myvar)
str(mydata)
## Formal class 'acs' [package "acs"] with 9 slots
## ..@ endyear : int 2017
## ..@ span : int 5
## ..@ geography :'data.frame': 2346 obs. of 4 variables:
## .. ..$ NAME : chr [1:2346] "Census Tract 5315.04, Los Angeles County, California" "Census Tract 5315.03, Los Angeles County, California" "Census Tract 2060.31, Los Angeles County, California" "Census Tract 1863.01, Los Angeles County, California" ...
## .. ..$ state : int [1:2346] 6 6 6 6 6 6 6 6 6 6 ...
## .. ..$ county: int [1:2346] 37 37 37 37 37 37 37 37 37 37 ...
## .. ..$ tract : chr [1:2346] "531504" "531503" "206031" "186301" ...
## ..@ acs.colnames : chr "B19013_001"
## ..@ modified : logi FALSE
## ..@ acs.units : Factor w/ 5 levels "count","dollars",..: 1
## ..@ currency.year : int 2017
## ..@ estimate : num [1:2346, 1] 41476 37022 87500 38672 69250 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:2346] "Census Tract 5315.04, Los Angeles County, California" "Census Tract 5315.03, Los Angeles County, California" "Census Tract 2060.31, Los Angeles County, California" "Census Tract 1863.01, Los Angeles County, California" ...
## .. .. ..$ : chr "B19013_001"
## ..@ standard.error: num [1:2346, 1] 3503 2592 11044 3831 6326 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:2346] "Census Tract 5315.04, Los Angeles County, California" "Census Tract 5315.03, Los Angeles County, California" "Census Tract 2060.31, Los Angeles County, California" "Census Tract 1863.01, Los Angeles County, California" ...
## .. .. ..$ : chr "B19013_001"
# Build the tract GEOIDs: state FIPS + county FIPS + tract code
# (the literal '0' pads the county code "37" out to "037")
acsgeoid <- paste0(as.character(mydata@geography$state),'0',
                   as.character(mydata@geography$county),
                   as.character(mydata@geography$tract))
# Create a dataframe
mydatadf <- data.frame(acsgeoid, mydata@estimate)
colnames(mydatadf)=c("GEOID", "medianincome")
mydatadf2 <- filter(mydatadf, medianincome>0)
head(mydatadf2)
## GEOID medianincome
## Census Tract 5315.04, Los Angeles County, California 6037531504 41476
## Census Tract 5315.03, Los Angeles County, California 6037531503 37022
## Census Tract 2060.31, Los Angeles County, California 6037206031 87500
## Census Tract 1863.01, Los Angeles County, California 6037186301 38672
## Census Tract 1831.04, Los Angeles County, California 6037183104 69250
## Census Tract 3118.01, Los Angeles County, California 6037311801 58199
# Join the data
class(la_tracts$GEOID)
## [1] "character"
mydatadf2$GEOID <- paste("0", mydatadf2$GEOID, sep = "")  # prepend the state FIPS leading zero ("6037..." -> "06037...") to match the shapefile GEOIDs
head(mydatadf2)
## GEOID medianincome
## Census Tract 5315.04, Los Angeles County, California 06037531504 41476
## Census Tract 5315.03, Los Angeles County, California 06037531503 37022
## Census Tract 2060.31, Los Angeles County, California 06037206031 87500
## Census Tract 1863.01, Los Angeles County, California 06037186301 38672
## Census Tract 1831.04, Los Angeles County, California 06037183104 69250
## Census Tract 3118.01, Los Angeles County, California 06037311801 58199
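# Equivalent one-step construction of the full 11-digit GEOID (2-digit state,
# 3-digit county, 6-digit tract) with explicit zero padding via stringr
# (acsgeoid_padded is an illustrative name, not used below):
acsgeoid_padded <- paste0(str_pad(mydata@geography$state, 2, pad = "0"),
                          str_pad(mydata@geography$county, 3, pad = "0"),
                          str_pad(mydata@geography$tract, 6, pad = "0"))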
mydatamerged <- geo_join(la_tracts, mydatadf2, "GEOID", "GEOID")
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
df <- mydatamerged
df
## Simple feature collection with 2344 features and 11 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -118.9449 ymin: 32.80146 xmax: -117.6464 ymax: 34.8233
## geographic CRS: NAD83
## First 10 features:
## STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD
## 1 06 037 405101 1400000US06037405101 06037405101 4051.01 CT
## 2 06 037 199800 1400000US06037199800 06037199800 1998 CT
## 3 06 037 291300 1400000US06037291300 06037291300 2913 CT
## 4 06 037 292000 1400000US06037292000 06037292000 2920 CT
## 5 06 037 604002 1400000US06037604002 06037604002 6040.02 CT
## 6 06 037 920035 1400000US06037920035 06037920035 9200.35 CT
## 7 06 037 402101 1400000US06037402101 06037402101 4021.01 CT
## 8 06 037 542602 1400000US06037542602 06037542602 5426.02 CT
## 9 06 037 503103 1400000US06037503103 06037503103 5031.03 CT
## 10 06 037 209510 1400000US06037209510 06037209510 2095.10 CT
## ALAND AWATER medianincome rank geometry
## 1 1121229 0 68207 1 MULTIPOLYGON (((-117.9693 3...
## 2 651258 0 31974 1 MULTIPOLYGON (((-118.2156 3...
## 3 2353751 45836 74375 1 MULTIPOLYGON (((-118.3091 3...
## 4 4000998 1856 36799 1 MULTIPOLYGON (((-118.3091 3...
## 5 650672 0 57738 1 MULTIPOLYGON (((-118.3613 3...
## 6 2466773 0 79250 1 MULTIPOLYGON (((-118.4722 3...
## 7 1189547 0 68636 1 MULTIPOLYGON (((-117.7406 3...
## 8 872253 0 57500 1 MULTIPOLYGON (((-118.2323 3...
## 9 1085381 3860 66612 1 MULTIPOLYGON (((-118.0464 3...
## 10 182484 0 29199 1 MULTIPOLYGON (((-118.2842 3...
# Create the popup
mypopup <- paste0("GEOID: ", df$GEOID, "<br>", "Median Income: $", round(df$medianincome,0))
# Set the palette
mypal <- colorNumeric(
palette = "YlOrRd",
domain = df$medianincome
)
mymap<-leaflet() %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(data = df,
fillColor = ~mypal(medianincome),
color = "#b2aeae", # you need to use hex colors
fillOpacity = 0.7,
weight = 1,
smoothFactor = 0.2,
popup = mypopup) %>%
addLegend(pal = mypal,
values = df$medianincome,
position = "bottomright",
title = "Median Income",
labFormat = labelFormat(prefix = "$"))
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'
mymap
map4 <- leaflet() %>%
addProviderTiles("CartoDB.Positron") %>%
  addMarkers(data = crime2, lng = ~Longitude, lat = ~Latitude,
             popup = ~as.character(`DR Number`), label = ~as.character(`DR Number`),
             group = "Crimes", clusterOptions = markerClusterOptions()) %>%
addPolygons(data = df,
fillColor = ~mypal(medianincome),
color = "#b2aeae", # you need to use hex colors
fillOpacity = 0.7,
weight = 1,
smoothFactor = 0.2,
popup = mypopup,
group ="Income") %>%
addLegend(pal = mypal,
values = df$medianincome,
position = "bottomright",
title = "Median Income",
labFormat = labelFormat(prefix = "$")) %>%
addLayersControl(overlayGroups = c("Crimes", "Income"), options = layersControlOptions(collapsed = FALSE))
## Assuming "Longitude" and "Latitude" are longitude and latitude, respectively
## Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
## Need '+proj=longlat +datum=WGS84'
map4
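# The datum warnings above arise because the polygons are stored in NAD83,
# while leaflet expects WGS84 (EPSG:4326); reprojecting once with sf would
# silence them on subsequent maps:
df <- st_transform(df, crs = 4326)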
setwd("~/Desktop/R Datasets/Kaggle Datasets/")
la <- read_csv("CrimeLA.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Address = col_character(),
## AreaName = col_character(),
## CrimeCodeDescription = col_character(),
## CrossStreet = col_character(),
## DateOccurred = col_character(),
## DateReported = col_character(),
## Location = col_character(),
## MOCodes = col_character(),
## PremiseDescription = col_character(),
## StatusCode = col_character(),
## StatusDescription = col_character(),
## VictimDescent = col_character(),
## VictimSex = col_character(),
## WeaponDescription = col_character()
## )
## See spec(...) for full column specifications.
head(la)
## # A tibble: 6 x 29
## X1 index Address AreaID AreaName CrimeCode CrimeCode1 CrimeCodeDescri…
## <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 0 0 13200 … 9 Van Nuys 330 330 BURGLARY FROM V…
## 2 1 1 14100 … 9 Van Nuys 745 745 VANDALISM - MIS…
## 3 2 2 7500 N… 9 Van Nuys 626 626 INTIMATE PARTNE…
## 4 3 3 15400 … 9 Van Nuys 626 626 INTIMATE PARTNE…
## 5 4 4 7400 … 10 West Va… 310 310 BURGLARY
## 6 5 5 VANOWEN 10 West Va… 740 740 VANDALISM - FEL…
## # … with 21 more variables: CrossStreet <chr>, DRNumber <dbl>,
## # DateOccurred <chr>, DateReported <chr>, Gini <dbl>, Income <dbl>,
## # Location <chr>, MOCodes <chr>, Pop <dbl>, PremiseCode <dbl>,
## # PremiseDescription <chr>, ReportingDistrict <dbl>, StatusCode <chr>,
## # StatusDescription <chr>, TimeOccurred <dbl>, Tract <dbl>, VictimAge <dbl>,
## # VictimDescent <chr>, VictimSex <chr>, WeaponDescription <chr>,
## # WeaponUsedCode <dbl>
la <- la %>% mutate(CrimeCodeDescription = as.factor(CrimeCodeDescription))
la1 <- la %>% mutate(ViolentCrime = case_when(str_detect(CrimeCodeDescription,"THEFT|BURGLARY|STOLEN|VANDALISM")~FALSE,str_detect(CrimeCodeDescription,"RAPE|SEX|SEXUAL|ASSAULT|BATTERY|SHOTS|WEAPON|WEAPONS|BOMB|KIDNAPPING|ROBBERY")~TRUE))
la1 %>% ggplot()+ geom_histogram(mapping = aes(Income))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
la1 %>% ggplot()+geom_point(mapping = aes(Pop,Income,color = VictimDescent,alpha = Gini))
la1 %>% ggplot()+geom_point(mapping = aes(Pop,Income,color = VictimSex,alpha = Gini))
la1 %>% ggplot()+geom_point(mapping = aes(Pop,Income,color = ViolentCrime,alpha = Gini))
la1 %>% ggplot()+geom_point(mapping = aes(VictimAge,Income,color = ViolentCrime,alpha = Gini))
## Warning: Removed 145 rows containing missing values (geom_point).
la1 %>% ggplot()+geom_point(mapping = aes(VictimAge,Income,color = VictimDescent,alpha = Gini))
## Warning: Removed 145 rows containing missing values (geom_point).
table(la1$ViolentCrime)
##
## FALSE TRUE
## 1497 653
la2 <- la1 %>% filter(!is.na(ViolentCrime))
# Oversample the minority class (violent crimes): weight is 0 for non-violent
# rows, so only violent rows are drawn, with replacement
oversample <- sample_n(la2, 800, replace = TRUE, weight = (ViolentCrime == TRUE))
la3 <- rbind(la2,oversample)
table(la3$ViolentCrime)
##
## FALSE TRUE
## 1497 1453
input_ones <-la3[which(la3$ViolentCrime == TRUE),]
input_zeroes <- la3[which(la3$ViolentCrime == FALSE),]
set.seed(100)
input_ones_training_rows <- sample(1:nrow(input_ones),0.70*nrow(input_ones))
input_zeroes_training_rows <- sample(1:nrow(input_zeroes),0.70*nrow(input_zeroes))
training_ones <- input_ones[input_ones_training_rows,]
training_zeroes <- input_zeroes[input_zeroes_training_rows,]
trainingData <- rbind(training_ones,training_zeroes)
test_ones <- input_ones[-input_ones_training_rows,]
test_zeroes <- input_zeroes[-input_zeroes_training_rows,]
testData <- rbind(test_ones,test_zeroes)
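# Quick sanity check (output not shown): the 70/30 split should roughly
# preserve the oversampled class balance in both partitions
prop.table(table(trainingData$ViolentCrime))
prop.table(table(testData$ViolentCrime))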
model1 <- glm(data = la1, ViolentCrime~VictimAge + Income + VictimDescent + Pop, family = binomial)
summary(model1)
##
## Call:
## glm(formula = ViolentCrime ~ VictimAge + Income + VictimDescent +
## Pop, family = binomial, data = la1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4877 -0.9641 -0.6308 1.1628 2.7566
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.186e-02 4.457e-01 -0.094 0.925170
## VictimAge -2.328e-02 3.676e-03 -6.333 2.41e-10 ***
## Income -1.232e-05 2.174e-06 -5.665 1.47e-08 ***
## VictimDescentB 1.507e+00 3.822e-01 3.942 8.07e-05 ***
## VictimDescentC -1.291e+01 8.827e+02 -0.015 0.988331
## VictimDescentH 1.313e+00 3.747e-01 3.503 0.000459 ***
## VictimDescentI -1.228e+01 6.128e+02 -0.020 0.984005
## VictimDescentK 1.545e+00 6.561e-01 2.354 0.018557 *
## VictimDescentO 4.749e-01 4.131e-01 1.150 0.250253
## VictimDescentP -1.224e+01 8.827e+02 -0.014 0.988939
## VictimDescentW 8.901e-01 3.825e-01 2.327 0.019966 *
## VictimDescentX -1.738e-01 6.177e-01 -0.281 0.778432
## VictimDescentZ -1.255e+01 8.827e+02 -0.014 0.988652
## Pop -4.155e-04 6.662e-04 -0.624 0.532809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2286.2 on 1763 degrees of freedom
## Residual deviance: 2098.4 on 1750 degrees of freedom
## (510 observations deleted due to missingness)
## AIC: 2126.4
##
## Number of Fisher Scoring iterations: 13
options(scipen = 999)
coeffs <- coef(model1)
odds <- exp(coeffs)
odds
## (Intercept) VictimAge Income VictimDescentB VictimDescentC
## 0.959001448540 0.976992315713 0.999987682769 4.512424719761 0.000002471350
## VictimDescentH VictimDescentI VictimDescentK VictimDescentO VictimDescentP
## 3.716375422953 0.000004620623 4.686752133490 1.607914528946 0.000004846351
## VictimDescentW VictimDescentX VictimDescentZ Pop
## 2.435466903700 0.840464585281 0.000003527452 0.999584546477
model2 <- glm(data = trainingData, ViolentCrime~VictimAge + Pop + Income + VictimDescent, family = binomial)
summary(model2)
##
## Call:
## glm(formula = ViolentCrime ~ VictimAge + Pop + Income + VictimDescent,
## family = binomial, data = trainingData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8872 -1.1427 0.7585 0.9850 2.3993
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.928175834 0.401136011 2.314 0.0207 *
## VictimAge -0.022925781 0.003508786 -6.534 0.0000000000641 ***
## Pop -0.001598669 0.000651113 -2.455 0.0141 *
## Income -0.000011751 0.000001959 -5.999 0.0000000019858 ***
## VictimDescentB 1.657783332 0.334273640 4.959 0.0000007072578 ***
## VictimDescentC -12.651332678 535.411266895 -0.024 0.9811
## VictimDescentH 1.430909988 0.324088366 4.415 0.0000100923793 ***
## VictimDescentI -12.320506335 535.411264120 -0.023 0.9816
## VictimDescentK 1.658470753 0.688781301 2.408 0.0160 *
## VictimDescentO 0.689914094 0.359218462 1.921 0.0548 .
## VictimDescentP -11.756894862 535.411271658 -0.022 0.9825
## VictimDescentW 1.003831751 0.330745848 3.035 0.0024 **
## VictimDescentX 0.115125066 0.547274923 0.210 0.8334
## VictimDescentZ -12.128448415 535.411265148 -0.023 0.9819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2440.7 on 1769 degrees of freedom
## Residual deviance: 2227.6 on 1756 degrees of freedom
## (294 observations deleted due to missingness)
## AIC: 2255.6
##
## Number of Fisher Scoring iterations: 12
blr_rsq_mcfadden(model1)
## [1] 0.2052013
blr_rsq_mcfadden_adj(model1)
## [1] 0.1945957
blr_rsq_mcfadden(model2)
## [1] 0.221362
blr_rsq_mcfadden_adj(model2)
## [1] 0.2115748
blr_confusion_matrix(model1,cutoff = 0.5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## 0 1002 425
## 1 143 194
##
##
## Accuracy : 0.6780
## No Information Rate : 0.6491
##
## Kappa : 0.2106
##
## McNemars's Test P-Value : 0.0000
##
## Sensitivity : 0.3134
## Specificity : 0.8751
## Pos Pred Value : 0.5757
## Neg Pred Value : 0.7022
## Prevalence : 0.3509
## Detection Rate : 0.1100
## Detection Prevalence : 0.1910
## Balanced Accuracy : 0.5943
## Precision : 0.5757
## Recall : 0.3134
##
## 'Positive' Class : 1
blr_confusion_matrix(model2,cutoff = 0.5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## 0 401 206
## 1 408 755
##
##
## Accuracy : 0.6531
## No Information Rate : 0.4571
##
## Kappa : 0.2870
##
## McNemars's Test P-Value : 0.0000
##
## Sensitivity : 0.7856
## Specificity : 0.4957
## Pos Pred Value : 0.6492
## Neg Pred Value : 0.6606
## Prevalence : 0.5429
## Detection Rate : 0.4266
## Detection Prevalence : 0.6571
## Balanced Accuracy : 0.6407
## Precision : 0.6492
## Recall : 0.7856
##
## 'Positive' Class : 1
k <- blr_gains_table(model1)
blr_roc_curve(k)
k2 <- blr_gains_table(model2)
blr_roc_curve(k2)
k
## decile total 1 0 ks tp tn fp fn sensitivity specificity
## 1 1 176 102 74 10.01531 102 1071 74 517 16.47819 93.53712
## 2 2 176 98 78 19.03507 200 993 152 419 32.31018 86.72489
## 3 3 176 87 89 25.31707 287 904 241 332 46.36511 78.95197
## 4 4 176 64 112 25.87467 351 792 353 268 56.70436 69.17031
## 5 5 176 64 112 26.43227 415 680 465 204 67.04362 59.38865
## 6 6 176 64 112 26.98986 479 568 577 140 77.38288 49.60699
## 7 7 176 47 129 23.31638 526 439 706 93 84.97577 38.34061
## 8 8 176 45 131 19.14512 571 308 837 48 92.24556 26.89956
## 9 9 176 31 145 11.48944 602 163 982 17 97.25363 14.23581
## 10 10 180 17 163 0.00000 619 0 1145 0 100.00000 0.00000
## accuracy
## 1 66.49660
## 2 67.63039
## 3 67.51701
## 4 64.79592
## 5 62.07483
## 6 59.35374
## 7 54.70522
## 8 49.82993
## 9 43.36735
## 10 35.09070
require(pracma)
## Loading required package: pracma
##
## Attaching package: 'pracma'
## The following object is masked from 'package:purrr':
##
## cross
sensitivity1 = c(16.5,32.3,46.4,56.7,67.0,77.4,85.0,92.2,97.3,100.0)  # per-decile sensitivity (%) from the model1 gains table above
AUC = trapz(1:10,sensitivity1)/1000
AUC
## [1] 0.61255
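# The decile-based trapezoid above is only a rough approximation. A more
# conventional trapezoidal ROC AUC integrates sensitivity over the false
# positive rate (1 - specificity); using the model1 gains table printed above:
specificity1 = c(93.5, 86.7, 79.0, 69.2, 59.4, 49.6, 38.3, 26.9, 14.2, 0)
trapz(c(0, (100 - specificity1)/100), c(0, sensitivity1/100))  # ~0.69 for model1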
k2
## decile total 1 0 ks tp tn fp fn sensitivity specificity
## 1 1 177 133 44 8.400937 133 765 44 828 13.83975 94.56119
## 2 2 177 131 46 16.346538 264 719 90 697 27.47138 88.87515
## 3 3 177 133 44 24.747475 397 675 134 564 41.31113 83.43634
## 4 4 177 109 68 27.684388 506 607 202 455 52.65349 75.03090
## 5 5 177 101 76 28.799960 607 531 278 354 63.16337 65.63659
## 6 6 177 92 85 27.866522 699 446 363 262 72.73673 55.12979
## 7 7 177 86 91 25.567079 785 355 454 176 81.68574 43.88133
## 8 8 177 74 103 20.535624 859 252 557 102 89.38606 31.14957
## 9 9 177 65 112 13.455159 924 140 669 37 96.14984 17.30532
## 10 10 177 37 140 0.000000 961 0 809 0 100.00000 0.00000
## accuracy
## 1 50.73446
## 2 55.53672
## 3 60.56497
## 4 62.88136
## 5 64.29379
## 6 64.68927
## 7 64.40678
## 8 62.76836
## 9 60.11299
## 10 54.29379
race_table <- la1 %>% group_by(VictimDescent) %>% dplyr::summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)
race_table
## # A tibble: 12 x 2
## VictimDescent count
## <chr> <int>
## 1 A 64
## 2 B 376
## 3 C 1
## 4 H 739
## 5 I 2
## 6 K 15
## 7 O 216
## 8 P 1
## 9 W 510
## 10 X 94
## 11 Z 1
## 12 <NA> 255
## Create Probabilities Based on Predictions Using Model
newdata = with(la1, expand.grid(VictimDescent=unique(VictimDescent),
VictimAge=seq(from = 0,to = 100,by = 0.1),
Pop = quantile(Pop),
Income = quantile(Income)))
newdata$prob = predict(model1, newdata, type="response")
newdata %>% filter(VictimDescent %in% c("H","B","W","A")) %>% ggplot(aes(VictimAge,prob,color = VictimDescent)) + geom_line() + facet_wrap(~Income) + scale_color_discrete() + ylab("Probability of Violent Crime") + xlab("Victim Age") + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("Probability of Experiencing Violent Crime by Age, Income, and Race") + theme_clean()
newdata %>% filter(VictimDescent %in% c("H","B","W","A")) %>% ggplot(aes(VictimAge,prob,color = VictimDescent)) + geom_line() + facet_grid(Income~Pop)+ ylab("Probability of Violent Crime") + xlab("Victim Age") + theme(plot.title = element_text(hjust = 0.5)) + ggtitle("Probability of Experiencing Violent Crime by Age, Income, Population, and Race") + theme_minimal()
The dataset analyzed above was obtained from Kaggle, though it originates from the City of LA's open data portal, where a more recent version is available: https://data.lacity.org/Public-Safety/Crime-Data-from-2010-to-2019/63jg-8b9z/data
The dataset contains records of nearly 1.6 million crimes in LA from 2010 to 2017. In addition to the date of the crime, the type of crime, and the type of premise where the crime occurred, the dataset includes the race/descent, age, and gender of the victim, as well as the latitude and longitude coordinates of the crime. The primary crimes dataset was accompanied by a shapefile of the city of LA and a list of MO (method of operation) codes and their descriptions. On top of these, census data on median income, population, and Gini coefficient per census tract in LA from 2012 to 2017 was mined and incorporated into the analysis. I cleaned the dataset by eliminating all rows containing NAs, which reduced it to a much more manageable, though still large, size of just under 130,000 observations. In the initial stage of the analysis, I then converted the dates from character strings to date variables, extracted separate latitude and longitude columns from the combined location column, recoded the descent abbreviations into full names, and collapsed the crime types into more general categories. I chose this dataset because I am very interested in the relations between race, gender, income, crime, and policing, owing to my background in sociological/anthropological subjects. I chose this dataset in particular because I am interested in whether race, gender, and income vary by crime type and, specifically, how these variables affect the likelihood of experiencing a violent crime as opposed to a property crime (i.e., a non-violent crime).
My background research for this topic consisted of three Bureau of Justice criminal victimization surveys, from 2010, 2013, and 2015, and an article based on the same kind of LA crime data used for this analysis. According to the 2010 survey, while the share of the population increased with age, the number of violent crimes per 1,000 people generally decreased as the age range increased. For those aged 12-14, the rate was 27.5; for those aged 18-20, it was 33.9; for those aged 25-34, 18.8; for those aged 35-49, 12.6; for those aged 50-64, 10.9; and for those aged 65 or older, 2.4 (BOJ, 2010, p. 10). As for the rate of violent victimization by race, it was about 13.6/1000 for White people, 20.8/1000 for Black people, 15.6/1000 for Hispanic people, and 6.3/1000 for Asian people (BOJ, 2010, p. 11). Thus, when viewed as a rate rather than a proportion, violent victimization was much higher for Black people even though they constituted a smaller percentage of the population than White people. Additionally, Asians had the lowest rate of violent victimization across races. Unfortunately, the 2010 survey reported no data relating violent crime to income, but its property crime data showed that the rate of property crimes decreased as income increased: the rate for households with incomes under 7,500 dollars was 11.2; for households earning between 7,500 and 15,000 dollars, 9.2; for households earning between 15,000 and 25,000 dollars, 7.3; and so on (BOJ, 2010, p. 12).

According to the 2013 survey, the trend by age was evident again, though rates rose across all age ranges. For those aged 12-17, the rate was 52.1; for those aged 18-24, 33.8; for those aged 25-34, 29.6; for those aged 35-49, 20.3; for those aged 50-64, 18.7; and for those aged 65 or older, 5.4 (BOJ, 2013, p. 9). By contrast, the rate of violent victimization by race was about the same for White, Black, and Hispanic people: roughly 22.2/1000 for White people, 25.1/1000 for Black people, 24.8/1000 for Hispanic people, and 7/1000 for Asian people (BOJ, 2013, p. 9). Thus, violent victimization was only slightly higher for Black people, and Asians again had the lowest rate across races. As in 2010, there was no data relating violent crime to income in 2013, and this time there was no household income data even for property crimes.

This changed in 2015. The relation between violent crime and income in 2015 mirrored that of property crimes in 2010 and barely changed from 2014: in both years, the rate of violent crimes decreased as income increased. The rate for households with incomes under 10,000 dollars was 39.7 in 2014 and 39.2 in 2015; for households earning between 10,000 and 15,000 dollars, 36 in 2014 and 27.7 in 2015; for households earning between 15,000 and 25,000 dollars, 25.3 in 2014 and 25.9 in 2015; for households earning between 25,000 and 35,000 dollars, 19.7 in 2014 and 16.3 in 2015; and for households earning above 75,000 dollars, 15.1 in 2014 and 12.8 in 2015 (BOJ, 2015, p. 9).
As with the prior two BOJ survey reports, the rate of violent crimes steadily decreased as the age brackets increased in both 2014 and 2015. For example, for those aged 12-17, the rate of violent crimes was 30.1 in 2014 and 31.3 in 2015; for those aged 18-24, the rate of violent crimes was 26.8 in 2014 and 25.1 in 2015; for those aged 35-49, the rate of violent crimes was 21.6 in 2014 and 22.6 in 2015; for those aged 50-64, the rate of violent crimes was 17.9 in 2014 and 14.2 in 2015; and, finally, for those aged 65 and older, the rate was just 3.1 in 2014 and 5.2 in 2015 (BOJ, 2015, p. 9). Meanwhile, with regard to race, the rate for White people was 20.3 in 2014 and 17.4 in 2015, while the rate for Black people was 22.5 in 2014 and 22.6 in 2015, and the rate for Hispanic people was 16.2 in 2014 and 16.8 in 2015 (BOJ, 2015, p. 9). So the only real change from 2013 to 2015 in terms of race was that the number of violent crimes for every 1000 Hispanic people dropped moderately to the 2010 level.
The article, “From Bad to Worse: How Changing Inequality in Nearby Areas Impacts Local Crime”, uses an innovative form of spatial mapping that focuses on overlapping regions rather than strict boundaries, called egohoods, which are essentially concentric circles of a certain radius drawn around single blocks (130). That is, in lieu of an analysis based on census tracts, the authors conduct an analysis based on egohoods with a buffer of 1/4 mile (136). In their analysis of LA crime data from 2000 to 2010, they make a few significant findings. The first is that “Egohoods experiencing an increase in average household income simultaneously experience decreases in aggravated assaults, robberies, and burglaries over the decade, controlling for the other measures in the models” (141). This largely agreed with the aggregate BOJ data on violent crimes and income; namely, that an increase in income corresponds to a decrease in violent crime. Next, the authors examined the relation between crime and race: “The findings also reveal that changes in the racial-ethnic composition of the egohood are related to changes in the crime rate…neighborhoods that experience an increase in percentage black and a simultaneous decrease in percentage white or other race are more likely to experience an increase in violent crime but a decrease in property crime” (142). This also agreed to some extent with the BOJ data, since Black people had a higher rate of violent crime victimization than White people, though the gap narrowed over time. These findings, combined with the BOJ data, motivated me to undertake an analysis of violent crime on the basis of income and race. Specifically, I wanted to determine whether the likelihood of violent crime varied by census tract (in addition to egohoods) and by race, as well as by other variables (i.e., population, Gini coefficient, sex, age, etc.).
After the data cleaning process, I made a series of boxplots and bar charts to explore how race, sex, and the type of crime were related. Then, following the NYC Asbestos Complaints tutorial developed by Lucy Murray, I created a map showing the count of crimes in LA by cluster. I next loaded a shapefile of LA, mined income data from 2012 to 2017 in LA by tract, and merged this data with the shapefile data by GEOID. I then mapped the median income by census tract in LA, overlaid it with the crime count clusters, and found that lower-income regions appeared to have higher counts of crime than higher-income regions. I found this quite interesting, so I decided to explore the data further, which required some additional cleaning. Since I was unable to successfully use batch geocoding in R, I had to use batch geocoding in Python to match the latitude and longitude coordinates with particular census tracts for all observations without NAs (i.e., for the dataframe of roughly 130,000 observations used in the analysis up to this point). After doing so, only 2,274 of the 130,000 or so coordinate pairs returned matches. I then left-joined the tract information for each of the coordinates with median income, population, and Gini coefficient per tract, wrote this dataset out to a CSV file, read it back into R, and created a logical variable called ViolentCrime that differentiated violent crimes from property crimes according to the Bureau of Justice distinction between the two. I then made a series of plots to investigate the relations between these three census variables, age, and whether a crime was violent or not. It looked like age, income, population, and race all might be influential, so I decided to see whether the binary type of crime (violent vs. property) could be modeled as a logistic function of age, income, population, and race.
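For reference, the tract matching could also be sketched natively in R with a spatial join via sf. The following is a minimal sketch (not the Python geocoder actually used), assuming the crime2 coordinates and the la_tracts polygons loaded above:

crime_sf <- st_as_sf(crime2, coords = c("Longitude", "Latitude"), crs = 4326)
tracts_sf <- st_transform(st_as_sf(la_tracts), crs = 4326)
crime_tracts <- st_join(crime_sf, tracts_sf["GEOID"], join = st_within)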
I used two models: one that corrected for class bias by oversampling from the minority class (i.e., violent crimes) and one that did not. The models ended up being very similar but had some key differences. Both had adjusted McFadden pseudo R-squared values of about 0.2; since a McFadden pseudo R-squared between 0.2 and 0.4 is conventionally considered an excellent fit, this alone suggested that the models were reasonable predictors. Additionally, the two models largely shared the same statistically significant coefficients, though those coefficients were more significant in the second model. As can be seen from either model summary, the races/descents with the largest counts (Black, Hispanic, and White) were all statistically significant, while races accounting for fewer observations were not. The coefficients for income and age were also statistically significant. The only variable that was not significant in the first model was Population (it did reach significance in the oversampled model). The exponentiated coefficients provide the odds ratios for each variable. For each additional year of age, the odds of suffering a violent crime are multiplied by about 0.977, i.e., they decrease by roughly 2.3% per year. The same holds for income: for every additional dollar of median income, the odds are multiplied by about 0.99999, which works out to roughly a 1.2% decrease per 1,000 dollars. The odds ratios for Korean, Black, Hispanic, and White people (relative to the reference category, Other Asian) were 4.69, 4.51, 3.72, and 2.44 respectively, meaning the odds of experiencing a violent crime were highest for Koreans, second highest for Black people, third highest for Latinos, and fourth highest for White people. Interestingly, the estimated odds were almost 0 for certain races, such as Chinese people, though those categories contained only one or two observations each, so their estimates are unreliable (note their enormous standard errors in the model summaries).
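These percentage interpretations can be checked directly against the exponentiated coefficients of model1:

(exp(coef(model1)["VictimAge"]) - 1) * 100      # about -2.3% odds per additional year of age
(exp(coef(model1)["Income"] * 1000) - 1) * 100  # about -1.2% odds per additional $1,000 of income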
The effect of oversampling is quite evident from the confusion matrix output. The first model had a sensitivity of 0.31 and a specificity of 0.88, while the second model had a sensitivity of 0.79 and a specificity of 0.50. In other words, the true positive rate was much higher for the second model (the one based on class-balanced training and testing data) and the true negative rate was much higher for the first; there was a trade-off of specificity for sensitivity. The balanced accuracy is just the mean of the specificity and sensitivity, so it was lower for the first model, at (0.31 + 0.88)/2 ≈ 59.4%, and higher for the second, at (0.79 + 0.50)/2 ≈ 64.1%, which is due to the oversampling. More importantly, however, the AUC of the ROC curve of the first model was about 67% while that of the second model was about 63%, so the first model was actually the better classifier of the two. The plots of the model based on the abridged data seem to agree with the final map I created, as well as with the research findings of Dr. Hipp and Dr. Kubrin, since they also suggest that the likelihood of experiencing a crime is higher for individuals with lower incomes. Moreover, the model agreed almost entirely with the Bureau of Justice survey data: its plots show that the odds of experiencing a violent crime decrease as income and age increase and that the odds are significantly lower for Asians (with the exception of Koreans) compared to other races. The odds in the plots also follow the general order of the violent crime rates in the BOJ data above (i.e., highest for Black people, second highest for Latinos, third highest for White people, etc.). The model likewise suggested the finding that Dr. Hipp and Dr. Kubrin made with respect to race, namely that an increase in the number of Black people in a given neighborhood tends to be accompanied by an increase in the number of violent crimes there. Finally, as the model summary suggested, the last plot shows that population is not as strong a determinant of the odds of experiencing a violent crime as some of the other variables. In effect, although I used census tracts as opposed to egohoods and data from 2010 to 2017, like Dr. Hipp and Dr. Kubrin I found that the odds of being the victim of a violent crime depend not only on income and race, but also on age and population.
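The balanced-accuracy figures can be recovered directly from the sensitivity and specificity values in the confusion matrices above:

mean(c(0.3134, 0.8751))  # model1 balanced accuracy, ~0.594
mean(c(0.7856, 0.4957))  # model2 balanced accuracy, ~0.641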
Hipp, John R., & Kubrin, Charis E. (2017). From Bad to Worse: How Changing Inequality in Nearby Areas Impacts Local Crime. RSF: The Russell Sage Foundation Journal of the Social Sciences, 3(2), 129-151. https://doi.org/10.7758/RSF.2017.3.2.06
Truman, Jennifer L., & Morgan, Rachel E. (2016). Criminal Victimization, 2015. Bureau of Justice Statistics. Truman, Jennifer L., & Langton, Lynn. (2014). Criminal Victimization, 2013. Bureau of Justice Statistics. Truman, Jennifer L. (2011). Criminal Victimization, 2010. Bureau of Justice Statistics.