library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaflet)
library(stringr)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-4, (SVN revision 833)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
## Path to GDAL shared files: C:/Users/Kim/Documents/R/win-library/3.6/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: C:/Users/Kim/Documents/R/win-library/3.6/rgdal/proj
## Linking to sp version: 1.3-1
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(forecast)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
library(DT)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl,
## flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
## modify, prepend, splice
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(DataExplorer)
City of Los Angeles
Open Data: https://data.lacity.org/browse
Selected dataset : Arrest Data from 2010 to Present Description: This dataset reflects arrest incidents in the City of Los Angeles dating back to 2010. This data is transcribed from original arrest reports that are typed on paper and therefore there may be some inaccuracies within the data. Some location fields with missing data are noted as (0.0000°, 0.0000°). Address fields are only provided to the nearest hundred block in order to maintain privacy. This data is as accurate as the data in the database. Please note questions or concerns in the comments.
LAdata = read.csv("C:/Users/Kim/Desktop/Stuff/Random Data Sets/Arrest_Data_from_2010_to_Present.csv")
head(LAdata)
# Keep one copy as unmodified original
rawdata<- LAdata
# Let us choose a year at a time
LAdata$Date <- mdy(LAdata$Arrest.Date)
LAdata$Year <- year(LAdata$Date)
LAdata$Month<- month(LAdata$Date)
LAdata$weekday <- wday(LAdata$Date)
LA_2017 <- filter(LAdata,Year==2017)
str(LA_2017)
## 'data.frame': 107677 obs. of 21 variables:
## $ Report.ID : int 170601948 5192214 182004174 5191986 181504012 5192051 181404080 5192422 182004029 5192098 ...
## $ Arrest.Date : Factor w/ 3530 levels "01/01/2010","01/01/2011",..: 3529 3529 3529 3529 3529 3529 3529 3529 3529 3529 ...
## $ Time : int 2330 1600 5 415 1200 1245 1430 2235 1210 1400 ...
## $ Area.ID : int 6 14 20 18 15 9 14 12 20 9 ...
## $ Area.Name : Factor w/ 21 levels "77th Street",..: 7 13 12 15 9 18 13 1 12 18 ...
## $ Reporting.District : int 646 1494 2026 1842 1566 914 1431 1232 2097 926 ...
## $ Age : int 39 44 17 26 50 48 33 28 35 30 ...
## $ Sex.Code : Factor w/ 2 levels "F","M": 2 2 2 1 2 2 2 1 2 2 ...
## $ Descent.Code : Factor w/ 19 levels "A","B","C","D",..: 2 17 7 7 2 7 7 7 1 7 ...
## $ Charge.Group.Code : int NA 25 24 22 NA 12 17 22 24 3 ...
## $ Charge.Group.Description: Factor w/ 28 levels "","Against Family/Child",..: 1 9 16 7 1 28 15 7 16 25 ...
## $ Arrest.Type.Code : Factor w/ 5 levels "D","F","I","M",..: 4 2 4 4 3 2 3 4 4 2 ...
## $ Charge : Factor w/ 8974 levels "(1)11357BHS",..: 2719 5745 5313 2249 4538 3470 2719 2249 2318 1872 ...
## $ Charge.Description : Factor w/ 2352 levels "","< AGE 21 DRIVING VEH W/BLOOD-ALCOHOL .01+",..: 1 1162 522 668 1 1612 627 668 644 1904 ...
## $ Address : Factor w/ 83767 levels "00 17TH AV",..: 74662 1241 40768 6205 79779 16525 78427 34650 79563 75397 ...
## $ Cross.Street : Factor w/ 20784 levels "","0 ST",..: 20260 1 19083 13308 10940 1 5053 999 19832 11198 ...
## $ Location : Factor w/ 54088 levels "(0.0, 0.0)","(33.3427, -118.3258)",..: 30196 4844 25104 4075 34751 40863 9370 9250 18958 39807 ...
## $ Date : Date, format: "2017-12-31" "2017-12-31" ...
## $ Year : num 2017 2017 2017 2017 2017 ...
## $ Month : num 12 12 12 12 12 12 12 12 12 12 ...
## $ weekday : num 1 1 1 1 1 1 1 1 1 1 ...
plot_intro(LA_2017)
plot_missing(LA_2017)
** Most of the missing data are in the Charge.Group.Code, which is only an accounting designation of the incident.
Mapping
Separate the Longitude & Latitude
latlong = gsub('\\(','',LA_2017$Location)
latlong = gsub('\\)','',latlong)
latlong = str_split(latlong,",")
latlong = do.call(rbind.data.frame, latlong)
colnames(latlong) = c("latitude","longitude")
LA_2017$latitude = as.numeric(as.character(latlong$latitude))
LA_2017$longitude = as.numeric(as.character(latlong$longitude))
rm(latlong)
LA_2017 <- LA_2017 %>%
select(-Location)
Visualize Month of crime
LA_2017$Month = month(LA_2017$Date,label = TRUE)
LA_bymonth <-LA_2017 %>%
group_by(Month) %>%
summarise(CountIncidents = n())
ggplot(data=LA_bymonth,aes(x = Month,y = CountIncidents)) +
geom_bar(stat='identity',colour="white", fill ="orange") +
geom_text(aes(x = Month, y = 1, label = paste0("(",CountIncidents,")",sep="")),
hjust=.5, vjust=-0.5, size = 3.5, colour = 'black',
fontface = 'bold') +
labs(x = 'Month Of Crime', y = 'Count of Incidents',
title = 'Number of Incidents in 2017') +
theme_bw()
Visualize Day of Crime
LA_bydate <- LA_2017 %>%
group_by(Date) %>%
summarise(Daily_Total=n())
plot(LA_bydate, type = "l",col="red",main="Daily Incident Count in 2017")
LA_2017$Weekday = wday(LA_2017$Date,label = TRUE)
LA_2017 %>%
group_by(Weekday) %>%
summarise(CountIncidents = n()) %>%
ggplot(aes(x = Weekday,y = CountIncidents)) +
geom_bar(stat='identity',colour="white", fill ="lightblue") +
geom_text(aes(x = Weekday, y = 1, label = paste0("(",CountIncidents,")",sep="")),
hjust=0.5, vjust=-.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Day Of Crime Occurrence', y = 'Number of Incidents',
title = 'Daily Count in 2017') +
theme_bw()
Sex of Crime victims
LA_2017 %>%
filter(!is.na(Sex.Code)) %>%
group_by(Sex.Code) %>%
tally() %>%
ungroup() %>%
mutate(Gender = reorder(Sex.Code,n)) %>%
ggplot(aes(x = Sex.Code,y = n,fill=Sex.Code)) +
geom_bar(stat='identity',colour="white") +
geom_text(aes(x = Sex.Code, y = 1, label = paste0("(",n,")",sep="")),
hjust=0.5, vjust=-.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Sex', y = 'Count of Incidents',
title = 'Count of Incidents by Sex in 2017') +
theme_bw()
LA_2017 %>%
filter(!is.na(Sex.Code)) %>%
group_by(Sex.Code,Charge.Description) %>%
tally() %>%
ungroup() %>%
mutate(Sex.Code = reorder(Sex.Code,n)) %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(x = Sex.Code,y = n, fill =Charge.Description)) +
geom_bar(stat='identity') +
labs(x = 'Sex and Crime Description', y = 'Count of Incidents',
title = 'Incidents and Sex by Crime Description in 2017') +
coord_flip() +
theme_bw() + theme(legend.position="top")
Crime with age 55 year old + Crimes
LA_2017 %>%
filter(!is.na(Age)) %>%
filter(Age >= 55) %>%
group_by(Charge.Description,Sex.Code) %>%
tally() %>%
ungroup() %>%
mutate(Sex.Code = reorder(Sex.Code,n)) %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(x = Sex.Code,y = n,fill = Charge.Description)) +
geom_bar(stat='identity') +
labs(x = 'Sex and Crime Description', y = 'Count of Incidents',
title = 'Incidents and Sex by Crime Description for Persons 55 and above in 2017') +
coord_flip() +
theme_bw() + theme(legend.position="top")
Crimes by Areas
LA_2017 %>%
filter(!is.na(Area.Name)) %>%
group_by(Area.Name) %>%
tally() %>%
ungroup() %>%
mutate(Area = reorder(Area.Name,n)) %>%
arrange(desc(n)) %>%
head(20) %>%
ggplot(aes(x = Area,y = n,fill=Area)) +
geom_bar(stat='identity',colour="white") +
geom_text(aes(x = Area, y = 1, label = paste0("(",n,")",sep="")),
hjust=0, vjust=.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Area', y = 'Count',
title = 'Count of Incidents by Area in 2017') +
coord_flip()+
theme_bw()
Mapping of homicide incindents
library(highcharter)
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(leaflet)
homicide_2017 <- filter(LA_2017,Charge.Group.Description=="Homicide")
hcmap("countries/us/us-ca-all") %>%
hc_title(text = "California")
homicide_2017 %>%
leaflet() %>%
addTiles() %>%
addMarkers(lat=homicide_2017$latitude, lng=homicide_2017$longitude,
clusterOptions = markerClusterOptions(),
popup= paste(homicide_2017$Charge.Description,
"<br><strong>Date: </strong>", homicide_2017$Date,
"<br><strong>Area: </strong>", homicide_2017$Area.Name,
"<br><strong>Description: </strong>", homicide_2017$Charge.Group.Description
))
h_bydate <- homicide_2017 %>%
group_by(Date) %>%
summarise(Daily_Total=n())
plot(h_bydate, type = "l",col="darkred",main="Daily Homicide Count in 2017")
homicide_2017 %>%
filter(!is.na(Area.Name)) %>%
group_by(Area.Name) %>%
tally() %>%
ungroup() %>%
mutate(Area = reorder(Area.Name,n)) %>%
arrange(desc(n)) %>%
head(20) %>%
ggplot(aes(x = Area,y = n,fill=Area)) +
geom_bar(stat='identity',colour="white") +
geom_text(aes(x = Area, y = 1, label = paste0("(",n,")",sep="")),
hjust=0, vjust=.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Area', y = 'Count',
title = 'Top 20 Count of Homicide Areas in 2017') +
coord_flip()+
theme_bw()