# Packages the analysis depends on.
required_packages <- c(
  "tidyverse", "ggplot2", "ggpmisc", "wordcloud", "tidytext", "leaflet"
)
# Attach every package, installing it first when it is not available.
for (p in required_packages) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p, dependencies = TRUE)
    # require() returned FALSE, so nothing is attached yet. Load the fresh
    # install; library() stops with an error if the installation failed.
    library(p, character.only = TRUE)
  }
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: ggpmisc
## Loading required package: ggpp
##
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## Loading required package: tidytext
## Loading required package: leaflet
# Load the incident records and the offense-code table from the working directory
crime <- read_csv(file = "crime.csv", show_col_types = FALSE)
offense_code <- read_csv(file = "offense_codes.csv", show_col_types = FALSE)
Exploratory Data Analysis
#View(crime)
#View(offense_code)
# Per-column summary of the incident data: quartiles and NA counts for the
# numeric columns, length/class/mode for the character columns
summary(crime)
## INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION
## Length:319073 Min. : 111 Length:319073 Length:319073
## Class :character 1st Qu.:1001 Class :character Class :character
## Mode :character Median :2907 Mode :character Mode :character
## Mean :2318
## 3rd Qu.:3201
## Max. :3831
##
## DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE
## Length:319073 Min. : 0.0 Length:319073 Length:319073
## Class :character 1st Qu.:177.0 Class :character Class :character
## Mode :character Median :344.0 Mode :character Mode :character
## Mean :383.2
## 3rd Qu.:544.0
## Max. :962.0
## NA's :20250
## YEAR MONTH DAY_OF_WEEK HOUR
## Min. :2015 Min. : 1.00 Length:319073 Min. : 0.00
## 1st Qu.:2016 1st Qu.: 4.00 Class :character 1st Qu.: 9.00
## Median :2017 Median : 7.00 Mode :character Median :14.00
## Mean :2017 Mean : 6.61 Mean :13.12
## 3rd Qu.:2017 3rd Qu.: 9.00 3rd Qu.:18.00
## Max. :2018 Max. :12.00 Max. :23.00
##
## UCR_PART STREET Lat Long
## Length:319073 Length:319073 Min. :-1.00 Min. :-71.18
## Class :character Class :character 1st Qu.:42.30 1st Qu.:-71.10
## Mode :character Mode :character Median :42.33 Median :-71.08
## Mean :42.21 Mean :-70.91
## 3rd Qu.:42.35 3rd Qu.:-71.06
## Max. :42.40 Max. : -1.00
## NA's :19999 NA's :19999
## Location
## Length:319073
## Class :character
## Mode :character
##
##
##
##
# Per-column summary of the offense-code table (columns CODE and NAME)
summary(offense_code)
## CODE NAME
## Min. : 111 Length:576
## 1st Qu.: 542 Class :character
## Median :1768 Mode :character
## Mean :1728
## 3rd Qu.:2900
## Max. :3831
Data Wrangling and Cleaning
# Work on tibble copies of both data sets, then report how many columns the
# incident table has
Offense_code <- as_tibble(offense_code)
Crime <- as_tibble(crime)
ncol(Crime)
## [1] 17
# Lower-case every column name of the incident table
colnames(Crime) <- tolower(colnames(Crime))
# Rename the CODE column of the offense-code table to offense_code so that it
# carries the same name as the code column in the incident table
names(Offense_code)[names(Offense_code) == "CODE"] <- "offense_code"
# Lower-cased copy of the street names (the Crime table itself is not modified)
street <- tolower(Crime$street)
street[1:5]
## [1] "lincoln st" "hecla st" "cazenove st" "newcomb st" "delhi st"
# Lower-cased copy of the offense descriptions (the Crime table itself is not
# modified)
description <- tolower(Crime$offense_description)
description[1:5]
## [1] "larceny all others" "vandalism" "towed motor vehicle"
## [4] "investigate property" "investigate property"
# Strip leading zeros from the offense codes, working on a character copy.
# Only zeros that are not preceded by another digit are removed, so zeros
# inside a code are kept. The previous extra str_remove(..., "0") step deleted
# the first "0" found anywhere in a code (2907 became "297"), corrupting every
# code that contains a zero.
# NOTE(review): this reuses the name `offense_code`, replacing the data frame
# read from offense_codes.csv; the tibble copy `Offense_code` is unaffected.
offense_code <- gsub(
  "(?<![0-9])0+", "", as.character(Crime$offense_code),
  perl = TRUE
)
offense_code[1:5]
## [1] "619" "142" "341" "3114" "3114"
# Recode the shooting flag as a number: missing becomes 0, "Y" becomes 1
Crime$shooting <- replace(Crime$shooting, is.na(Crime$shooting), 0)
Crime$shooting <- replace(Crime$shooting, Crime$shooting == "Y", 1)
Crime$shooting <- as.numeric(Crime$shooting)
# Show a slice of the recoded column
Crime$shooting[1297:1862]
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 1 0
# Remove punctuation from the lower-cased offense descriptions
description <- gsub("[[:punct:]]", "", description)
# Collapse each run of whitespace into a single space. The previous call
# replaced " " with " ", which changed nothing.
description <- str_replace_all(description, "\\s+", " ")
description[31451]
## [1] "drugs poss class b cocaine etc"
# Remove digits. The result is stored back into `description` so the step is
# not lost; `descriptiont` (the misspelled name used before) is kept as an
# alias in case other code reads it.
description <- gsub("[[:digit:]]", "", description)
descriptiont <- description
# The CODE column of Offense_code was renamed to offense_code earlier, so
# Offense_code$CODE is NULL: the former mutate() added no column, its result
# was never stored, and both statements only raised "Unknown or uninitialised
# column" warnings. Verify that the join key is present instead.
stopifnot("offense_code" %in% names(Offense_code))
# Join the incident data with the offense-code table on offense_code.
# merge() returns one output row per matching row of the second table, so a
# code that is listed more than once would duplicate each of its incidents and
# inflate every count derived from the joined data. Keep only the first entry
# (and therefore the first NAME) per code before joining.
# NOTE(review): the map step reports 32548 rows with missing coordinates while
# the raw data has 19999, which suggests such duplication - confirm.
Crime_join <- merge(
  Crime,
  Offense_code[!duplicated(Offense_code$offense_code), ],
  by = "offense_code"
)
names(Crime_join)
## [1] "offense_code" "incident_number" "offense_code_group"
## [4] "offense_description" "district" "reporting_area"
## [7] "shooting" "occurred_on_date" "year"
## [10] "month" "day_of_week" "hour"
## [13] "ucr_part" "street" "lat"
## [16] "long" "location" "NAME"
# Rename the NAME column that came from the offense-code table
colnames(Crime_join)[colnames(Crime_join) == "NAME"] <- "offense_name"
# Work on a copy of the names: drop every character outside \x20-\x7F
offense_name <- Crime_join$offense_name
offense_name <- gsub('[^\x20-\x7F]', "", offense_name)
# Lower-case the values themselves. Assigning tolower() to
# names(offense_name) only attached lower-case element names and left the
# values in their original case.
offense_name <- tolower(offense_name)
Data Visualization
# How does the number of reported incidents develop over the years?
# One row per year with its incident count, drawn as a line.
Crime_join %>%
  count(year, name = "NumberofIncident") %>%
  ggplot(aes(x = year, y = NumberofIncident)) +
  geom_line() +
  labs(x = "Year", y = "Number of Incidents")
# At what time of day are most incidents reported?
# geom_bar() counts the rows per hour itself, so the data needs no grouping
# first; the former group_by(incident_number) built one group per incident
# number without affecting the chart.
Crime_join %>%
  select(hour) %>%
  ggplot() +
  geom_bar(aes(x = hour), color = "blue") +
  ggtitle("Time and Number of Incidents") +
  #geom_density(aes(x=hour), fill="blue")+
  #xlim(c(2006, 2020))+
  #ylim(c(1000, 5000))+
  xlab("Time of Incident") + ylab("Number of Incidents")
# Which ten streets have the most reported incidents?
# Rows without a street are dropped, incidents are counted per street, and the
# ten largest counts are plotted as labelled points.
Crime_join %>%
  filter(!is.na(street)) %>%
  count(street, name = "NumberofIncident") %>%
  arrange(desc(NumberofIncident)) %>%
  head(10) %>%
  ggplot(aes(x = NumberofIncident, y = street)) +
  geom_point() +
  geom_label(aes(label = street)) +
  labs(
    title = "Top 10 Streets by Reported Incidents",
    x = "Number of Incidents",
    y = "Streets"
  )
# What proportion of incidents involved a shooting?
# Donut chart: every shooting class becomes a ring segment that spans its
# share of the cumulative percentage (ymin to CummProp).
Crime_join %>%
  select(shooting) %>%
  group_by(shooting) %>%
  summarise(Count = n(), shooting_incidents = sum(shooting)) %>%
  mutate(
    shooting = as.factor(shooting),
    Proportion = Count / sum(Count) * 100,
    CummProp = cumsum(Proportion),
    ymin = c(0, head(CummProp, n = -1)),
    # Anchor each label at the middle of its own segment. Proportion is the
    # width of a segment, not a position on the ring, so using it as y put the
    # labels at places unrelated to the segments they describe.
    label_y = (ymin + CummProp) / 2,
    # Rounded percentage instead of the full floating-point value
    label_text = paste0(round(Proportion, 2), "%")
  ) %>%
  ggplot() +
  geom_rect(
    aes(ymax = CummProp, ymin = ymin, xmax = 4, xmin = 3, fill = shooting)
  ) +
  geom_label(aes(x = 2, y = label_y, label = label_text)) +
  # Wrap the stacked bar around a circle; without this it is a plain bar
  coord_polar(theta = "y") +
  # The x range below xmin = 3 stays empty and forms the hole of the donut
  xlim(c(-1, 4)) +
  theme_void()
# Where are incidents reported? Interactive, clustered map of the incidents.
# Colour palette keyed by offense code (five anchor colours spread over all
# codes present in the joined data).
pal <- colorFactor(
  c("red", "gray", "orange", "white", "blue"),
  domain = unique(Crime_join$offense_code)
)
# Only rows with usable coordinates are mapped. Rows with missing lat/long
# cannot be drawn and previously triggered a validateCoords() warning.
# NOTE(review): the data summary shows Lat with a minimum of -1 and Long with a
# maximum of -1 while the quartiles sit near 42.3 / -71.1, so -1 looks like a
# placeholder for "unknown" - confirm. Those rows are excluded as well, because
# they would be drawn far away from all other points.
map <- Crime_join %>%
  filter(!is.na(lat), !is.na(long), lat != -1, long != -1) %>%
  leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    color = ~pal(offense_code),
    stroke = FALSE, fillOpacity = 0.5,
    clusterOptions = markerClusterOptions(),
    popup = ~as.character(street)
  )
map