# Switch to the project folder so the relative CSV file name below resolves.
# NOTE(review): this absolute path only exists on the author's machine.
setwd("C:/Loyola University/MBA/GB736 - Data Vizualization/R")

# Name of the CSV file that the fread() calls further down load.
projectfile <- "BPD_Part_1_Victim_Based_Crime_Data12_31_2019.csv"
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:base':
##
## date
library(scales)
library(DescTools)
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
##
## %like%
library(ggplot2)
library(packrat)
library(rsconnect)
library(knitr)
# knit2html() needs the path of an existing knitr/R Markdown source file.
# The placeholder "file" does not exist, so the unguarded call aborted with
# "cannot open the connection". Only knit when the input is actually present.
if (file.exists("file")) {
  knit2html("file")
}
# Read only the first row, without treating it as a header, so that it comes
# back as ordinary data (presumably the column names -- confirm against the
# CSV).
mycolnames <- fread(projectfile, nrows = 1, header = FALSE)

# Columns that fread() should skip when loading the full file.
notneeded <- c("Longitude", "Latitude", "Location 1", "vri_name1")

# Load the full file. Empty fields and the literal text "NA" are both read
# as missing; the string "NA" is spelled out instead of relying on a bare
# logical NA being coerced inside c().
df <- fread(projectfile, na.strings = c("", "NA"), drop = notneeded)
# Incident-level table behind the time-based summaries.
# Keeps rows with a crime code, date and time, derives the hour of day,
# abbreviated weekday and month names and the calendar year, and retains
# 2015 onward. The final data.frame() call turns `Total Incidents` into the
# syntactic name Total.Incidents.
df2 <- df %>%
  select(CrimeCode, CrimeDate, CrimeTime, `Total Incidents`) %>%
  filter(!is.na(CrimeTime) & !is.na(CrimeCode) & !is.na(CrimeDate)) %>%
  mutate(
    # Parse the date once and reuse it, instead of calling mdy() three times.
    crime_date = mdy(CrimeDate),
    hour24 = hour(hms(CrimeTime)),
    dayoftheweek = weekdays(crime_date, abbreviate = TRUE),
    monthname = months(crime_date, abbreviate = TRUE),
    year = year(crime_date)
  ) %>%
  # Drop the helper column so the resulting columns match the original set.
  select(-crime_date) %>%
  filter(year >= 2015) %>%
  data.frame()
# Total incidents per calendar year, as a plain data frame.
df_year <- data.frame(
  summarise(
    group_by(df2, year),
    totincidents = sum(Total.Incidents)
  )
)
# Total incidents per abbreviated weekday name, as a plain data frame.
df_day <- data.frame(
  summarise(
    group_by(df2, dayoftheweek),
    totincidents = sum(Total.Incidents)
  )
)
# Total incidents per abbreviated month name, as a plain data frame.
df_month <- data.frame(
  summarise(
    group_by(df2, monthname),
    totincidents = sum(Total.Incidents)
  )
)
# Total incidents per hour of day (hour24), as a plain data frame.
df_hour <- data.frame(
  summarise(
    group_by(df2, hour24),
    totincidents = sum(Total.Incidents)
  )
)
# Total incidents per weekday/hour combination, as a plain data frame.
df_dayhour <- data.frame(
  summarise(
    group_by(df2, dayoftheweek, hour24),
    totincidents = sum(Total.Incidents)
  )
)
# Yearly incident totals (2015 onward) for the first chart, sorted from the
# highest total to the lowest.
df_yearplot <- df %>%
  select(CrimeDate, `Total Incidents`) %>%
  # Backticks refer to the column. The quoted form is.na("Total Incidents")
  # tested a string literal, was always FALSE, and so filtered nothing.
  filter(!is.na(CrimeDate) & !is.na(`Total Incidents`)) %>%
  mutate(yr = year(mdy(CrimeDate))) %>%
  filter(yr >= 2015) %>%
  group_by(yr) %>%
  summarise(totincidents = sum(`Total Incidents`)) %>%
  arrange(desc(totincidents)) %>%
  data.frame()
# Incident totals per neighborhood and year for 2018 and 2019, keeping the 20
# largest neighborhood-year rows.
# NOTE(review): top_n() ranks (Neighborhood, yr) rows, so a neighborhood can
# appear with only one of its two years.
df_neighborhood <- df %>%
  select(Neighborhood, CrimeDate, `Total Incidents`) %>%
  # Backticks refer to the column. The quoted form is.na("Total Incidents")
  # tested a string literal, was always FALSE, and so filtered nothing.
  filter(
    !is.na(Neighborhood) & !is.na(CrimeDate) & !is.na(`Total Incidents`)
  ) %>%
  mutate(yr = year(mdy(CrimeDate))) %>%
  filter(yr == 2018 | yr == 2019) %>%
  group_by(Neighborhood, yr) %>%
  summarise(totincidents = sum(`Total Incidents`)) %>%
  arrange(desc(totincidents)) %>%
  data.frame() %>%
  top_n(20, wt = totincidents)
The dataset that I have chosen to analyze provides open preliminary data from the Baltimore Police Department depicting victim-based crime reports across the greater city. This dataset includes variables such as (but not limited to) date, time, location, weapon, neighborhood, district, etc. [See below]. This dataset also includes 31 years of data spanning back to 1963 (not inclusive of all years; only the 5 most recent, most relevant years are used). This dataset was obtained from https://data.baltimorecity.gov/Public-Safety/BPD-Part-1-Victim-Based-Crime-Data/wsfq-mvij.
# Show the names of the columns kept after loading the CSV.
names(df)
## [1] "CrimeDate" "CrimeTime" "CrimeCode" "Location"
## [5] "Description" "Inside/Outside" "Weapon" "Post"
## [9] "District" "Neighborhood" "Premise" "Total Incidents"
The first graph illustrates the overall victim incidents in Baltimore City over the past five years. Since 2017, victim-related incidents have decreased by over 7,000. In fact, 2019 had the lowest number of victim-related incidents in the past 5 years, whereas 2017 was the most dangerous year, reaching close to 52,000.
# Bar chart of total victim incidents per year.
# The bars use a constant red fill, which overrode the former
# fill = as.factor(yr) mapping; that mapping and its unused brewer fill scale
# are removed. `labels` is spelled out instead of relying on partial argument
# matching of `label`.
library(ggplot2)
ggplot(df_yearplot, aes(x = yr, y = totincidents)) +
  geom_bar(colour = "black", fill = "red", stat = "identity", width = 0.6) +
  labs(
    title = "Number of Victim Incidents",
    subtitle = "Baltimore City 2015-2019",
    x = "Year",
    y = "Incidents",
    caption = "www.data.baltimorecity.gov"
  ) +
  scale_y_continuous(labels = comma) +
  theme(
    plot.title = element_text(
      size = 14, face = "italic", hjust = 0.5, color = "black"
    )
  )
The second graph depicts the overall victim incidents in Baltimore City, broken down by month. It is quite clear from the graph that crime is most prevalent in the warmer months. As you can see, crime seems to peak in August. January seems to be an outlier month where incidents rise; this could have something to do with the nature of the “New Year”. It is safe to say, however, that based on this data, people are less likely to be a victim of a crime during the winter months compared to the rest of the year.
# Month names as a factor in calendar order (Jan..Dec) rather than
# alphabetical order. `levels` is spelled out in full (the original relied on
# partial matching of `level`), and month.abb supplies the same twelve
# abbreviations as the former literal vector.
month_order <- factor(df_month$monthname, levels = month.abb)
month_order
## [1] Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Bar chart of total victim incidents per month, in calendar order.
# The ordering factor is built from the monthname column inside aes(), so it
# stays aligned with the rows of df_month instead of depending on a separate
# global vector. The brewer fill scale (no fill is mapped) and the duplicated
# plot.title theme call are dropped; neither affected the rendered chart.
ggplot(
  df_month,
  aes(x = factor(monthname, levels = month.abb), y = totincidents)
) +
  geom_bar(colour = "black", fill = "blue", stat = "identity", width = 0.6) +
  labs(
    title = "Number of Victim Incidents by Month",
    subtitle = "Baltimore City 2015-2019",
    x = "Months of the Year",
    y = "Incidents",
    caption = "source: www.data.baltimorecity.gov"
  ) +
  scale_y_continuous(labels = comma) +
  theme(
    plot.title = element_text(
      size = 14, face = "italic", hjust = 0.5, color = "black"
    )
  )
The third graph depicts the overall victim incidents in Baltimore City, broken down by day of the week. It is interesting to see that there isn’t much of a difference from day to day in terms of incident count. Not surprisingly, Friday is the highest day of the week for victim incidents, as it is the end of the work week and the start of the weekend. It is surprising (to me), however, that Saturday and Sunday have the fewest incidents; I would have assumed more crime would happen on the weekend.
# Weekday names as a factor ordered Monday..Sunday rather than alphabetically.
# `levels` is spelled out in full; the original relied on partial matching of
# `level`.
day_order <- factor(
  df_day$dayoftheweek,
  levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
)
day_order
## [1] Fri Mon Sat Sun Thu Tue Wed
## Levels: Mon Tue Wed Thu Fri Sat Sun
# Bar chart of total victim incidents per weekday, ordered Monday..Sunday.
# The ordering factor is built from the dayoftheweek column inside aes(), so
# it stays aligned with the rows of df_day instead of depending on a separate
# global vector. The x-axis label now says "Day of the Week" (it read "Day of
# the Year", which does not match weekday data). The unused brewer fill scale
# and the duplicated plot.title theme call are dropped.
ggplot(
  df_day,
  aes(
    x = factor(
      dayoftheweek,
      levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    ),
    y = totincidents
  )
) +
  geom_bar(colour = "black", fill = "green", stat = "identity", width = 0.6) +
  labs(
    title = "Number of Victim Incidents by Day",
    subtitle = "Baltimore City 2015-2019",
    x = "Day of the Week",
    y = "Incidents",
    caption = "source: www.data.baltimorecity.gov"
  ) +
  scale_y_continuous(labels = comma) +
  theme(
    plot.title = element_text(
      size = 14, face = "italic", hjust = 0.5, color = "black"
    )
  )
The fourth graph describes the number of victim incidents in Baltimore City by hour and by day of the week. This expands on Plot 3 a little further by breaking down the time of the incident. Based on the graph, 6:00PM is actually the most likely time to face a victim-related incident. Conversely, 5:00AM is the least likely time to face a victim-related incident. This graphic does an excellent job of clearly showing that the closer it gets to sundown, the more likely you are to encounter an issue.
# Stacked bar chart of victim incidents per hour of day, with each bar split
# by weekday. All titles, axis labels, the legend title and the caption are
# set in a single labs() call.
ggplot(
  df_dayhour,
  aes(x = hour24, y = totincidents, fill = dayoftheweek)
) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Number of Victim Incidents \nby Hour and Days of the Week",
    subtitle = "Baltimore City 2015-2019",
    x = "Hours of the Day",
    y = "Incidents",
    fill = "Days of the Week",
    caption = "source: www.baltimorecity.gov"
  ) +
  scale_fill_brewer(palette = "Set3") +
  scale_y_continuous(labels = comma) +
  theme(
    plot.title = element_text(
      size = 14, face = "italic", hjust = 0.5, color = "black"
    )
  )
The fifth and final graph shows the top 20 victim incident neighborhoods in Baltimore City for the past two years. It is clear that downtown has faced the most victim incidents by far. This is a little disheartening since I live downtown. Coincidentally, having downloaded the “Criminal” app, I can affirm that this is very accurate; I receive notifications daily about crimes happening in the downtown area. This graph also depicts how some neighborhoods in the city have seen a small decrease in victim incidents between ’18-’19, including (but not limited to) Belair, Brooklyn, Canton, Sandtown, and Inner Harbor.
# Horizontal dodged bar chart of the highest neighborhood-year incident
# totals, with one bar per year and the total printed beside each bar.
# `labels` is spelled out instead of relying on partial matching of `label`.
# The text layer is dodged by 0.6, the same as the bar width, so each value
# label lines up with its own bar (it was 0.5, which left them off-centre).
ggplot(
  df_neighborhood,
  aes(
    x = reorder(Neighborhood, totincidents),
    y = totincidents,
    fill = as.factor(yr)
  )
) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6) +
  coord_flip() +
  labs(
    title = "Top 20 Victim Incident Neighborhoods in Baltimore",
    subtitle = "Baltimore City 2018-2019",
    x = "Neighborhood",
    y = "Incidents",
    caption = "www.baltimorecity.gov"
  ) +
  scale_fill_brewer(name = "Year", palette = "Set2") +
  scale_y_continuous(labels = comma) +
  theme(
    plot.title = element_text(
      size = 14, face = "italic", hjust = 0.5, color = "black"
    )
  ) +
  geom_text(
    aes(label = totincidents),
    size = 3,
    position = position_dodge(width = 0.6),
    hjust = -0.25
  )