In this week’s assignment, I am going to scrape some crime data for the state of Maine from ‘The Disaster Center’ website (actual URLs used will be shared as appropriate below).

I have reviewed the site’s robots.txt file (http://www.disastercenter.com/robots.txt) and the pages that I utilized are not listed as disallowed. Given that I will not be in violation of the site’s robots.txt file and this is for educational purposes, I believe my use of web scraping under these parameters is considered ethical.

Packages

Load required packages (suppressing package startup messages as needed).

library (knitr)
suppressMessages (library (tidyverse))
suppressMessages (library (rvest))

Data

Scrape number of crimes by type from http://www.disastercenter.com/crime/mecrime.htm for the years 2010 - 2019, leveraging Chrome’s SelectorGadget extension. Remove newline and comma characters and convert strings to numeric.

# Download and parse the Maine crime statistics page once; every column below
# is extracted from this single parsed document.
maineCrimeSite <- read_html("http://www.disastercenter.com/crime/mecrime.htm")

# Extract the text of every node matching `selector` in `page` and convert it
# to numeric.
#
# page:     parsed HTML document as returned by read_html().
# selector: CSS selector identifying the cells of one table column.
#
# Returns a numeric vector with one element per matched node. Text that is
# still not a valid number after cleaning becomes NA (as.numeric() warns).
scrapeNumericColumn <- function(page, selector) {
  page %>%
    html_nodes(selector) %>%
    html_text() %>%
    str_replace_all("\\n", "") %>% # strip newline characters from the text
    str_replace_all(",", "") %>%   # strip commas (presumably thousands separators)
    as.numeric()
}

# Scrape the crime column found in the `index`-th <td> of the data rows.
crimeColumn <- function(index) {
  scrapeNumericColumn(
    maineCrimeSite,
    paste0("tr:nth-child(2) td:nth-child(", index, ") small")
  )
}

# NOTE(review): the year selector has an extra `tr` step compared with the
# crime columns; it is kept exactly as originally written -- confirm against
# the page markup before changing it.
year <- scrapeNumericColumn(
  maineCrimeSite,
  "tr:nth-child(2) tr td:nth-child(1) small"
)

violent  <- crimeColumn(4)
property <- crimeColumn(5)
murder   <- crimeColumn(6)
rape     <- crimeColumn(7)
robbery  <- crimeColumn(8)
assault  <- crimeColumn(9)
burglary <- crimeColumn(10)
larceny  <- crimeColumn(11)
vehicle  <- crimeColumn(12)

Combine into a data frame with friendly column names.

# Assemble the scraped vectors into one data frame, one row per year.
# Column names are given directly at construction; check.names = FALSE keeps
# the friendly names that contain spaces exactly as written.
combinedData <- data.frame(
  `Year` = year,
  `Violent` = violent,
  `Property` = property,
  `Murder` = murder,
  `Forcible Rape` = rape,
  `Robbery` = robbery,
  `Aggravated Assault` = assault,
  `Burglary` = burglary,
  `Larceny Theft` = larceny,
  `Vehicle Theft` = vehicle,
  check.names = FALSE
)

Convert data frame to a tibble.

# Tibble copy of the combined data frame; all later steps work on this object.
combinedDataAsTibble <- combinedData %>%
  as_tibble()

Derive a new column that calculates the total crime per year.

# Add the total number of crimes per year.
#
# Violent and Property are already the category totals of their sub-types
# (e.g. 2010: Murder 24 + Forcible Rape 389 + Robbery 412 + Aggravated
# Assault 796 = Violent 1621, and Burglary 7364 + Larceny Theft 24547 +
# Vehicle Theft 989 = Property 32900). Summing all nine columns therefore
# counts every crime twice; the total is just the two category columns.
# Columns are referenced by bare name so mutate() reads them from the piped
# data rather than from the global object.
combinedDataAsTibble <- combinedDataAsTibble %>%
  mutate(`Total Crime` = Violent + Property)

Display table with data of interest (wide format).

# Render the wide-format table (one row per year) with a caption.
combinedDataAsTibble %>%
  kable(caption = "Maine Number of Crimes by Type 2010 - 2019")
Maine Number of Crimes by Type 2010 - 2019
Year Violent Property Murder Forcible Rape Robbery Aggravated Assault Burglary Larceny Theft Vehicle Theft Total Crime
2010 1621 32900 24 389 412 796 7364 24547 989 69042
2011 1638 33829 26 394 370 848 7865 24886 1078 70934
2012 1626 33398 26 372 420 808 7476 24931 991 70048
2013 1761 30454 24 366 335 907 6480 23063 911 64301
2014 1698 26427 21 346 304 889 5035 20591 801 56112
2015 1726 24361 23 337 311 931 4694 18853 814 52050
2016 1648 21912 20 300 266 950 4003 17134 775 47008
2017 1610 20142 23 470 249 868 3337 16012 793 43504
2018 1500 18201 23 447 228 802 2712 14712 777 39402
2019 1548 16743 20 516 188 824 2350 13667 726 36582

Charts

Create a time series chart that shows the number of occurrences of each crime between 2010 - 2019. As part of this, convert the tibble to long format and drop total crimes for this data set.

# Long-format data for plotting: one row per (Year, Crime) pair.
# `Total Crime` is dropped so it is not drawn as a series of its own.
# pivot_longer() replaces the superseded gather(); after arrange(Year) the
# rows are in the same order (by year, then by original column order).
chartData <- combinedDataAsTibble %>%
  select(-`Total Crime`) %>%
  pivot_longer(-Year, names_to = "Crime", values_to = "Occurrences") %>%
  arrange(Year)

# Chart #1: one line (with points) per crime type across the years.
ggplot(chartData, aes(x = Year, y = Occurrences, col = Crime)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 2010:2019) + # one tick per year
  ggtitle("Chart#1: Number of Crimes by Type for Years 2010 - 2019") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

Since Burglary, Larceny and Property crimes greatly outnumber the rest, re-run the chart without these crime types.

# Long-format data without the three largest categories (and without the
# derived total), so the smaller series are readable on a shared y axis.
# pivot_longer() replaces the superseded gather(); after arrange(Year) the
# rows are in the same order (by year, then by original column order).
chartDataRefined <- combinedDataAsTibble %>%
  select(-c(Burglary, Property, `Larceny Theft`, `Total Crime`)) %>%
  pivot_longer(-Year, names_to = "Crime", values_to = "Occurrences") %>%
  arrange(Year)

# Chart #2: same time series as Chart #1, restricted to the remaining types.
ggplot(chartDataRefined, aes(x = Year, y = Occurrences, col = Crime)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 2010:2019) + # one tick per year
  ggtitle("Chart#2: Number of Crimes by Type for Years 2010 - 2019 \n (Burglary, Property and Larceny Theft Not Included)") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

From Charts #1 and #2, most crimes have either decreased or stayed relatively flat over the past decade. The exception is forcible rape, which has risen markedly since 2016.

Lastly, create a bar chart to compare most to least crimes for 2019.

# Chart #3: bar chart of 2019 occurrences per crime type, ordered from least
# to most frequent.
#
# chartData holds every year, so it must be filtered to 2019 first; without
# the filter geom_col() stacks all years into each bar and the chart shows
# ten-year totals under a "2019" title. A single geom_col() layer is enough:
# the earlier extra dodge layer was hidden behind the blue one.
chartData %>%
  filter(Year == 2019) %>%
  ggplot(aes(x = fct_reorder(Crime, Occurrences), y = Occurrences)) +
  geom_col(fill = "blue") +
  xlab("Crime") +
  # Breaks sized for single-year counts (the largest 2019 value is in the
  # tens of thousands), not for the ten-year stacked totals.
  scale_y_continuous(breaks = seq(0, 18000, by = 2000)) +
  ggtitle("Chart#3: Crimes by Type for 2019") +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the title
  # Rotate the crime names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Since Burglary, Larceny and Property crimes greatly outnumber the rest, re-run the chart without these crime types.

# Chart #4: bar chart of 2019 occurrences per crime type, excluding Burglary,
# Property and Larceny Theft, ordered from least to most frequent.
#
# chartDataRefined holds every year, so it must be filtered to 2019 first;
# without the filter geom_col() stacks all years into each bar and the chart
# shows ten-year totals under a "2019" title. A single geom_col() layer is
# enough: the earlier extra dodge layer was hidden behind the blue one.
chartDataRefined %>%
  filter(Year == 2019) %>%
  ggplot(aes(x = fct_reorder(Crime, Occurrences), y = Occurrences)) +
  geom_col(fill = "blue") +
  xlab("Crime") +
  # Breaks sized for single-year counts of the remaining types (the largest
  # 2019 value is below 1600), not for the ten-year stacked totals.
  scale_y_continuous(breaks = seq(0, 1600, by = 200)) +
  ggtitle("Chart#4: Crimes by Type for 2019 \n (Burglary, Property and Larceny Theft Not Included)") +
  theme(plot.title = element_text(hjust = 0.5)) + # centre the title
  # Rotate the crime names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

One issue I would have liked to have resolved in Charts #3 and #4 was the placement of a label on each bar that stated the number of occurrences for each crime. This would make it easier for the viewer to see the number of occurrences for the crime committed the least (e.g., murder). However, comparatively speaking, it is great to see that the number of murders is fairly negligible in the state.