# Packages this analysis depends on.
required_packages <- c(
  "tidyverse", "ggplot2", "ggpmisc", "wordcloud", "tidytext", "leaflet"
)

# Attach each package, installing it first when it is not yet available.
# require() returns FALSE (rather than erroring) when a package cannot be
# loaded, which is what makes it usable as the availability probe here.
for (p in required_packages) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p, dependencies = TRUE)
    # Installing does not attach the package: load it explicitly so it is
    # usable in this session, and fail loudly if it still cannot be loaded.
    library(p, character.only = TRUE)
  }
}
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: ggpmisc
## Loading required package: ggpp
## 
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## Loading required package: tidytext
## Loading required package: leaflet
# Read the two input tables (paths are relative to the working directory);
# show_col_types = FALSE keeps readr from printing the column specification.
crime <- read_csv(file = "crime.csv", show_col_types = FALSE)
offense_code <- read_csv(file = "offense_codes.csv", show_col_types = FALSE)

Exploratory Data Analysis

# Per-column summary of the crime table (the interactive View() calls are left
# commented out).
#View(crime)
#View(offense_code)
summary(crime)
##  INCIDENT_NUMBER     OFFENSE_CODE  OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION
##  Length:319073      Min.   : 111   Length:319073      Length:319073      
##  Class :character   1st Qu.:1001   Class :character   Class :character   
##  Mode  :character   Median :2907   Mode  :character   Mode  :character   
##                     Mean   :2318                                         
##                     3rd Qu.:3201                                         
##                     Max.   :3831                                         
##                                                                          
##    DISTRICT         REPORTING_AREA    SHOOTING         OCCURRED_ON_DATE  
##  Length:319073      Min.   :  0.0   Length:319073      Length:319073     
##  Class :character   1st Qu.:177.0   Class :character   Class :character  
##  Mode  :character   Median :344.0   Mode  :character   Mode  :character  
##                     Mean   :383.2                                        
##                     3rd Qu.:544.0                                        
##                     Max.   :962.0                                        
##                     NA's   :20250                                        
##       YEAR          MONTH       DAY_OF_WEEK             HOUR      
##  Min.   :2015   Min.   : 1.00   Length:319073      Min.   : 0.00  
##  1st Qu.:2016   1st Qu.: 4.00   Class :character   1st Qu.: 9.00  
##  Median :2017   Median : 7.00   Mode  :character   Median :14.00  
##  Mean   :2017   Mean   : 6.61                      Mean   :13.12  
##  3rd Qu.:2017   3rd Qu.: 9.00                      3rd Qu.:18.00  
##  Max.   :2018   Max.   :12.00                      Max.   :23.00  
##                                                                   
##    UCR_PART            STREET               Lat             Long       
##  Length:319073      Length:319073      Min.   :-1.00   Min.   :-71.18  
##  Class :character   Class :character   1st Qu.:42.30   1st Qu.:-71.10  
##  Mode  :character   Mode  :character   Median :42.33   Median :-71.08  
##                                        Mean   :42.21   Mean   :-70.91  
##                                        3rd Qu.:42.35   3rd Qu.:-71.06  
##                                        Max.   :42.40   Max.   : -1.00  
##                                        NA's   :19999   NA's   :19999   
##    Location        
##  Length:319073     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Per-column summary of the offense-code table.
summary(offense_code)
##       CODE          NAME          
##  Min.   : 111   Length:576        
##  1st Qu.: 542   Class :character  
##  Median :1768   Mode  :character  
##  Mean   :1728                     
##  3rd Qu.:2900                     
##  Max.   :3831

Data Wrangling and Cleaning

# Work on tibble copies of the two tables
Crime <- crime %>% as_tibble()
Offense_code <- offense_code %>% as_tibble()
ncol(Crime)
## [1] 17
# Lower-case every column name of the crime table
Crime <- rename_with(Crime, tolower)

# Rename CODE in the offense-code table to offense_code, the name the crime
# table uses for the same field
names(Offense_code)[names(Offense_code) == "CODE"] <- "offense_code"

# Lower-cased copy of the street names.
# NOTE(review): this only fills the standalone `street` vector; Crime$street
# itself is left unchanged - confirm that is intended.
street <- tolower(Crime$street)
street[1:5]
## [1] "lincoln st"  "hecla st"    "cazenove st" "newcomb st"  "delhi st"
# Lower-cased copy of the offense descriptions, held in the standalone
# `description` vector that the cleaning steps below keep refining.
description <- tolower(Crime$offense_description)
description[1:5]
## [1] "larceny all others"   "vandalism"            "towed motor vehicle" 
## [4] "investigate property" "investigate property"
# Strip leading zeros from the offense codes.
# The previous str_remove(..., "0") call deleted the first "0" found anywhere
# in each code (e.g. "1402" became "142"), corrupting codes instead of
# trimming them. The look-behind below only matches zeros that are not
# preceded by a digit, so interior and trailing zeros are preserved.
# NOTE(review): this reuses the name `offense_code`, replacing the table read
# from offense_codes.csv with a character vector.
offense_code <- gsub(
  "(?<![0-9])0+", "", as.character(Crime$offense_code),
  perl = TRUE
)
offense_code[1:5]
## [1] "619"  "142"  "341"  "3114" "3114"
# Recode the shooting column as numeric: missing -> 0, "Y" -> 1
Crime$shooting <- Crime$shooting %>%
  replace(is.na(.), 0) %>%
  replace(. == "Y", 1) %>%
  as.numeric()
Crime$shooting[1297:1862]
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 1 0
# Remove punctuation from the offense descriptions
description <- gsub("[[:punct:]]", "", description)

# Collapse unnecessary whitespace. The earlier
# str_replace_all(description, " ", " ") swapped each space for an identical
# space and therefore changed nothing; str_squish() reduces every run of
# whitespace to a single space and trims both ends.
description <- str_squish(description)
description[31451]
## [1] "drugs  poss class b  cocaine etc"
# Remove digits from the offense descriptions. The result used to be stored
# under a misspelled name (`descriptiont`), so `description` itself kept its
# digits.
description <- gsub("[[:digit:]]", "", description)
# Keep the old name as an alias in case anything still refers to it
descriptiont <- description
# The CODE column was already renamed to offense_code earlier in this script,
# so Offense_code$CODE evaluates to NULL (with an "Unknown or uninitialised
# column" warning) and the mutate() that copied it added nothing; its result
# was not stored either. Check that the join key is present and preview it
# under its current name instead.
stopifnot("offense_code" %in% names(Offense_code))
head(Offense_code$offense_code)
# Join the crime records to the offense-code table on offense_code.
# merge() keeps only rows whose key appears in both tables, and it repeats a
# crime row once per matching lookup row. Keeping a single lookup row per code
# (the first one) guarantees the join cannot multiply incidents, which would
# inflate every count computed from Crime_join.
# NOTE(review): the map warning reports more rows with missing coordinates
# (32548) than the crime table has NA coordinates (19999), which suggests the
# lookup table does contain repeated codes - confirm which NAME should win.
Crime_join <- merge(
  x = Crime,
  y = Offense_code[!duplicated(Offense_code$offense_code), ],
  by = "offense_code"
)
# An inner join against a unique key can never add rows
stopifnot(nrow(Crime_join) <= nrow(Crime))
names(Crime_join)
##  [1] "offense_code"        "incident_number"     "offense_code_group" 
##  [4] "offense_description" "district"            "reporting_area"     
##  [7] "shooting"            "occurred_on_date"    "year"               
## [10] "month"               "day_of_week"         "hour"               
## [13] "ucr_part"            "street"              "lat"                
## [16] "long"                "location"            "NAME"
# Rename the NAME column brought in by the join
colnames(Crime_join)[colnames(Crime_join) == "NAME"] <- "offense_name"

# Strip non-printing characters from the offense names, then lower-case them.
# Printable ASCII runs from space (0x20) to tilde (0x7E); 0x7F is the DEL
# control character, so it is left out of the keep-range.
offense_name <- Crime_join$offense_name
offense_name <- gsub("[^\x20-\x7E]", "", offense_name)
# Lower-case the values themselves. The earlier names(offense_name) <- ...
# only attached lower-cased labels to the vector and left the values in their
# original case.
offense_name <- tolower(offense_name)

Data Visualization

# How has the number of reported incidents changed over the years?
# One row per year with its incident count, drawn as a line.
Crime_join %>%
  count(year, name = "NumberofIncident") %>%
  ggplot(aes(x = year, y = NumberofIncident)) +
  geom_line() +
  labs(x = "Year", y = "Number of Incidents")

# At what time of day are most incidents reported?
# The bar chart only needs the hour column. The former
# group_by(incident_number) built one group per incident number without
# influencing the plot (ggplot2 does not use dplyr groups), so it is dropped
# along with the unused commented-out layers.
Crime_join %>%
  select(hour) %>%
  ggplot() +
  geom_bar(aes(x = hour), color = "blue") +
  ggtitle("Time and Number of Incidents") +
  xlab("Time of Incident") +
  ylab("Number of Incidents")

# Which ten streets have the most reported incidents?
# Rows without a street are excluded before counting.
Crime_join %>%
  filter(!is.na(street)) %>%
  count(street, name = "NumberofIncident", sort = TRUE) %>%
  head(10) %>%
  ggplot(aes(x = NumberofIncident, y = street)) +
  geom_point() +
  geom_label(aes(label = street)) +
  labs(
    title = "Top 10 Streets by Reported Incidents",
    x = "Number of Incidents",
    y = "Streets"
  )

# What proportion of incidents involved a shooting?
# Draws a donut chart: one ring segment per distinct shooting value, sized by
# its share of all rows in Crime_join.
Crime_join %>%
  select(shooting) %>%
  group_by(shooting) %>%
  summarise(Count = n(), shooting_incidents = sum(shooting)) %>%
  mutate(
    shooting = as.factor(shooting),
    Proportion = Count / sum(Count) * 100,
    CummProp = cumsum(Proportion),
    # Each segment starts where the previous one ended
    ymin = c(0, head(CummProp, n = -1)),
    # Centre every label on its own segment. Using y = Proportion placed a
    # label at the segment's size rather than its position, so with one
    # dominant class both labels ended up next to the 0/100 seam.
    label_y = (ymin + CummProp) / 2,
    # Two decimals instead of the full unrounded fraction
    label_text = sprintf("%.2f%%", Proportion)
  ) %>%
  ggplot() +
  geom_rect(
    mapping = aes(
      ymax = CummProp, ymin = ymin, xmax = 4, xmin = 3, fill = shooting
    )
  ) +
  geom_label(aes(x = 2, y = label_y, label = label_text)) +
  # theta = "y" wraps the stacked bar into a ring; remove it to see the
  # underlying rectangles
  coord_polar(theta = "y") +
  xlim(c(-1, 4)) +
  theme_void()

# Where were incidents reported? Interactive, clustered map of incidents.

# Colour scale keyed by offense code, built from five base colours
pal <- colorFactor(c("red", "gray", "orange", "white", "blue"),
                   domain = unique(Crime_join$offense_code))

# Keep only rows with usable coordinates before mapping. Rows with NA
# coordinates were previously passed through and triggered leaflet's
# "missing or invalid lat/lon" warning. The data also contains -1 for lat and
# long (min Lat and max Long in the summary are -1), which looks like a
# placeholder; leaflet would plot it as a real point far from every other
# marker, so those rows are excluded too.
map <- Crime_join %>%
  filter(!is.na(lat), !is.na(long), lat != -1, long != -1) %>%
  leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    color = ~pal(offense_code),
    stroke = FALSE, fillOpacity = 0.5,
    lat = ~lat,
    lng = ~long,
    clusterOptions = markerClusterOptions(),
    popup = ~as.character(street)
  )
map
#Which words are usually used to describe incidents?

# Draw a word cloud of the most frequent words in the offense_description
# column of the global Crime_join table, after dropping stop words and a few
# abbreviations.
#
# x: not referenced anywhere in the body; the data always comes from
#    Crime_join. NOTE(review): presumably meant to select the column - confirm.
# Called for the plot that wordcloud() draws.
Word_Cloud <- function(x) {
  ignored_words <- c("mfr", "dist", "val", "mv")
  word_counts <- Crime_join %>%
    unnest_tokens(word, offense_description) %>%
    filter(!word %in% stop_words$word, !word %in% ignored_words) %>%
    count(word, sort = TRUE) %>%
    ungroup() %>%
    head(50)
  with(
    word_counts,
    wordcloud(word, n, max.words = 40, colors = brewer.pal(8, "Set1"))
  )
}

Word_Cloud(offense_description)