#getwd()
#setwd("/Users/tina/Projects/R/PhoenixCityData")
#Installing packages needed:
suppressMessages(library(dplyr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(plyr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(readr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(lubridate, warn.conflicts = FALSE, quietly=TRUE))
#Reading in the data
CrimeStats = read.csv("Crime Stats.csv")
str(CrimeStats)
## 'data.frame': 29711 obs. of 7 variables:
## $ INC.NUMBER : Factor w/ 29711 levels "000061951901-000",..: 61 1389 36 11104 23873 22243 42 26 6186 39 ...
## $ OCCURRED.ON : Factor w/ 19038 levels "","01/01/2016 00:00",..: 12663 12663 12663 12663 12663 12663 12663 12663 12664 12664 ...
## $ OCCURRED.TO : Factor w/ 15546 levels "","01/01/2016 00:00",..: 10387 11185 10362 723 1 7031 10371 10356 11751 1 ...
## $ UCR.CRIME.CATEGORY: Factor w/ 9 levels "AGGRAVATED ASSAULT",..: 4 5 5 6 8 5 6 6 5 5 ...
## $ X100.BLOCK.ADDR : Factor w/ 12270 levels "100XX N 11TH AVE",..: 8881 1112 8575 11994 859 5515 9958 79 5858 11061 ...
## $ ZIP : int 85031 85014 85014 85029 85006 85017 85043 85307 85018 85051 ...
## $ PREMISE.TYPE : Factor w/ 73 levels "","07A STOREROOM/SHED (COMMERCIAL)",..: 6 52 6 63 63 52 63 63 36 38 ...
summary(CrimeStats)
## INC.NUMBER OCCURRED.ON
## 000061951901-000: 1 : 109
## 201500001958633 : 1 04/16/2016 17:00: 13
## 201500001971825 : 1 11/01/2015 00:01: 13
## 201500001978297 : 1 01/28/2016 17:00: 12
## 201500001979643 : 1 11/06/2015 12:00: 12
## 201500001981209 : 1 12/01/2015 12:00: 12
## (Other) :29705 (Other) :29540
## OCCURRED.TO UCR.CRIME.CATEGORY
## : 8120 LARCENY-THEFT :15085
## 01/25/2016 08:00: 11 BURGLARY : 5670
## 03/29/2016 08:00: 11 MOTOR VEHICLE THEFT: 3349
## 01/20/2016 08:00: 10 DRUG OFFENSE : 2122
## 02/23/2016 12:00: 9 AGGRAVATED ASSAULT : 1781
## 03/14/2016 07:00: 9 ROBBERY : 1182
## (Other) :21541 (Other) : 522
## X100.BLOCK.ADDR ZIP
## 18XX W BELL RD : 237 Min. :85003
## 61XX N 35TH AVE : 205 1st Qu.:85016
## 61XX S 35TH AVE : 186 Median :85027
## 16XX W BETHANY HOME RD: 161 Mean :85044
## 45XX E CACTUS RD : 136 3rd Qu.:85041
## 57XX N 19TH AVE : 135 Max. :85395
## (Other) :28651
## PREMISE.TYPE
## SINGLE FAMILY HOUSE : 5873
## APARTMENT : 3356
## PARKING LOT : 3281
## RETAIL BUSINESS : 3168
## STREET / ROADWAY / ALLEY / SIDEWALK: 2334
## VEHICLE : 1601
## (Other) :10098
head(CrimeStats)
## INC.NUMBER OCCURRED.ON OCCURRED.TO UCR.CRIME.CATEGORY
## 1 201500002103724 11/01/2015 00:00 11/01/2015 15:21 DRUG OFFENSE
## 2 201500002168686 11/01/2015 00:00 11/11/2015 09:30 LARCENY-THEFT
## 3 201500002102327 11/01/2015 00:00 11/01/2015 09:00 LARCENY-THEFT
## 4 201600000052855 11/01/2015 00:00 01/09/2016 00:00 MOTOR VEHICLE THEFT
## 5 201600000594484 11/01/2015 00:00 RAPE
## 6 201600000527709 11/01/2015 00:00 03/22/2016 00:36 LARCENY-THEFT
## X100.BLOCK.ADDR ZIP PREMISE.TYPE
## 1 54XX W INDIAN SCHOOL RD 85031 APARTMENT
## 2 14XX E HIGHLAND AVE 85014 PARKING LOT
## 3 51XX N 15TH ST 85014 APARTMENT
## 4 N 43RD AVE & W CACTUS RD 85029 SINGLE FAMILY HOUSE
## 5 13XX E ALMERIA RD 85006 SINGLE FAMILY HOUSE
## 6 33XX W CAMELBACK RD 85017 PARKING LOT
#Getting the data into shape:
#Working with crime date columns:
#Using Lubridate to convert crime occurance dates from factors to date-time
OccurredOn <- CrimeStats$OCCURRED.ON
OccurredTo <- CrimeStats$OCCURRED.TO
OccurredOnDate <- mdy_hm(OccurredOn)
OccurredToDate <- mdy_hm(OccurredTo)
CrimeDuration <- OccurredToDate-OccurredOnDate
CrimeStats$OccurredOnDate <- OccurredOnDate
CrimeStats$OccurredToDate <- OccurredToDate
#head(CrimeStats)
#Renaming the data frame:
CrimeData <- CrimeStats
#str(CrimeData)
#removing unneeded columns:
CrimeData$OCCURRED.ON <- NULL
CrimeData$OCCURRED.TO <- NULL
CrimeData$CrimeDuration <- CrimeDuration
#Converting crime duration from seconds to days, hours, min, seconds:
Duration <- interval(ymd_hms(OccurredOnDate, tz="MST"), ymd_hms(OccurredToDate, tz="MST"))
CrimeData$Duration <- as.period(Duration)
#Extracting date and time information from starting time of the crime
CrimeData$WeekDay <- wday(OccurredOnDate, label=TRUE)
CrimeData$Month <- month(OccurredOnDate, label=TRUE)
CrimeData$Hour <- hour(OccurredOnDate)
head(CrimeData)
## INC.NUMBER UCR.CRIME.CATEGORY X100.BLOCK.ADDR ZIP
## 1 201500002103724 DRUG OFFENSE 54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686 LARCENY-THEFT 14XX E HIGHLAND AVE 85014
## 3 201500002102327 LARCENY-THEFT 51XX N 15TH ST 85014
## 4 201600000052855 MOTOR VEHICLE THEFT N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484 RAPE 13XX E ALMERIA RD 85006
## 6 201600000527709 LARCENY-THEFT 33XX W CAMELBACK RD 85017
## PREMISE.TYPE OccurredOnDate OccurredToDate CrimeDuration
## 1 APARTMENT 2015-11-01 2015-11-01 15:21:00 55260 secs
## 2 PARKING LOT 2015-11-01 2015-11-11 09:30:00 898200 secs
## 3 APARTMENT 2015-11-01 2015-11-01 09:00:00 32400 secs
## 4 SINGLE FAMILY HOUSE 2015-11-01 2016-01-09 00:00:00 5961600 secs
## 5 SINGLE FAMILY HOUSE 2015-11-01 <NA> NA secs
## 6 PARKING LOT 2015-11-01 2016-03-22 00:36:00 12270960 secs
## Duration WeekDay Month Hour
## 1 15H 21M 0S Sun Nov 0
## 2 10d 9H 30M 0S Sun Nov 0
## 3 9H 0M 0S Sun Nov 0
## 4 2m 8d 0H 0M 0S Sun Nov 0
## 5 <NA> Sun Nov 0
## 6 4m 21d 0H 36M 0S Sun Nov 0
#renaming columns
CrimeDataNames <- c("Number", "Category", "Address", "ZIP", "Premises",
"OccuranceDate", "EndDate", "Duration", "DurationSpan", "Weekday", "Month", "NA")
names(CrimeData) <- CrimeDataNames
#Convert zip code to a factor to keep it unique:
CrimeData$ZIP <- as.factor(CrimeData$ZIP)
#Change crime category lables and premises labels to lower case:
CrimeData$Category <- as.factor(tolower(as.character(CrimeData$Category)))
CrimeData$Premises <- as.factor(tolower(as.character(CrimeData$Premises)))
#renaming the category for "murder and non-negligent manslaughter" to make it fit into the bar graph
levels(CrimeData$Category)[levels(CrimeData$Category)=="murder and non-negligent manslaughter"] <- "murder"
levels(CrimeData$Category)[levels(CrimeData$Category)=="murder"] <- "murder-manslaughter"
head(CrimeData)
## Number Category Address ZIP
## 1 201500002103724 drug offense 54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686 larceny-theft 14XX E HIGHLAND AVE 85014
## 3 201500002102327 larceny-theft 51XX N 15TH ST 85014
## 4 201600000052855 motor vehicle theft N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484 rape 13XX E ALMERIA RD 85006
## 6 201600000527709 larceny-theft 33XX W CAMELBACK RD 85017
## Premises OccuranceDate EndDate Duration
## 1 apartment 2015-11-01 2015-11-01 15:21:00 55260 secs
## 2 parking lot 2015-11-01 2015-11-11 09:30:00 898200 secs
## 3 apartment 2015-11-01 2015-11-01 09:00:00 32400 secs
## 4 single family house 2015-11-01 2016-01-09 00:00:00 5961600 secs
## 5 single family house 2015-11-01 <NA> NA secs
## 6 parking lot 2015-11-01 2016-03-22 00:36:00 12270960 secs
## DurationSpan Weekday Month NA
## 1 15H 21M 0S Sun Nov 0
## 2 10d 9H 30M 0S Sun Nov 0
## 3 9H 0M 0S Sun Nov 0
## 4 2m 8d 0H 0M 0S Sun Nov 0
## 5 <NA> Sun Nov 0
## 6 4m 21d 0H 36M 0S Sun Nov 0
#Getting summary stats:
#NUMBER OF CRIME TYPE PER ZIP CODE:
#summary table:
CrimeCategoryPerZIPCode <- with(CrimeData, table(ZIP, Category))
#CrimeCategoryPerZIPCode
#Summarising in terms of proportions:
CrimeCategoryPerZipCodeProp <- prop.table(CrimeCategoryPerZIPCode, margin = 1)
#Proportions for relative comparisons of crimes in ZIP:
CrimeCatPerZipPropRounded <- round(CrimeCategoryPerZipCodeProp, digits = 3)
#Converting the summary tables to data frames to do further analysis:
CatPerZipDF <- as.data.frame(CrimeCategoryPerZIPCode)
#Writing to file:
write.csv(CatPerZipDF, file = "CategoryPerZIPDf.csv")
head(CatPerZipDF)
## ZIP Category Freq
## 1 85003 aggravated assault 17
## 2 85004 aggravated assault 19
## 3 85006 aggravated assault 61
## 4 85007 aggravated assault 44
## 5 85008 aggravated assault 93
## 6 85009 aggravated assault 88
#NUMBER OF CRIME TYPE PER PREMISES TYPE
CrimeCategoryPerPremises <- with(CrimeData, table(Premises, Category))
#Summarising in terms of proportions:
CrimeCategoryPerPremisesProp <- prop.table(CrimeCategoryPerPremises, margin = 1)
#Proportions for relative comparisons of crimes in premises:
CrimeCatPerPremPropRounded <- round(CrimeCategoryPerPremisesProp, digits = 3)
#Converting the summary tables to data frames to do further analysis:
CatPerPremDF <- as.data.frame(CrimeCategoryPerPremises)
#Writing to file:
write.csv(CatPerPremDF, file = "CategoryPerPremDf.csv")
head(CatPerPremDF)
## Premises Category Freq
## 1 aggravated assault 9
## 2 07a storeroom/shed (commercial) aggravated assault 0
## 3 07a storeroom/shed (residential) aggravated assault 0
## 4 adult only store / movie aggravated assault 1
## 5 airport aggravated assault 0
## 6 apartment aggravated assault 402
#NUMBER OF CRIME TYPE PER MONTH: NOTE we don't have data for the months of May
#through to October. Somehow I need to remove these months
CrimeCategoryPerMonth <- with(CrimeData, table(Month, Category))
head(CrimeData)
## Number Category Address ZIP
## 1 201500002103724 drug offense 54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686 larceny-theft 14XX E HIGHLAND AVE 85014
## 3 201500002102327 larceny-theft 51XX N 15TH ST 85014
## 4 201600000052855 motor vehicle theft N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484 rape 13XX E ALMERIA RD 85006
## 6 201600000527709 larceny-theft 33XX W CAMELBACK RD 85017
## Premises OccuranceDate EndDate Duration
## 1 apartment 2015-11-01 2015-11-01 15:21:00 55260 secs
## 2 parking lot 2015-11-01 2015-11-11 09:30:00 898200 secs
## 3 apartment 2015-11-01 2015-11-01 09:00:00 32400 secs
## 4 single family house 2015-11-01 2016-01-09 00:00:00 5961600 secs
## 5 single family house 2015-11-01 <NA> NA secs
## 6 parking lot 2015-11-01 2016-03-22 00:36:00 12270960 secs
## DurationSpan Weekday Month NA
## 1 15H 21M 0S Sun Nov 0
## 2 10d 9H 30M 0S Sun Nov 0
## 3 9H 0M 0S Sun Nov 0
## 4 2m 8d 0H 0M 0S Sun Nov 0
## 5 <NA> Sun Nov 0
## 6 4m 21d 0H 36M 0S Sun Nov 0
CrimeCategoryPerMonthProp <- prop.table(CrimeCategoryPerMonth, margin = 1)
CrimeCategoryPerMonthPropRounded <- round(CrimeCategoryPerMonthProp, digits = 3)
#Converting the frequency table to a data frame:
CatPerMonthDF <- as.data.frame(CrimeCategoryPerMonth)
#Writing to file:
write.csv(CatPerMonthDF, file = "CategoryPerMonthDf.csv")
head(CatPerMonthDF)
## Month Category Freq
## 1 Jan aggravated assault 289
## 2 Feb aggravated assault 280
## 3 Mar aggravated assault 353
## 4 Apr aggravated assault 267
## 5 May aggravated assault 0
## 6 Jun aggravated assault 0
#NUMBER OF CRIME TYPE PER WEEKDAY:
CrimeCategoryPerDay <- with(CrimeData, table(Weekday, Category))
CrimeCategoryPerDayProp <- prop.table(CrimeCategoryPerDay, margin = 1)
CrimeCategoryPerDayPropRounded <- round(CrimeCategoryPerDayProp, digits = 3)
#Converting the frequency table to a data frame:
CatPerDayDF <- as.data.frame(CrimeCategoryPerDay)
#Writing to file:
write.csv(CatPerDayDF, file = "CategoryPerDayDf.csv")
head(CatPerDayDF)
## Weekday Category Freq
## 1 Sun aggravated assault 297
## 2 Mon aggravated assault 246
## 3 Tues aggravated assault 251
## 4 Wed aggravated assault 221
## 5 Thurs aggravated assault 218
## 6 Fri aggravated assault 251