#getwd()

#setwd("/Users/tina/Projects/R/PhoenixCityData")
#Installing packages needed:
suppressMessages(library(dplyr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(plyr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(readr, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(lubridate, warn.conflicts = FALSE, quietly=TRUE))

#Reading in the data
CrimeStats = read.csv("Crime Stats.csv")
str(CrimeStats)
## 'data.frame':    29711 obs. of  7 variables:
##  $ INC.NUMBER        : Factor w/ 29711 levels "000061951901-000",..: 61 1389 36 11104 23873 22243 42 26 6186 39 ...
##  $ OCCURRED.ON       : Factor w/ 19038 levels "","01/01/2016  00:00",..: 12663 12663 12663 12663 12663 12663 12663 12663 12664 12664 ...
##  $ OCCURRED.TO       : Factor w/ 15546 levels "","01/01/2016  00:00",..: 10387 11185 10362 723 1 7031 10371 10356 11751 1 ...
##  $ UCR.CRIME.CATEGORY: Factor w/ 9 levels "AGGRAVATED ASSAULT",..: 4 5 5 6 8 5 6 6 5 5 ...
##  $ X100.BLOCK.ADDR   : Factor w/ 12270 levels "100XX N 11TH AVE",..: 8881 1112 8575 11994 859 5515 9958 79 5858 11061 ...
##  $ ZIP               : int  85031 85014 85014 85029 85006 85017 85043 85307 85018 85051 ...
##  $ PREMISE.TYPE      : Factor w/ 73 levels "","07A STOREROOM/SHED (COMMERCIAL)",..: 6 52 6 63 63 52 63 63 36 38 ...
summary(CrimeStats)
##             INC.NUMBER               OCCURRED.ON   
##  000061951901-000:    1                    :  109  
##  201500001958633 :    1   04/16/2016  17:00:   13  
##  201500001971825 :    1   11/01/2015  00:01:   13  
##  201500001978297 :    1   01/28/2016  17:00:   12  
##  201500001979643 :    1   11/06/2015  12:00:   12  
##  201500001981209 :    1   12/01/2015  12:00:   12  
##  (Other)         :29705   (Other)          :29540  
##             OCCURRED.TO              UCR.CRIME.CATEGORY
##                   : 8120   LARCENY-THEFT      :15085   
##  01/25/2016  08:00:   11   BURGLARY           : 5670   
##  03/29/2016  08:00:   11   MOTOR VEHICLE THEFT: 3349   
##  01/20/2016  08:00:   10   DRUG OFFENSE       : 2122   
##  02/23/2016  12:00:    9   AGGRAVATED ASSAULT : 1781   
##  03/14/2016  07:00:    9   ROBBERY            : 1182   
##  (Other)          :21541   (Other)            :  522   
##                X100.BLOCK.ADDR       ZIP       
##  18XX W BELL RD        :  237   Min.   :85003  
##  61XX N 35TH AVE       :  205   1st Qu.:85016  
##  61XX S 35TH AVE       :  186   Median :85027  
##  16XX W BETHANY HOME RD:  161   Mean   :85044  
##  45XX E CACTUS RD      :  136   3rd Qu.:85041  
##  57XX N 19TH AVE       :  135   Max.   :85395  
##  (Other)               :28651                  
##                               PREMISE.TYPE  
##  SINGLE FAMILY HOUSE                : 5873  
##  APARTMENT                          : 3356  
##  PARKING LOT                        : 3281  
##  RETAIL BUSINESS                    : 3168  
##  STREET / ROADWAY / ALLEY / SIDEWALK: 2334  
##  VEHICLE                            : 1601  
##  (Other)                            :10098
head(CrimeStats)
##        INC.NUMBER       OCCURRED.ON       OCCURRED.TO  UCR.CRIME.CATEGORY
## 1 201500002103724 11/01/2015  00:00 11/01/2015  15:21        DRUG OFFENSE
## 2 201500002168686 11/01/2015  00:00 11/11/2015  09:30       LARCENY-THEFT
## 3 201500002102327 11/01/2015  00:00 11/01/2015  09:00       LARCENY-THEFT
## 4 201600000052855 11/01/2015  00:00 01/09/2016  00:00 MOTOR VEHICLE THEFT
## 5 201600000594484 11/01/2015  00:00                                  RAPE
## 6 201600000527709 11/01/2015  00:00 03/22/2016  00:36       LARCENY-THEFT
##            X100.BLOCK.ADDR   ZIP        PREMISE.TYPE
## 1  54XX W INDIAN SCHOOL RD 85031           APARTMENT
## 2      14XX E HIGHLAND AVE 85014         PARKING LOT
## 3           51XX N 15TH ST 85014           APARTMENT
## 4 N 43RD AVE & W CACTUS RD 85029 SINGLE FAMILY HOUSE
## 5        13XX E ALMERIA RD 85006 SINGLE FAMILY HOUSE
## 6      33XX W CAMELBACK RD 85017         PARKING LOT
#Getting the data into shape:

#Working with crime date columns:
#Using Lubridate to convert crime occurance dates from factors to date-time
OccurredOn <- CrimeStats$OCCURRED.ON
OccurredTo <- CrimeStats$OCCURRED.TO
OccurredOnDate <- mdy_hm(OccurredOn)
OccurredToDate <- mdy_hm(OccurredTo)
CrimeDuration <- OccurredToDate-OccurredOnDate
CrimeStats$OccurredOnDate <- OccurredOnDate
CrimeStats$OccurredToDate <- OccurredToDate
#head(CrimeStats)

#Renaming the data frame:
CrimeData <- CrimeStats
#str(CrimeData)

#removing unneeded columns:
CrimeData$OCCURRED.ON <- NULL
CrimeData$OCCURRED.TO <- NULL
CrimeData$CrimeDuration <- CrimeDuration
#Converting crime duration from seconds to days, hours, min, seconds:
Duration <- interval(ymd_hms(OccurredOnDate, tz="MST"), ymd_hms(OccurredToDate, tz="MST"))
CrimeData$Duration <- as.period(Duration)

#Extracting date and time information from starting time of the crime
CrimeData$WeekDay <- wday(OccurredOnDate, label=TRUE)
CrimeData$Month <- month(OccurredOnDate, label=TRUE)
CrimeData$Hour <- hour(OccurredOnDate)
head(CrimeData)
##        INC.NUMBER  UCR.CRIME.CATEGORY          X100.BLOCK.ADDR   ZIP
## 1 201500002103724        DRUG OFFENSE  54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686       LARCENY-THEFT      14XX E HIGHLAND AVE 85014
## 3 201500002102327       LARCENY-THEFT           51XX N 15TH ST 85014
## 4 201600000052855 MOTOR VEHICLE THEFT N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484                RAPE        13XX E ALMERIA RD 85006
## 6 201600000527709       LARCENY-THEFT      33XX W CAMELBACK RD 85017
##          PREMISE.TYPE OccurredOnDate      OccurredToDate CrimeDuration
## 1           APARTMENT     2015-11-01 2015-11-01 15:21:00    55260 secs
## 2         PARKING LOT     2015-11-01 2015-11-11 09:30:00   898200 secs
## 3           APARTMENT     2015-11-01 2015-11-01 09:00:00    32400 secs
## 4 SINGLE FAMILY HOUSE     2015-11-01 2016-01-09 00:00:00  5961600 secs
## 5 SINGLE FAMILY HOUSE     2015-11-01                <NA>       NA secs
## 6         PARKING LOT     2015-11-01 2016-03-22 00:36:00 12270960 secs
##           Duration WeekDay Month Hour
## 1       15H 21M 0S     Sun   Nov    0
## 2    10d 9H 30M 0S     Sun   Nov    0
## 3         9H 0M 0S     Sun   Nov    0
## 4   2m 8d 0H 0M 0S     Sun   Nov    0
## 5             <NA>     Sun   Nov    0
## 6 4m 21d 0H 36M 0S     Sun   Nov    0
#renaming columns
CrimeDataNames <- c("Number", "Category", "Address", "ZIP", "Premises", 
                    "OccuranceDate", "EndDate", "Duration", "DurationSpan", "Weekday", "Month", "NA")
names(CrimeData) <- CrimeDataNames

#Convert zip code to a factor to keep it unique:
CrimeData$ZIP <- as.factor(CrimeData$ZIP)

#Change crime category lables and premises labels to lower case:
CrimeData$Category <- as.factor(tolower(as.character(CrimeData$Category)))
CrimeData$Premises <- as.factor(tolower(as.character(CrimeData$Premises)))

#renaming the category for "murder and non-negligent manslaughter" to make it fit into the bar graph
levels(CrimeData$Category)[levels(CrimeData$Category)=="murder and non-negligent manslaughter"] <- "murder"
levels(CrimeData$Category)[levels(CrimeData$Category)=="murder"] <- "murder-manslaughter"
head(CrimeData)
##            Number            Category                  Address   ZIP
## 1 201500002103724        drug offense  54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686       larceny-theft      14XX E HIGHLAND AVE 85014
## 3 201500002102327       larceny-theft           51XX N 15TH ST 85014
## 4 201600000052855 motor vehicle theft N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484                rape        13XX E ALMERIA RD 85006
## 6 201600000527709       larceny-theft      33XX W CAMELBACK RD 85017
##              Premises OccuranceDate             EndDate      Duration
## 1           apartment    2015-11-01 2015-11-01 15:21:00    55260 secs
## 2         parking lot    2015-11-01 2015-11-11 09:30:00   898200 secs
## 3           apartment    2015-11-01 2015-11-01 09:00:00    32400 secs
## 4 single family house    2015-11-01 2016-01-09 00:00:00  5961600 secs
## 5 single family house    2015-11-01                <NA>       NA secs
## 6         parking lot    2015-11-01 2016-03-22 00:36:00 12270960 secs
##       DurationSpan Weekday Month NA
## 1       15H 21M 0S     Sun   Nov  0
## 2    10d 9H 30M 0S     Sun   Nov  0
## 3         9H 0M 0S     Sun   Nov  0
## 4   2m 8d 0H 0M 0S     Sun   Nov  0
## 5             <NA>     Sun   Nov  0
## 6 4m 21d 0H 36M 0S     Sun   Nov  0
#Getting summary stats:

#NUMBER OF CRIME TYPE PER ZIP CODE:
#summary table:
CrimeCategoryPerZIPCode <- with(CrimeData, table(ZIP, Category))
#CrimeCategoryPerZIPCode
#Summarising in terms of proportions:
CrimeCategoryPerZipCodeProp <- prop.table(CrimeCategoryPerZIPCode, margin = 1)
#Proportions for relative comparisons of crimes in ZIP:
CrimeCatPerZipPropRounded <- round(CrimeCategoryPerZipCodeProp, digits = 3)
#Converting the summary tables to data frames to do further analysis:
CatPerZipDF <- as.data.frame(CrimeCategoryPerZIPCode)
#Writing to file:
write.csv(CatPerZipDF, file = "CategoryPerZIPDf.csv")
head(CatPerZipDF)
##     ZIP           Category Freq
## 1 85003 aggravated assault   17
## 2 85004 aggravated assault   19
## 3 85006 aggravated assault   61
## 4 85007 aggravated assault   44
## 5 85008 aggravated assault   93
## 6 85009 aggravated assault   88
#NUMBER OF CRIME TYPE PER PREMISES TYPE
CrimeCategoryPerPremises <- with(CrimeData, table(Premises, Category))
#Summarising in terms of proportions:
CrimeCategoryPerPremisesProp <- prop.table(CrimeCategoryPerPremises, margin = 1)
#Proportions for relative comparisons of crimes in premises:
CrimeCatPerPremPropRounded <- round(CrimeCategoryPerPremisesProp, digits = 3)
#Converting the summary tables to data frames to do further analysis:
CatPerPremDF <- as.data.frame(CrimeCategoryPerPremises)
#Writing to file:
write.csv(CatPerPremDF, file = "CategoryPerPremDf.csv")
head(CatPerPremDF)
##                           Premises           Category Freq
## 1                                  aggravated assault    9
## 2  07a storeroom/shed (commercial) aggravated assault    0
## 3 07a storeroom/shed (residential) aggravated assault    0
## 4         adult only store / movie aggravated assault    1
## 5                          airport aggravated assault    0
## 6                        apartment aggravated assault  402
#NUMBER OF CRIME TYPE PER MONTH: NOTE we don't have data for the months of May 
#through to October. Somehow I need to remove these months
CrimeCategoryPerMonth <- with(CrimeData, table(Month, Category))
head(CrimeData)
##            Number            Category                  Address   ZIP
## 1 201500002103724        drug offense  54XX W INDIAN SCHOOL RD 85031
## 2 201500002168686       larceny-theft      14XX E HIGHLAND AVE 85014
## 3 201500002102327       larceny-theft           51XX N 15TH ST 85014
## 4 201600000052855 motor vehicle theft N 43RD AVE & W CACTUS RD 85029
## 5 201600000594484                rape        13XX E ALMERIA RD 85006
## 6 201600000527709       larceny-theft      33XX W CAMELBACK RD 85017
##              Premises OccuranceDate             EndDate      Duration
## 1           apartment    2015-11-01 2015-11-01 15:21:00    55260 secs
## 2         parking lot    2015-11-01 2015-11-11 09:30:00   898200 secs
## 3           apartment    2015-11-01 2015-11-01 09:00:00    32400 secs
## 4 single family house    2015-11-01 2016-01-09 00:00:00  5961600 secs
## 5 single family house    2015-11-01                <NA>       NA secs
## 6         parking lot    2015-11-01 2016-03-22 00:36:00 12270960 secs
##       DurationSpan Weekday Month NA
## 1       15H 21M 0S     Sun   Nov  0
## 2    10d 9H 30M 0S     Sun   Nov  0
## 3         9H 0M 0S     Sun   Nov  0
## 4   2m 8d 0H 0M 0S     Sun   Nov  0
## 5             <NA>     Sun   Nov  0
## 6 4m 21d 0H 36M 0S     Sun   Nov  0
CrimeCategoryPerMonthProp <- prop.table(CrimeCategoryPerMonth, margin = 1)
CrimeCategoryPerMonthPropRounded <- round(CrimeCategoryPerMonthProp, digits = 3)
#Converting the frequency table to a data frame:
CatPerMonthDF <- as.data.frame(CrimeCategoryPerMonth)
#Writing to file:
write.csv(CatPerMonthDF, file = "CategoryPerMonthDf.csv")
head(CatPerMonthDF)
##   Month           Category Freq
## 1   Jan aggravated assault  289
## 2   Feb aggravated assault  280
## 3   Mar aggravated assault  353
## 4   Apr aggravated assault  267
## 5   May aggravated assault    0
## 6   Jun aggravated assault    0
#NUMBER OF CRIME TYPE PER WEEKDAY:
CrimeCategoryPerDay <- with(CrimeData, table(Weekday, Category))
CrimeCategoryPerDayProp <- prop.table(CrimeCategoryPerDay, margin = 1)
CrimeCategoryPerDayPropRounded <- round(CrimeCategoryPerDayProp, digits = 3)
#Converting the frequency table to a data frame:
CatPerDayDF <- as.data.frame(CrimeCategoryPerDay)
#Writing to file:
write.csv(CatPerDayDF, file = "CategoryPerDayDf.csv")
head(CatPerDayDF)
##   Weekday           Category Freq
## 1     Sun aggravated assault  297
## 2     Mon aggravated assault  246
## 3    Tues aggravated assault  251
## 4     Wed aggravated assault  221
## 5   Thurs aggravated assault  218
## 6     Fri aggravated assault  251