This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Read the raw training data. Text columns are read as factors explicitly:
# the summary()/str() output in this document shows factor columns, and
# since R 4.0 read.csv() no longer converts strings to factors by default.
trainraw <- read.csv("Data/train.csv", stringsAsFactors = TRUE)
# Load packages, installing any that are not yet available.
# requireNamespace() only checks whether the package can be loaded;
# library() then attaches it and stops with an error if the package is
# still missing after the install attempt.
for (pkg in c("plyr", "ggplot2", "scales")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Per-column overview: level counts for factors, quartiles for numeric X/Y
summary(trainraw)
## Dates Category
## 2011-01-01 00:01:00: 185 LARCENY/THEFT :174900
## 2006-01-01 00:01:00: 136 OTHER OFFENSES:126182
## 2012-01-01 00:01:00: 94 NON-CRIMINAL : 92304
## 2006-01-01 12:00:00: 63 ASSAULT : 76876
## 2007-06-01 00:01:00: 61 DRUG/NARCOTIC : 53971
## 2006-06-01 00:01:00: 58 VEHICLE THEFT : 53781
## (Other) :877452 (Other) :300035
## Descript DayOfWeek
## GRAND THEFT FROM LOCKED AUTO : 60022 Friday :133734
## LOST PROPERTY : 31729 Monday :121584
## BATTERY : 27441 Saturday :126810
## STOLEN AUTOMOBILE : 26897 Sunday :116707
## DRIVERS LICENSE, SUSPENDED OR REVOKED: 26839 Thursday :125038
## WARRANT ARREST : 23754 Tuesday :124965
## (Other) :681367 Wednesday:129211
## PdDistrict Resolution
## SOUTHERN :157182 NONE :526790
## MISSION :119908 ARREST, BOOKED :206403
## NORTHERN :105296 ARREST, CITED : 77004
## BAYVIEW : 89431 LOCATED : 17101
## CENTRAL : 85460 PSYCHOPATHIC CASE: 14534
## TENDERLOIN: 81809 UNFOUNDED : 9585
## (Other) :238963 (Other) : 26632
## Address X Y
## 800 Block of BRYANT ST : 26533 Min. :-122.5 Min. :37.71
## 800 Block of MARKET ST : 6581 1st Qu.:-122.4 1st Qu.:37.75
## 2000 Block of MISSION ST: 5097 Median :-122.4 Median :37.78
## 1000 Block of POTRERO AV: 4063 Mean :-122.4 Mean :37.77
## 900 Block of MARKET ST : 3251 3rd Qu.:-122.4 3rd Qu.:37.78
## 0 Block of TURK ST : 3228 Max. :-120.5 Max. :90.00
## (Other) :829296
# Show the data frame's dimensions and the type of each column
str(trainraw)
## 'data.frame': 878049 obs. of 9 variables:
## $ Dates : Factor w/ 389257 levels "2003-01-06 00:01:00",..: 389257 389257 389256 389255 389255 389255 389255 389255 389254 389254 ...
## $ Category : Factor w/ 39 levels "ARSON","ASSAULT",..: 38 22 22 17 17 17 37 37 17 17 ...
## $ Descript : Factor w/ 879 levels "ABANDONMENT OF CHILD",..: 867 811 811 405 405 407 740 740 405 405 ...
## $ DayOfWeek : Factor w/ 7 levels "Friday","Monday",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ PdDistrict: Factor w/ 10 levels "BAYVIEW","CENTRAL",..: 5 5 5 5 6 3 3 1 7 2 ...
## $ Resolution: Factor w/ 17 levels "ARREST, BOOKED",..: 1 1 1 12 12 12 12 12 12 12 ...
## $ Address : Factor w/ 23228 levels "0 Block of HARRISON ST",..: 19791 19791 22698 4267 1844 1506 13323 18055 11385 17659 ...
## $ X : num -122 -122 -122 -122 -122 ...
## $ Y : num 37.8 37.8 37.8 37.8 37.8 ...
# List the distinct crime categories, in order of first appearance
unique(trainraw$Category)
## [1] WARRANTS OTHER OFFENSES
## [3] LARCENY/THEFT VEHICLE THEFT
## [5] VANDALISM NON-CRIMINAL
## [7] ROBBERY ASSAULT
## [9] WEAPON LAWS BURGLARY
## [11] SUSPICIOUS OCC DRUNKENNESS
## [13] FORGERY/COUNTERFEITING DRUG/NARCOTIC
## [15] STOLEN PROPERTY SECONDARY CODES
## [17] TRESPASS MISSING PERSON
## [19] FRAUD KIDNAPPING
## [21] RUNAWAY DRIVING UNDER THE INFLUENCE
## [23] SEX OFFENSES FORCIBLE PROSTITUTION
## [25] DISORDERLY CONDUCT ARSON
## [27] FAMILY OFFENSES LIQUOR LAWS
## [29] BRIBERY EMBEZZLEMENT
## [31] SUICIDE LOITERING
## [33] SEX OFFENSES NON FORCIBLE EXTORTION
## [35] GAMBLING BAD CHECKS
## [37] TREA RECOVERED VEHICLE
## [39] PORNOGRAPHY/OBSCENE MAT
## 39 Levels: ARSON ASSAULT BAD CHECKS BRIBERY ... WEAPON LAWS
# Convert Dates from its "YYYY-MM-DD HH:MM:SS" text form to Date.
# The explicit format parses only the calendar day, so the time of day is
# dropped and all incidents on the same day share one value.
trainraw$Dates <- as.Date(trainraw$Dates, format = "%Y-%m-%d")
# Count daily frequency of each crime: one row per Dates/Category
# combination, with the number of occurrences in the `freq` column.
# Namespaced so the call still reaches plyr if another attached package
# (e.g. dplyr) also exports count().
freqByCategory <- plyr::count(trainraw, c("Dates", "Category"))
# Create data frame with only data from Category of ARSON.
# Columns are selected by name rather than position so the subset does not
# depend on the column order returned by count().
dataByArson <- freqByCategory[
  freqByCategory$Category == "ARSON",
  c("Dates", "freq")
]
# Extract month and year into new variables: each date is mapped to the
# first day of the month / year it falls in.
dataByArson$Month <- as.Date(cut(dataByArson$Dates, breaks = "month"))
dataByArson$Year <- as.Date(cut(dataByArson$Dates, breaks = "year"))
# Plot by month: a line chart of ARSON counts.
# stat_summary() sums the daily `freq` values that share the same `Month`,
# so each point on the line is one month's total.
ggplot(data = dataByArson, aes(x = Month, y = freq)) +
  stat_summary(
    fun = sum, # `fun` replaces `fun.y`, which is deprecated in ggplot2 >= 3.3.0
    geom = "line"
  ) +
  scale_x_date(labels = date_format("%m/%Y")) # x-axis labels as month/year