This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Read the raw training data.
# stringsAsFactors is set explicitly: R >= 4.0 defaults it to FALSE, which would
# load the text columns as character instead of the factors that the summary(),
# str() and unique() output in this document relies on.
trainraw <- read.csv("Data/train.csv", stringsAsFactors = TRUE)

# Load packages, installing any that are missing.
# The same require/install/library steps apply to every package, so they are
# run in one loop over the package names instead of being repeated per package.
for (pkg in c("plyr", "ggplot2", "scales")) {
    # require() returns FALSE (rather than erroring) when the package cannot be
    # loaded, which is what allows the install fallback below.
    Installed <- require(pkg, character.only = TRUE)
    if (!Installed) {
        install.packages(pkg)
        library(pkg, character.only = TRUE)
    }
}
## Loading required package: plyr
## Loading required package: ggplot2
## Loading required package: scales

Exploratory Data Analysis

# Per-column summary of the raw data (level counts for factors, quantiles for X/Y).
# NOTE(review): the output below reports Max. X = -120.5 and Max. Y = 90.00, which
# look like placeholder coordinates rather than real locations; confirm before
# relying on X/Y.
summary(trainraw)
##                  Dates                  Category     
##  2011-01-01 00:01:00:   185   LARCENY/THEFT :174900  
##  2006-01-01 00:01:00:   136   OTHER OFFENSES:126182  
##  2012-01-01 00:01:00:    94   NON-CRIMINAL  : 92304  
##  2006-01-01 12:00:00:    63   ASSAULT       : 76876  
##  2007-06-01 00:01:00:    61   DRUG/NARCOTIC : 53971  
##  2006-06-01 00:01:00:    58   VEHICLE THEFT : 53781  
##  (Other)            :877452   (Other)       :300035  
##                                   Descript          DayOfWeek     
##  GRAND THEFT FROM LOCKED AUTO         : 60022   Friday   :133734  
##  LOST PROPERTY                        : 31729   Monday   :121584  
##  BATTERY                              : 27441   Saturday :126810  
##  STOLEN AUTOMOBILE                    : 26897   Sunday   :116707  
##  DRIVERS LICENSE, SUSPENDED OR REVOKED: 26839   Thursday :125038  
##  WARRANT ARREST                       : 23754   Tuesday  :124965  
##  (Other)                              :681367   Wednesday:129211  
##       PdDistrict                 Resolution    
##  SOUTHERN  :157182   NONE             :526790  
##  MISSION   :119908   ARREST, BOOKED   :206403  
##  NORTHERN  :105296   ARREST, CITED    : 77004  
##  BAYVIEW   : 89431   LOCATED          : 17101  
##  CENTRAL   : 85460   PSYCHOPATHIC CASE: 14534  
##  TENDERLOIN: 81809   UNFOUNDED        :  9585  
##  (Other)   :238963   (Other)          : 26632  
##                      Address             X                Y        
##  800 Block of BRYANT ST  : 26533   Min.   :-122.5   Min.   :37.71  
##  800 Block of MARKET ST  :  6581   1st Qu.:-122.4   1st Qu.:37.75  
##  2000 Block of MISSION ST:  5097   Median :-122.4   Median :37.78  
##  1000 Block of POTRERO AV:  4063   Mean   :-122.4   Mean   :37.77  
##  900 Block of MARKET ST  :  3251   3rd Qu.:-122.4   3rd Qu.:37.78  
##  0 Block of TURK ST      :  3228   Max.   :-120.5   Max.   :90.00  
##  (Other)                 :829296
# Inspect the structure of the data frame. In the output below every column
# except X and Y is a factor, including Dates, which is converted further down.
str(trainraw)
## 'data.frame':    878049 obs. of  9 variables:
##  $ Dates     : Factor w/ 389257 levels "2003-01-06 00:01:00",..: 389257 389257 389256 389255 389255 389255 389255 389255 389254 389254 ...
##  $ Category  : Factor w/ 39 levels "ARSON","ASSAULT",..: 38 22 22 17 17 17 37 37 17 17 ...
##  $ Descript  : Factor w/ 879 levels "ABANDONMENT OF CHILD",..: 867 811 811 405 405 407 740 740 405 405 ...
##  $ DayOfWeek : Factor w/ 7 levels "Friday","Monday",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ PdDistrict: Factor w/ 10 levels "BAYVIEW","CENTRAL",..: 5 5 5 5 6 3 3 1 7 2 ...
##  $ Resolution: Factor w/ 17 levels "ARREST, BOOKED",..: 1 1 1 12 12 12 12 12 12 12 ...
##  $ Address   : Factor w/ 23228 levels "0 Block of  HARRISON ST",..: 19791 19791 22698 4267 1844 1506 13323 18055 11385 17659 ...
##  $ X         : num  -122 -122 -122 -122 -122 ...
##  $ Y         : num  37.8 37.8 37.8 37.8 37.8 ...
# List the distinct crime categories in order of first appearance
# (39 levels in the output below).
unique(trainraw$Category)
##  [1] WARRANTS                    OTHER OFFENSES             
##  [3] LARCENY/THEFT               VEHICLE THEFT              
##  [5] VANDALISM                   NON-CRIMINAL               
##  [7] ROBBERY                     ASSAULT                    
##  [9] WEAPON LAWS                 BURGLARY                   
## [11] SUSPICIOUS OCC              DRUNKENNESS                
## [13] FORGERY/COUNTERFEITING      DRUG/NARCOTIC              
## [15] STOLEN PROPERTY             SECONDARY CODES            
## [17] TRESPASS                    MISSING PERSON             
## [19] FRAUD                       KIDNAPPING                 
## [21] RUNAWAY                     DRIVING UNDER THE INFLUENCE
## [23] SEX OFFENSES FORCIBLE       PROSTITUTION               
## [25] DISORDERLY CONDUCT          ARSON                      
## [27] FAMILY OFFENSES             LIQUOR LAWS                
## [29] BRIBERY                     EMBEZZLEMENT               
## [31] SUICIDE                     LOITERING                  
## [33] SEX OFFENSES NON FORCIBLE   EXTORTION                  
## [35] GAMBLING                    BAD CHECKS                 
## [37] TREA                        RECOVERED VEHICLE          
## [39] PORNOGRAPHY/OBSCENE MAT    
## 39 Levels: ARSON ASSAULT BAD CHECKS BRIBERY ... WEAPON LAWS

Clean Data

# Turn the Dates column into Date values; only the calendar-date part of each
# timestamp is kept, the time of day is dropped.
trainraw[["Dates"]] <- as.Date(as.character(trainraw[["Dates"]]))
# Tally how many incidents of each Category occurred on each date; the result
# has one row per (Dates, Category) pair with the tally in a `freq` column.
freqByCategory <- plyr::count(trainraw, vars = c("Dates", "Category"))

# Create data frame with only the daily counts for the ARSON category.
# Columns are selected by name (previously by position 1 and 3) so the subset
# does not depend on the column order of freqByCategory.
dataByArson <- freqByCategory[
  freqByCategory$Category == "ARSON",
  c("Dates", "freq")
]
# Floor each date to the first day of its month / year so that observations
# can be grouped by those periods.
dataByArson$Month <- as.Date(cut(dataByArson$Dates, breaks = "month"))
dataByArson$Year <- as.Date(cut(dataByArson$Dates, breaks = "year"))
# Plot by month
ggplot(data = dataByArson, aes(Month, freq)) +
  stat_summary(
    # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0)
    fun = sum, # adds up all observations for the month
    geom = "line" # or "point"
  ) +
  scale_x_date(labels = date_format("%m/%Y")) # custom x-axis labels