Kaggle Challenge to classify 39 different crimes that occurred in SF in 12 years. https://www.kaggle.com/c/sf-crime

Load Required Packages

suppressWarnings( library(caret) )
suppressWarnings( library(ggplot2) )
suppressWarnings( library(lubridate) ) # to extract date, month etc
suppressWarnings( library(ggmap) ) # to get zipcode from longitude and latitude
suppressWarnings( library(dplyr) )

load data

# load data
file_location <- file.path("/Users","shruti","Desktop","WorkMiscellaneous","MachineLearning","SanFranciscoCrime/")
train_data <- read.csv(paste(file_location,"train.csv",sep=""))
test_data <- read.csv(paste(file_location,"test.csv",sep="")) 

# data exploration
dim(train_data)
## [1] 878049      9
head(train_data)
##                 Dates       Category                       Descript
## 1 2015-05-13 23:53:00       WARRANTS                 WARRANT ARREST
## 2 2015-05-13 23:53:00 OTHER OFFENSES       TRAFFIC VIOLATION ARREST
## 3 2015-05-13 23:33:00 OTHER OFFENSES       TRAFFIC VIOLATION ARREST
## 4 2015-05-13 23:30:00  LARCENY/THEFT   GRAND THEFT FROM LOCKED AUTO
## 5 2015-05-13 23:30:00  LARCENY/THEFT   GRAND THEFT FROM LOCKED AUTO
## 6 2015-05-13 23:30:00  LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO
##   DayOfWeek PdDistrict     Resolution                   Address         X
## 1 Wednesday   NORTHERN ARREST, BOOKED        OAK ST / LAGUNA ST -122.4259
## 2 Wednesday   NORTHERN ARREST, BOOKED        OAK ST / LAGUNA ST -122.4259
## 3 Wednesday   NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.4244
## 4 Wednesday   NORTHERN           NONE  1500 Block of LOMBARD ST -122.4270
## 5 Wednesday       PARK           NONE 100 Block of BRODERICK ST -122.4387
## 6 Wednesday  INGLESIDE           NONE       0 Block of TEDDY AV -122.4033
##          Y
## 1 37.77460
## 2 37.77460
## 3 37.80041
## 4 37.80087
## 5 37.77154
## 6 37.71343
summary(train_data)
##                  Dates                  Category     
##  2011-01-01 00:01:00:   185   LARCENY/THEFT :174900  
##  2006-01-01 00:01:00:   136   OTHER OFFENSES:126182  
##  2012-01-01 00:01:00:    94   NON-CRIMINAL  : 92304  
##  2006-01-01 12:00:00:    63   ASSAULT       : 76876  
##  2007-06-01 00:01:00:    61   DRUG/NARCOTIC : 53971  
##  2006-06-01 00:01:00:    58   VEHICLE THEFT : 53781  
##  (Other)            :877452   (Other)       :300035  
##                                   Descript          DayOfWeek     
##  GRAND THEFT FROM LOCKED AUTO         : 60022   Friday   :133734  
##  LOST PROPERTY                        : 31729   Monday   :121584  
##  BATTERY                              : 27441   Saturday :126810  
##  STOLEN AUTOMOBILE                    : 26897   Sunday   :116707  
##  DRIVERS LICENSE, SUSPENDED OR REVOKED: 26839   Thursday :125038  
##  WARRANT ARREST                       : 23754   Tuesday  :124965  
##  (Other)                              :681367   Wednesday:129211  
##       PdDistrict                 Resolution    
##  SOUTHERN  :157182   NONE             :526790  
##  MISSION   :119908   ARREST, BOOKED   :206403  
##  NORTHERN  :105296   ARREST, CITED    : 77004  
##  BAYVIEW   : 89431   LOCATED          : 17101  
##  CENTRAL   : 85460   PSYCHOPATHIC CASE: 14534  
##  TENDERLOIN: 81809   UNFOUNDED        :  9585  
##  (Other)   :238963   (Other)          : 26632  
##                      Address             X                Y        
##  800 Block of BRYANT ST  : 26533   Min.   :-122.5   Min.   :37.71  
##  800 Block of MARKET ST  :  6581   1st Qu.:-122.4   1st Qu.:37.75  
##  2000 Block of MISSION ST:  5097   Median :-122.4   Median :37.78  
##  1000 Block of POTRERO AV:  4063   Mean   :-122.4   Mean   :37.77  
##  900 Block of MARKET ST  :  3251   3rd Qu.:-122.4   3rd Qu.:37.78  
##  0 Block of TURK ST      :  3228   Max.   :-120.5   Max.   :90.00  
##  (Other)                 :829296
levels(train_data$Category)
##  [1] "ARSON"                       "ASSAULT"                    
##  [3] "BAD CHECKS"                  "BRIBERY"                    
##  [5] "BURGLARY"                    "DISORDERLY CONDUCT"         
##  [7] "DRIVING UNDER THE INFLUENCE" "DRUG/NARCOTIC"              
##  [9] "DRUNKENNESS"                 "EMBEZZLEMENT"               
## [11] "EXTORTION"                   "FAMILY OFFENSES"            
## [13] "FORGERY/COUNTERFEITING"      "FRAUD"                      
## [15] "GAMBLING"                    "KIDNAPPING"                 
## [17] "LARCENY/THEFT"               "LIQUOR LAWS"                
## [19] "LOITERING"                   "MISSING PERSON"             
## [21] "NON-CRIMINAL"                "OTHER OFFENSES"             
## [23] "PORNOGRAPHY/OBSCENE MAT"     "PROSTITUTION"               
## [25] "RECOVERED VEHICLE"           "ROBBERY"                    
## [27] "RUNAWAY"                     "SECONDARY CODES"            
## [29] "SEX OFFENSES FORCIBLE"       "SEX OFFENSES NON FORCIBLE"  
## [31] "STOLEN PROPERTY"             "SUICIDE"                    
## [33] "SUSPICIOUS OCC"              "TREA"                       
## [35] "TRESPASS"                    "VANDALISM"                  
## [37] "VEHICLE THEFT"               "WARRANTS"                   
## [39] "WEAPON LAWS"
table(train_data$Category)
## 
##                       ARSON                     ASSAULT 
##                        1513                       76876 
##                  BAD CHECKS                     BRIBERY 
##                         406                         289 
##                    BURGLARY          DISORDERLY CONDUCT 
##                       36755                        4320 
## DRIVING UNDER THE INFLUENCE               DRUG/NARCOTIC 
##                        2268                       53971 
##                 DRUNKENNESS                EMBEZZLEMENT 
##                        4280                        1166 
##                   EXTORTION             FAMILY OFFENSES 
##                         256                         491 
##      FORGERY/COUNTERFEITING                       FRAUD 
##                       10609                       16679 
##                    GAMBLING                  KIDNAPPING 
##                         146                        2341 
##               LARCENY/THEFT                 LIQUOR LAWS 
##                      174900                        1903 
##                   LOITERING              MISSING PERSON 
##                        1225                       25989 
##                NON-CRIMINAL              OTHER OFFENSES 
##                       92304                      126182 
##     PORNOGRAPHY/OBSCENE MAT                PROSTITUTION 
##                          22                        7484 
##           RECOVERED VEHICLE                     ROBBERY 
##                        3138                       23000 
##                     RUNAWAY             SECONDARY CODES 
##                        1946                        9985 
##       SEX OFFENSES FORCIBLE   SEX OFFENSES NON FORCIBLE 
##                        4388                         148 
##             STOLEN PROPERTY                     SUICIDE 
##                        4540                         508 
##              SUSPICIOUS OCC                        TREA 
##                       31414                           6 
##                    TRESPASS                   VANDALISM 
##                        7326                       44725 
##               VEHICLE THEFT                    WARRANTS 
##                       53781                       42214 
##                 WEAPON LAWS 
##                        8555
# missing data
sum(!complete.cases(train_data))
## [1] 0
dim(test_data)
## [1] 884262      7
head(test_data)
##   Id               Dates DayOfWeek PdDistrict                  Address
## 1  0 2015-05-10 23:59:00    Sunday    BAYVIEW  2000 Block of THOMAS AV
## 2  1 2015-05-10 23:51:00    Sunday    BAYVIEW       3RD ST / REVERE AV
## 3  2 2015-05-10 23:50:00    Sunday   NORTHERN   2000 Block of GOUGH ST
## 4  3 2015-05-10 23:45:00    Sunday  INGLESIDE 4700 Block of MISSION ST
## 5  4 2015-05-10 23:45:00    Sunday  INGLESIDE 4700 Block of MISSION ST
## 6  5 2015-05-10 23:40:00    Sunday    TARAVAL    BROAD ST / CAPITOL AV
##           X        Y
## 1 -122.3996 37.73505
## 2 -122.3915 37.73243
## 3 -122.4260 37.79221
## 4 -122.4374 37.72141
## 5 -122.4374 37.72141
## 6 -122.4590 37.71317

Data Munging

# since test data does not contain "Descript", "Resolution", not using it for training the model
# function for Feature Extraction
FeatureExtraction <- function(dataset,zip){
  year_of_crime <- as.factor(year(dataset$Dates))
  month_of_crime <- as.factor(month(dataset$Dates))
  date_of_crime <- as.factor(day(dataset$Dates))
  hour_of_crime <- as.factor(hour(dataset$Dates))
  new_dataframe <- data.frame(cbind(hour_of_crime,date_of_crime,month_of_crime,year_of_crime,droplevels(dataset[,c("DayOfWeek","PdDistrict","X","Y")])))
  
  return(new_dataframe)
}

# Feature engineering: get zip code from latitude and longitude
# install.packages("zipcode")
library(zipcode)
data(zipcode)
sf_zipcodes <- subset(zipcode,city=="San Francisco",)
sf_zipcodes$latitude <- as.numeric(format(sf_zipcodes$latitude,digits=8))
sf_zipcodes$longitude <- as.numeric(format(sf_zipcodes$longitude,digits=8))
dim(sf_zipcodes)
## [1] 73  5
head(sf_zipcodes)
##         zip          city state latitude longitude
## 41315 94101 San Francisco    CA 37.78483 -122.7278
## 41316 94102 San Francisco    CA 37.77933 -122.4192
## 41317 94103 San Francisco    CA 37.77233 -122.4109
## 41318 94104 San Francisco    CA 37.79173 -122.4019
## 41319 94105 San Francisco    CA 37.78923 -122.3957
## 41320 94106 San Francisco    CA 37.78483 -122.7278
# function to convert latitude and longitude to zipcode
closest_zipcode <- function(latitude_longitude_matrix)   
# latitude_longitude_matrix : input matrix containing latitude and longitude from training or test set
{
  # find euclidean distance between user provided latitude and longitude and 
  # all latitudes, longitudes from sf_zipcodes and select the row from latter 
  # which has minimum distance. then extract its zip code
  return(sf_zipcodes[which.min 
                     (apply (sf_zipcodes[,c("latitude","longitude")],1,function(i) 
                       {
                          dist( rbind(i,latitude_longitude_matrix) ) #euclidean distance
                        }   ) 
                      ),"zip"])
}

# Sanity Check: to validate some of the zipcodes with google api
train_data[1,]
##                 Dates Category       Descript DayOfWeek PdDistrict
## 1 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday   NORTHERN
##       Resolution            Address         X       Y
## 1 ARREST, BOOKED OAK ST / LAGUNA ST -122.4259 37.7746
closest_zipcode(c(train_data$Y[1],train_data$X[1]))
## [1] "94102"
revgeocode(c(train_data$X[1], train_data$Y[1]))
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?latlng=37.7745985956747,-122.425891675136&sensor=false
## [1] "391 Oak St, San Francisco, CA 94102, USA"
# zip_codes_training <- apply(train_data[,c("Y","X")],1,closest_zipcode)
# save(zip_codes_training,file=paste(file_location,"zip_codes_training.rda",sep=""))
load(paste(file_location,"zip_codes_training.rda",sep=""))

# zip_codes_test <- apply(test_data[,c("Y","X")],1,closest_zipcode)
# save(zip_codes_test,file=paste(file_location,"zip_codes_test.rda",sep=""))
# load(paste(file_location,"zip_codes_test.rda",sep=""))

# combining new features
training_features <- cbind( FeatureExtraction(train_data),zip=zip_codes_training, Category=train_data$Category )
#test_features <- FeatureExtraction(test_data,zip=zip_codes_test)
# save(training_features,file=paste(file_location,"training_features.rda",sep=""))
# save(test_features,file=paste(file_location,"test_features.rda",sep=""))
# load(paste(file_location,"training_features.rda",sep=""))
head(training_features)
##   hour_of_crime date_of_crime month_of_crime year_of_crime DayOfWeek
## 1            23            13              5          2015 Wednesday
## 2            23            13              5          2015 Wednesday
## 3            23            13              5          2015 Wednesday
## 4            23            13              5          2015 Wednesday
## 5            23            13              5          2015 Wednesday
## 6            23            13              5          2015 Wednesday
##   PdDistrict         X        Y   zip       Category
## 1   NORTHERN -122.4259 37.77460 94102       WARRANTS
## 2   NORTHERN -122.4259 37.77460 94102 OTHER OFFENSES
## 3   NORTHERN -122.4244 37.80041 94109 OTHER OFFENSES
## 4   NORTHERN -122.4270 37.80087 94109  LARCENY/THEFT
## 5       PARK -122.4387 37.77154 94117  LARCENY/THEFT
## 6  INGLESIDE -122.4033 37.71343 94134  LARCENY/THEFT

split data

set.seed(123)
training_index <- createDataPartition(training_features$Category,p=0.6,list=F)
training_set <- droplevels(training_features[training_index,])
test_set <- droplevels(training_features[-training_index,])
dim(training_set); dim(test_set)
## [1] 526842     10
## [1] 351207     10

exploratory data analysis

# Function to group data frame by specific columns
group_by_col <- function(dataset,...){
  dataset %>%
  group_by_(.dots = ...) %>%
  summarise(count = n())
}

# Function to plot crime count with variation in different features 
plot_by_col <- function(dataset,xcolname,ycolname){
  par(mar=c(6,4,4,2),mgp=c(3,0.5,0),cex.axis=0.8,cex.main=0.8)
  
  x_levels <- levels(dataset[[xcolname]]) 
  x_level_count <- seq_along(x_levels)
  
  boxplot(dataset[[ycolname]] ~ dataset[[xcolname]],
          col=x_level_count,
          ylim = c( min(dataset[[ycolname]]), max(dataset[[ycolname]]) ),
          xaxt="n",
          #xlab=xcolname,
          ylab=c("Number of crimes"),
          main = c(paste("Variation in crime with",xcolname))
          )
  axis(side=1, at=x_level_count, labels=x_levels, las=2)
}

# top most crimes
Category_hist <- training_set %>%
  group_by(Category) %>%
  summarise(count = n()) %>%
  transform(Category = reorder(Category,-count))

Category_hist <- arrange(Category_hist, desc(count))
head(Category_hist)
##         Category  count
## 1  LARCENY/THEFT 104940
## 2 OTHER OFFENSES  75710
## 3   NON-CRIMINAL  55383
## 4        ASSAULT  46126
## 5  DRUG/NARCOTIC  32383
## 6  VEHICLE THEFT  32269
top10_crimes <- Category_hist[1:10,1]
top5_crimes <- Category_hist[1:5,1]

# To Do: check the following
# Category_hist <- group_by_col(training_set,"Category")
# Category_hist2 <- Category_hist %>%
#   transform(Category = reorder(Category,-count))
# arrange(Category_hist, desc(count))
# top10_crimes <- Category_hist[1:10,1]
# top5_crimes <- Category_hist[1:5,1]

# Frequency/historgram of different crimes.
#ggplot(Category_hist2) +
ggplot(Category_hist) +
  geom_bar(aes(x=Category, y=count,
        color = Category, fill = Category),
        stat="identity") +
coord_flip() +
theme(legend.position="None") +
ggtitle("Number of crimes in each category") +
ylab("Number of crimes") +
xlab("Category of crime")

# Variation in crime with day of week
data_day <- group_by_col(training_set,"DayOfWeek","year_of_crime","month_of_crime")
head(data_day)
## Source: local data frame [6 x 4]
## Groups: DayOfWeek, year_of_crime [1]
## 
##   DayOfWeek year_of_crime month_of_crime count
##      (fctr)        (fctr)         (fctr) (int)
## 1    Friday          2003              1   525
## 2    Friday          2003              2   537
## 3    Friday          2003              3   552
## 4    Friday          2003              4   551
## 5    Friday          2003              5   765
## 6    Friday          2003              6   492
plot_by_col(data_day,"DayOfWeek","count")

# the graph indicates that maximum crimes occur on Friday and minimum on sunday

# Variation in crime with year
data_year <- group_by_col(training_set,"year_of_crime","month_of_crime")
#head(data_year)
plot_by_col(data_year,"year_of_crime","count")

# Variation in crime with month
data_month <- group_by_col(training_set,"month_of_crime","year_of_crime")
plot_by_col(data_month,"month_of_crime","count")

# the graph indicates that maximum crimes occur in October and least in Dec

# Variation in crime with hour of day
data_hour <- group_by_col(training_set,"hour_of_crime","year_of_crime","month_of_crime")
plot_by_col(data_hour,"hour_of_crime","count")

## Variation in crime with zipcode
data_zip <- group_by_col(training_set,"zip","year_of_crime","month_of_crime")
plot_by_col(data_zip,"zip","count")

# Function to group data frame by specific columns and Category of crime
group_category_by_col <- function(dataset,...){
  x <- dataset %>%
  subset(Category %in% top10_crimes) %>%
  #subset(Category %in% top5_crimes) %>%
  group_by_(.dots = ...) %>%
  summarise(count = n())
#   x$Category <- factor(x$Category,levels = top10_crimes)
#   return(x)
}

# Function to plot crime count in top 10 crimes with variation in different features 
plot_category_by_col <- function(dataset,xcolname,ycolname="count"){
  ggplot(data=dataset, aes_string(x=xcolname, y=ycolname, fill="Category")) +
    geom_boxplot() +
    facet_wrap(~Category,ncol = 5)+
    #facet_wrap wraps a 1d sequence of panels into 2d
    theme(legend.position="None",
        axis.text.x = element_text(angle = 90, hjust = 1)) +

    ggtitle( c(paste("Variations in crime by",xcolname)) )+
    xlab(xcolname)+
    ylab("Number of crime incidents")
}

# Group top 10 crimes by hour of crime
data_hour_category <- group_category_by_col(training_set,"hour_of_crime","Category","year_of_crime","month_of_crime")
data_hour_category$Category <- factor(data_hour_category$Category,levels = top10_crimes)
head(data_hour_category)
## Source: local data frame [6 x 5]
## Groups: hour_of_crime, Category, year_of_crime [1]
## 
##   hour_of_crime Category year_of_crime month_of_crime count
##          (fctr)   (fctr)        (fctr)         (fctr) (int)
## 1             0  ASSAULT          2003              1    12
## 2             0  ASSAULT          2003              2    25
## 3             0  ASSAULT          2003              3    19
## 4             0  ASSAULT          2003              4    12
## 5             0  ASSAULT          2003              5    13
## 6             0  ASSAULT          2003              6    17
# Variations in top 10 crimes by hour of crime
plot_category_by_col(data_hour_category,"hour_of_crime")

# Group top 10 crimes by year
data_year_category <- group_category_by_col(training_set,"year_of_crime","Category","month_of_crime")
data_year_category$Category <- factor(data_year_category$Category,levels = top10_crimes)
head(data_year_category)
## Source: local data frame [6 x 4]
## Groups: year_of_crime, Category [1]
## 
##   year_of_crime Category month_of_crime count
##          (fctr)   (fctr)         (fctr) (int)
## 1          2003  ASSAULT              1   306
## 2          2003  ASSAULT              2   303
## 3          2003  ASSAULT              3   358
## 4          2003  ASSAULT              4   316
## 5          2003  ASSAULT              5   360
## 6          2003  ASSAULT              6   316
# Variations in top 10 crimes with different features
plot_category_by_col(data_year_category,"year_of_crime")

# Group top 10 crimes by day of week
data_day_category <- group_category_by_col(training_set,"DayOfWeek","Category","year_of_crime","month_of_crime")
data_day_category$Category <- factor(data_day_category$Category,levels = top10_crimes)
head(data_day_category)
## Source: local data frame [6 x 5]
## Groups: DayOfWeek, Category, year_of_crime [1]
## 
##   DayOfWeek Category year_of_crime month_of_crime count
##      (fctr)   (fctr)        (fctr)         (fctr) (int)
## 1    Friday  ASSAULT          2003              1    47
## 2    Friday  ASSAULT          2003              2    48
## 3    Friday  ASSAULT          2003              3    50
## 4    Friday  ASSAULT          2003              4    36
## 5    Friday  ASSAULT          2003              5    66
## 6    Friday  ASSAULT          2003              6    39
# Variations in top 10 crimes by day of week
plot_category_by_col(data_day_category,"DayOfWeek")

# Variations in top 10 crimes by zip code
data_zip_category <- group_category_by_col(training_set,"zip","Category","year_of_crime","month_of_crime")
data_zip_category$Category <- factor(data_zip_category$Category,levels = top10_crimes)
plot_category_by_col(data_zip_category,"zip")

# Variations in top 3 crimes by zip code
top3_crimes <- Category_hist[1:3,1]
data_zip_top3category <- training_set %>%
  subset(Category %in% top3_crimes) %>%
  group_by(zip,Category,year_of_crime,month_of_crime) %>%
  summarise(count = n())
data_zip_top3category$Category <- factor(data_zip_top3category$Category,levels = top3_crimes)
plot_category_by_col(data_zip_top3category,"zip")

# Function to group data frame by category, then by a specific column and normalize all values for that column for each category.
normalized_group_by_col <- function(dataset,...){
     dataset %>%
     group_by_(.dots = ...) %>% 
     summarise(count = n()) %>% 
     # for each category of crime, calculate mean and standard deviation and normalize all values for that category.
     mutate(normalized_count = (count-mean(count))/sd(count)) 
}

# function to generate normalized plots to check for trends.
# NOT WORKING
normalized_plot <- function(dataset,xcolname,ycolname="normalized_count"){
  ggplot(data = subset(dataset, Category %in% top10_crimes),
         aes(x=as.numeric(xcolname), y=normalized_count, fill=normalized_count)) + 
             geom_line()+
             geom_point()+
             scale_x_continuous(breaks=seq_along(levels(xcolname)),labels=levels(xcolname))
             xlab(xcolname)+
             ylab("Normalized crime count")}
              
# pattern by month
norm_data_month <- normalized_group_by_col(training_set,"Category","month_of_crime")

ggplot(data = subset(norm_data_month, Category %in% top10_crimes),
            aes(x=as.numeric(month_of_crime), y=normalized_count,color = Category)) + 
        geom_line()+
        geom_point()+
        scale_x_continuous(breaks = 1:12, labels=c("Jan","Feb","Mar",
                                                 "Apr","May","Jun",
                                                 "Jul","Aug","Sep",
                                                 "Oct","Nov","Dec")) +
        xlab("Months")+
        ylab("Normalized crime count")

        #theme(legend.position="None")

# pattern by hour
norm_data_hour <- normalized_group_by_col(training_set,"Category","hour_of_crime")
ggplot(data = subset(norm_data_hour, Category %in% top10_crimes),
        aes(x=as.numeric(hour_of_crime), y=normalized_count,color = Category)) + 
          geom_line()+
          geom_point()+
         scale_x_continuous(breaks = 1:24, labels=c(0:23)) +
          xlab("Hour")+
          ylab("Normalized crime count")

# pattern by week
norm_data_week <- normalized_group_by_col(training_set,"Category","DayOfWeek")
ggplot(data = subset(norm_data_week, Category %in% top10_crimes),
        aes(x=as.numeric(DayOfWeek), y=normalized_count,color = Category)) + 
          geom_line()+
          geom_point()+
         scale_x_continuous(breaks = 1:7, labels=c(levels(norm_data_week$DayOfWeek))) +
          xlab("Day Of Week")+
          ylab("Normalized crime count")

# "Assault" crime seems to be high on sunday. Also certain crimes cluster and follow similar trend over days of the week

# pattern by year
norm_data_year <- normalized_group_by_col(training_set,"Category","year_of_crime")
ggplot(data = subset(norm_data_year, Category %in% top10_crimes),
        aes(x=as.numeric(year_of_crime), y=normalized_count,color = Category)) + 
          geom_line()+
          geom_point()+
         scale_x_continuous(breaks = seq_along(levels(norm_data_year$year_of_crime)), labels=c(levels(norm_data_year$year_of_crime))) +
          xlab("year")+
          ylab("Normalized crime count")

# pattern by zip
norm_data_zip <- normalized_group_by_col(training_set,"Category","zip")
ggplot(data = subset(norm_data_zip, Category %in% top10_crimes),
        aes(x=as.numeric(zip), y=normalized_count,color = Category)) + 
          geom_line()+
          geom_point()+
         scale_x_continuous(breaks = seq_along(levels(norm_data_zip$zip)), labels=c(levels(norm_data_zip$zip))) +
          xlab("zip code")+
          ylab("Normalized crime count")