## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
## 
## Loading required package: scales
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

Import Data

data_set <- read.csv("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Projects/Project%202/crime_trends.csv",skip=3)
head(tbl_df(data_set))
## Source: local data frame [6 x 17]
## 
##      Population.group              X Violent..crime
##                (fctr)         (fctr)         (fctr)
## 1 TOTAL ALL AGENCIES:           2012      1,145,272
## 2                               2013      1,095,149
## 3                     Percent change         -4.4  
## 4        TOTAL CITIES           2012        919,218
## 5                               2013        877,594
## 6                     Percent change         -4.5  
## Variables not shown: Murder.and..nonnegligent..manslaughter (fctr),
##   Forcible..rape1 (fctr), Robbery (fctr), Aggravated..assault (fctr),
##   Property..crime (fctr), Burglary (fctr), Larceny..theft (fctr),
##   Motor..vehicle..theft (fctr), Arson (fctr), Number..of..agencies (fctr),
##   X2013..estimated..population (fctr), X.1 (lgl), X.2 (lgl), X.3 (lgl)

Data Cleansing

#Filling missing data
 name_rows<-seq(from = 1, to = nrow(data_set), by=3)
 nameless_rows1<-seq(from = 2, to = nrow(data_set), by=3)
 data_set[nameless_rows1,1]<-data_set[name_rows,1]
# give year column suitable name
 colnames(data_set)[2] <- "Year"
#extract group information from data set
 data_set <- subset(data_set,grepl("^[Group].+",data_set$Population.group))
 
#remove remaining rows with summary information
 data_set <- data_set[c(TRUE,TRUE,FALSE),]

#remove columns filled with NA and summary data
 data_set <- data_set[,1:12]
 
 head(tbl_df(data_set))
## Source: local data frame [6 x 12]
## 
##                Population.group   Year Violent..crime
##                          (fctr) (fctr)         (fctr)
## 1    GROUP I (250,000 and over)   2012        416,885
## 2    GROUP I (250,000 and over)   2013        402,988
## 3 GROUP II (100,000 to 249,999)   2013        145,408
## 4  GROUP III (50,000 to 99,999)   2012        116,180
## 5   GROUP IV (25,000 to 49,999)   2012         85,900
## 6   GROUP IV (25,000 to 49,999)   2013         80,441
## Variables not shown: Murder.and..nonnegligent..manslaughter (fctr),
##   Forcible..rape1 (fctr), Robbery (fctr), Aggravated..assault (fctr),
##   Property..crime (fctr), Burglary (fctr), Larceny..theft (fctr),
##   Motor..vehicle..theft (fctr), Arson (fctr)

Data Transformation

data_set <- data_set %>%  gather("Crime","count",3:12)
## Warning: attributes are not identical across measure variables; they will
## be dropped
# remove formating from count column
 data_set$count <- as.numeric(gsub(',','',data_set$count))
head(tbl_df(data_set))
## Source: local data frame [6 x 4]
## 
##                Population.group   Year          Crime  count
##                          (fctr) (fctr)         (fctr)  (dbl)
## 1    GROUP I (250,000 and over)   2012 Violent..crime 416885
## 2    GROUP I (250,000 and over)   2013 Violent..crime 402988
## 3 GROUP II (100,000 to 249,999)   2013 Violent..crime 145408
## 4  GROUP III (50,000 to 99,999)   2012 Violent..crime 116180
## 5   GROUP IV (25,000 to 49,999)   2012 Violent..crime  85900
## 6   GROUP IV (25,000 to 49,999)   2013 Violent..crime  80441

Analysis

i <- data_set %>% group_by(Year,Crime) %>% summarise(total = sum(count))
head(i)
## Source: local data frame [6 x 3]
## Groups: Year [1]
## 
##     Year                                  Crime   total
##   (fctr)                                 (fctr)   (dbl)
## 1   2012                         Violent..crime  687315
## 2   2012 Murder.and..nonnegligent..manslaughter    8584
## 3   2012                        Forcible..rape1   35180
## 4   2012                                Robbery  235502
## 5   2012                    Aggravated..assault  408049
## 6   2012                        Property..crime 4686552
ggplot(data = i, aes(x=Year, y= total, fill=Crime)) + geom_bar(stat = "identity") + scale_y_continuous(labels = comma)