##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
##
## Loading required package: scales
## Loading required package: bitops
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
Import Data
data_set <- read.csv("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Projects/Project%202/crime_trends.csv",skip=3)
head(tbl_df(data_set))
## Source: local data frame [6 x 17]
##
## Population.group X Violent..crime
## (fctr) (fctr) (fctr)
## 1 TOTAL ALL AGENCIES: 2012 1,145,272
## 2 2013 1,095,149
## 3 Percent change -4.4
## 4 TOTAL CITIES 2012 919,218
## 5 2013 877,594
## 6 Percent change -4.5
## Variables not shown: Murder.and..nonnegligent..manslaughter (fctr),
## Forcible..rape1 (fctr), Robbery (fctr), Aggravated..assault (fctr),
## Property..crime (fctr), Burglary (fctr), Larceny..theft (fctr),
## Motor..vehicle..theft (fctr), Arson (fctr), Number..of..agencies (fctr),
## X2013..estimated..population (fctr), X.1 (lgl), X.2 (lgl), X.3 (lgl)
Data Cleansing
#Filling missing data
name_rows<-seq(from = 1, to = nrow(data_set), by=3)
nameless_rows1<-seq(from = 2, to = nrow(data_set), by=3)
data_set[nameless_rows1,1]<-data_set[name_rows,1]
# give year column suitable name
colnames(data_set)[2] <- "Year"
#extract group information from data set
data_set <- subset(data_set,grepl("^[Group].+",data_set$Population.group))
#remove remaining rows with summary information
data_set <- data_set[c(TRUE,TRUE,FALSE),]
#remove columns filled with NA and summary data
data_set <- data_set[,1:12]
head(tbl_df(data_set))
## Source: local data frame [6 x 12]
##
## Population.group Year Violent..crime
## (fctr) (fctr) (fctr)
## 1 GROUP I (250,000 and over) 2012 416,885
## 2 GROUP I (250,000 and over) 2013 402,988
## 3 GROUP II (100,000 to 249,999) 2013 145,408
## 4 GROUP III (50,000 to 99,999) 2012 116,180
## 5 GROUP IV (25,000 to 49,999) 2012 85,900
## 6 GROUP IV (25,000 to 49,999) 2013 80,441
## Variables not shown: Murder.and..nonnegligent..manslaughter (fctr),
## Forcible..rape1 (fctr), Robbery (fctr), Aggravated..assault (fctr),
## Property..crime (fctr), Burglary (fctr), Larceny..theft (fctr),
## Motor..vehicle..theft (fctr), Arson (fctr)
Analysis
i <- data_set %>% group_by(Year,Crime) %>% summarise(total = sum(count))
head(i)
## Source: local data frame [6 x 3]
## Groups: Year [1]
##
## Year Crime total
## (fctr) (fctr) (dbl)
## 1 2012 Violent..crime 687315
## 2 2012 Murder.and..nonnegligent..manslaughter 8584
## 3 2012 Forcible..rape1 35180
## 4 2012 Robbery 235502
## 5 2012 Aggravated..assault 408049
## 6 2012 Property..crime 4686552
ggplot(data = i, aes(x=Year, y= total, fill=Crime)) + geom_bar(stat = "identity") + scale_y_continuous(labels = comma)
