Crime and Population Analysis

Data Preparation

raw<-read.csv("crime.csv", skip = 3, header = TRUE, na.strings = "")
sub<-raw[10:33, c(1:3,8)]
#fill group value
library('zoo')
sub[1]<-na.locf(sub[1]) 
#trim leading and trailing spaces; remove "," from numbers
trim <- function (x) gsub("^\\s+|\\s+$|,", "", x) 
df <- data.frame(sapply(sub, trim))
#create id/pk for each group
dflong <- cbind(rep(1:8, each=3), df) 
names(dflong) <- c("group.id","group","variable","violent.crime","property.crime")
head(dflong)
##   group.id                             group       variable violent.crime
## 1        1 1000000 and over (Group I subset)           2012        166007
## 2        1 1000000 and over (Group I subset)           2013        161252
## 3        1 1000000 and over (Group I subset) Percent change          -2.9
## 4        2 500000 to 999999 (Group I subset)           2012        138439
## 5        2 500000 to 999999 (Group I subset)           2013        134216
## 6        2 500000 to 999999 (Group I subset) Percent change          -3.1
##   property.crime
## 1         783251
## 2         760143
## 3           -3.0
## 4         747008
## 5         733283
## 6           -1.8
#conversion between long and wide format
library(tidyr)
v<-cbind(dflong[1:4] %>% 
  #unite(key2, violent.crime, murder, sep = "_") %>% 
  spread(variable, violent.crime), rep("violent.crime",8))
names(v)[3:6]<-c("Yr12", "Yr13", "change", "crime")
m<-cbind(dflong[c(1:3,5)] %>% 
  spread(variable, property.crime), rep("property.crime",8))
names(m)[3:6]<-names(v)[3:6]
dfwide<-rbind(v,m)
#convert factor to numeric for analysis
number <- function(x) {as.numeric(levels(x))[x]}
dfwide[3:4] <- data.frame(sapply(dfwide[3:4],number))
head(dfwide)
##   group.id                             group   Yr12   Yr13 change
## 1        1 1000000 and over (Group I subset) 166007 161252   -2.9
## 2        2 500000 to 999999 (Group I subset) 138439 134216   -3.1
## 3        3 250000 to 499999 (Group I subset) 112439 107520   -4.4
## 4        4       GROUP II (100000 to 249999) 154144 145408   -5.7
## 5        5        GROUP III (50000 to 99999) 116180 110033   -5.3
## 6        6         GROUP IV (25000 to 49999)  85900  80441   -6.4
##           crime
## 1 violent.crime
## 2 violent.crime
## 3 violent.crime
## 4 violent.crime
## 5 violent.crime
## 6 violent.crime

Plot

Personal Care and County Analysis

Data Wrangling

#read data and escape quotes
raw2 <- read.csv("care.csv", header = TRUE, quote = "\"", na.strings = "")
 lst <- strsplit(as.character(raw2[,9]), "[^0-9]+") 
#fill unequal length and create daily hour calculations
 indx <- lengths(lst) 
 df <- as.data.frame(do.call(rbind,lapply(lst, `length<-`, max(indx))))
 res <- as.data.frame(sapply(df[,2:3],number))
 aide <- cbind(res, res[1]/res[2])
#bind data to original dataframe
 colnames(aide) <- c("Aide_Hours_Week", "Aide_Days", "Aide_Hours_Day")
 data <- cbind(raw2[,1:8], aide)
#prepare for numeric analysis 
 library("stringr")
 num <- function(x) {as.numeric(str_replace(x,",",""))}
 data$Auth_Units <- num(data$Auth_Units)
 head(data)
##   County Auth_FromDate Auth_ToDate Auth_Units Auth_Months Auth_Weeks
## 1  Bronx    05/01/2015  10/31/2015        243           6         30
## 2  Kings    05/01/2015  10/31/2015        321           6         30
## 3  Kings    08/01/2015  10/31/2015        558           3         15
## 4  Kings    05/01/2015  07/31/2015       1116           3         15
## 5  Kings    05/01/2015  07/31/2015         92           3         15
## 6 Queens    05/01/2015  07/31/2015        216           3         15
##   Hrs_Per_Week Creation_Date Aide_Hours_Week Aide_Days Aide_Hours_Day
## 1            8    04/29/2015               9         3              3
## 2           10    05/04/2015              12         4              3
## 3           37    04/28/2015              42         7              6
## 4           74    04/28/2015              84         7             12
## 5            0    05/05/2015               1        NA             NA
## 6           14    05/06/2015              16         4              4

Plot

## Warning: Removed 1608 rows containing non-finite values (stat_boxplot).

International Trade Sample Data

Data Preparation

#read and cleanse
raw3<-read.csv("trade.csv", header = TRUE, skip = 1, na.strings = "")
dat<-raw3[rowSums(is.na(raw3))!=8, ]
dat[1]<-na.locf(dat[1])
dat[,3:8]<-data.frame(sapply(dat[3:8],num))
#calculate trade surplus
dat <- cbind(dat, dat[3]-dat[4], dat[5]-dat[6], dat[7]-dat[8])
ts <- dat[,c(1,2,9:11)]
names(ts) <- c("month","category","S13","S14","S15")
tsl <- ts %>% gather(year, surplus, S13:S15)
tsl
##       month category year surplus
## 1   January   Goods   S13  -60517
## 2   January Services  S13   18888
## 3  February   Goods   S13  -61647
## 4  February Services  S13   18893
## 5     March   Goods   S13  -56233
## 6     March Services  S13   18709
## 7     April   Goods   S13  -58039
## 8     April Services  S13   18274
## 9   January   Goods   S14  -59968
## 10  January Services  S14   20506
## 11 February   Goods   S14  -61292
## 12 February Services  S14   18456
## 13    March   Goods   S14  -63050
## 14    March Services  S14   19929
## 15    April   Goods   S14  -64321
## 16    April Services  S14   20051
## 17  January   Goods   S15  -62950
## 18  January Services  S15   19362
## 19 February   Goods   S15  -58041
## 20 February Services  S15   19503
## 21    March   Goods   S15  -71164
## 22    March Services  S15   19002
## 23    April   Goods   S15  -61591
## 24    April Services  S15   19334

Plot