data source : http://data.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD
Load require libary
#load library
library(jsonlite)
library(RCurl)
## Loading required package: bitops
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
Create a function call readJsonFromURL
that will read json from url and return a dataframe
#data source
url <- "http://data.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD"
readJsonFromURL <- function(url){
raw_data <- getURL(url)
data_fromJson <- fromJSON(raw_data)
df <- data_fromJson$data #data_fromJson contains meta and data, data is extracted for our analysis.
#df_meta <- data_fromJson$meta #meta data of the dataframe.information about the data
return (df)
}
df <- readJsonFromURL(url)
summary(df)
## V1 V2
## 1 : 1 0006A909-5D07-4AFF-A282-439A725518E0: 1
## 10 : 1 0014C7BB-405D-4613-9D08-5C1CA8A85AB4: 1
## 100 : 1 00163FEE-2C69-4550-9892-88DAFF9C6A0A: 1
## 1000 : 1 0016519E-9810-4D4A-9F96-41BB1E89C5C1: 1
## 10000 : 1 001A6589-9CAC-4083-AB2F-61918B39BEAF: 1
## 10001 : 1 001B6C93-307E-4D1B-B203-96CDE754DC04: 1
## (Other):18632 (Other) :18632
## V3 V4 V5 V6
## 1 : 1 1367579714:1991 697617:18638 1367579714:1991
## 10 : 1 1367579711:1880 1367579711:1880
## 100 : 1 1367579704:1503 1367579704:1503
## 1000 : 1 1367579705:1503 1367579705:1503
## 10000 : 1 1367579706:1503 1367579706:1503
## 10001 : 1 1367579707:1503 1367579707:1503
## (Other):18632 (Other) :8755 (Other) :8755
## V7 V8 V9 V10
## 697617:18638 {\n}:18638 1257000644: 3 Forestville : 1911
## 1262006287: 3 College Park: 1536
## 1266001445: 3 Frederick : 1501
## 1250003311: 2 Bel Air : 1385
## 1250005131: 2 Rockville : 1381
## 1251005139: 2 (Other) :10194
## (Other) :18623 NA's : 730
## V11 V12 V13 V14
## 2012-07-21T00:00:00: 113 17:11 : 160 1:1665 FRIDAY :3014
## 2012-01-21T00:00:00: 111 17:12 : 149 2:2645 MONDAY :2554
## 2012-12-09T00:00:00: 98 17:08 : 136 3:3330 SATURDAY :2732
## 2012-06-12T00:00:00: 95 16:05 : 133 4:4109 SUNDAY :2373
## 2012-08-26T00:00:00: 92 16:08 : 130 5:4540 THURSDAY :2671
## 2012-02-29T00:00:00: 86 14:06 : 129 6:2349 TUESDAY :2676
## (Other) :18043 (Other):17801 WEDNESDAY:2618
## V15 V16
## IS 00095 CAPITAL BELTWAY : 1163 IS 00695 BALTO BELTWAY : 173
## IS 00495 CAPITAL BELTWAY : 874 MD 00185 CONNECTICUT AVE: 153
## IS 00695 BALTO BELTWAY : 840 MD 00100 NO NAME : 133
## US 00301 CRAIN HWY : 672 IS 00095 CAPITAL BELTWAY: 129
## IS 00095 NO NAME : 618 MD 00201 KENILWORTH AVE : 112
## IS 00095 J F K MEMORIAL HWY: 568 (Other) :17937
## (Other) :13903 NA's : 1
## V17 V18 V19 V20
## 0 :4941 E :2656 Not Applicable:18170 16 :3453
## 100 :1399 N :4493 Mount Airy : 59 12 :1659
## 0.25 :1381 S :4586 Westminster : 20 3 :1597
## 0.5 :1226 U :3974 Berlin : 19 10 :1502
## 500 :1224 W :2548 Leonardtown : 10 15 :1387
## (Other):8454 NA's: 381 (Other) : 164 (Other):9006
## NA's : 13 NA's : 196 NA's : 34
## V21 V22 V23 V24
## Prince Georges:3453 2 :7816 NO : 7071 NO :12204
## Harford :1659 1 :6864 YES :11566 YES : 6433
## Baltimore :1597 3 :2083 NA's: 1 NA's: 1
## Frederick :1502 4 : 480
## Montgomery :1387 5 : 105
## (Other) :9006 (Other): 39
## NA's : 34 NA's :1251
## V25 V26
## VEH :10675 OTHER-COLLISION:13644
## FIXED OBJ : 4299 FIXED OBJ : 2644
## OTHER-COLLISION: 2205 VEH : 1759
## ANIMAL : 846 NON-COLLISION : 559
## NON-COLLISION : 465 PED : 17
## (Other) : 147 (Other) : 14
## NA's : 1 NA's : 1
We will be removing the first 8 columns while the columns of interest are listed from column 9 to 26.which will be rename as follow :
EK","ROAD","INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NA
ME","COUNTY_CODE","COUNTY_NAME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLI
SION_WITH_1","COLLISION_WITH_2")
namesOfColumns <- c("CASE_NUMBER","BARRACK","ACC_DATE","ACC_TIME","ACC_TIME_CODE","DAY_OF_WEEK","ROAD",
"INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NAME","COUNTY_CODE",
"COUNTY_AME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLISION_WITH_1","COLLISION_WITH_2")
cleandf <- function(df,namesOfColumns){
df <- df[,-(1:8)] #remove the first 8 columns
colnames(df) <- namesOfColumns
return (df)
}
df <- cleandf(df,namesOfColumns)
summary(df)
## CASE_NUMBER BARRACK ACC_DATE
## 1257000644: 3 Forestville : 1911 2012-07-21T00:00:00: 113
## 1262006287: 3 College Park: 1536 2012-01-21T00:00:00: 111
## 1266001445: 3 Frederick : 1501 2012-12-09T00:00:00: 98
## 1250003311: 2 Bel Air : 1385 2012-06-12T00:00:00: 95
## 1250005131: 2 Rockville : 1381 2012-08-26T00:00:00: 92
## 1251005139: 2 (Other) :10194 2012-02-29T00:00:00: 86
## (Other) :18623 NA's : 730 (Other) :18043
## ACC_TIME ACC_TIME_CODE DAY_OF_WEEK
## 17:11 : 160 1:1665 FRIDAY :3014
## 17:12 : 149 2:2645 MONDAY :2554
## 17:08 : 136 3:3330 SATURDAY :2732
## 16:05 : 133 4:4109 SUNDAY :2373
## 16:08 : 130 5:4540 THURSDAY :2671
## 14:06 : 129 6:2349 TUESDAY :2676
## (Other):17801 WEDNESDAY:2618
## ROAD INTERSECT_ROAD
## IS 00095 CAPITAL BELTWAY : 1163 IS 00695 BALTO BELTWAY : 173
## IS 00495 CAPITAL BELTWAY : 874 MD 00185 CONNECTICUT AVE: 153
## IS 00695 BALTO BELTWAY : 840 MD 00100 NO NAME : 133
## US 00301 CRAIN HWY : 672 IS 00095 CAPITAL BELTWAY: 129
## IS 00095 NO NAME : 618 MD 00201 KENILWORTH AVE : 112
## IS 00095 J F K MEMORIAL HWY: 568 (Other) :17937
## (Other) :13903 NA's : 1
## DIST_FROM_INTERSECT DIST_DIRECTION CITY_NAME COUNTY_CODE
## 0 :4941 E :2656 Not Applicable:18170 16 :3453
## 100 :1399 N :4493 Mount Airy : 59 12 :1659
## 0.25 :1381 S :4586 Westminster : 20 3 :1597
## 0.5 :1226 U :3974 Berlin : 19 10 :1502
## 500 :1224 W :2548 Leonardtown : 10 15 :1387
## (Other):8454 NA's: 381 (Other) : 164 (Other):9006
## NA's : 13 NA's : 196 NA's : 34
## COUNTY_AME VEHICLE_COUNT PROP_DEST INJURY
## Prince Georges:3453 2 :7816 NO : 7071 NO :12204
## Harford :1659 1 :6864 YES :11566 YES : 6433
## Baltimore :1597 3 :2083 NA's: 1 NA's: 1
## Frederick :1502 4 : 480
## Montgomery :1387 5 : 105
## (Other) :9006 (Other): 39
## NA's : 34 NA's :1251
## COLLISION_WITH_1 COLLISION_WITH_2
## VEH :10675 OTHER-COLLISION:13644
## FIXED OBJ : 4299 FIXED OBJ : 2644
## OTHER-COLLISION: 2205 VEH : 1759
## ANIMAL : 846 NON-COLLISION : 559
## NON-COLLISION : 465 PED : 17
## (Other) : 147 (Other) : 14
## NA's : 1 NA's : 1
This will make the dataframe to be easy to work with.
#function to remove NAs
remove_na <- function(df,n=0){
df <- df[rowSums(is.na(df)) <= n,]
df <- as.data.frame.matrix(df)
return (df)
}
df <- remove_na(df)
str(df)
## 'data.frame': 16263 obs. of 18 variables:
## $ CASE_NUMBER : Factor w/ 16237 levels "1056008704","1150007982",..: 16225 15849 15195 14840 13224 13223 11777 11775 11774 10682 ...
## $ BARRACK : Factor w/ 22 levels "Bel Air","Berlin",..: 19 2 17 14 7 7 4 4 4 10 ...
## $ ACC_DATE : Factor w/ 366 levels "2012-01-01T00:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ACC_TIME : Factor w/ 288 levels "0:01","0:02",..: 145 121 253 1 13 13 241 73 229 109 ...
## $ ACC_TIME_CODE : Factor w/ 6 levels "1","2","3","4",..: 1 5 2 1 1 1 2 4 2 5 ...
## $ DAY_OF_WEEK : Factor w/ 7 levels "FRIDAY ","MONDAY ",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ ROAD : Factor w/ 2048 levels " NE Plaza parking lot",..: 740 999 1475 1510 749 626 675 1859 661 1685 ...
## $ INTERSECT_ROAD : Factor w/ 5885 levels " NONE"," RP FR IS 97 TO 176",..: 2534 930 872 3057 2476 2563 87 3229 3000 2961 ...
## $ DIST_FROM_INTERSECT: Factor w/ 133 levels "0","0.0050000000000000001",..: 1 21 46 45 46 43 21 21 21 46 ...
## $ DIST_DIRECTION : Factor w/ 5 levels "E","N","S","U",..: 4 5 3 1 3 3 3 1 2 5 ...
## $ CITY_NAME : Factor w/ 59 levels "Aberdeen","Accident",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ COUNTY_CODE : Factor w/ 24 levels "0","1","10","11",..: 8 17 19 11 18 18 9 9 9 13 ...
## $ COUNTY_AME : Factor w/ 24 levels "Allegany","Anne Arundel",..: 15 24 4 20 3 3 17 17 17 2 ...
## $ VEHICLE_COUNT : Factor w/ 10 levels "1","10","2","3",..: 3 1 1 1 3 1 3 1 3 1 ...
## $ PROP_DEST : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 1 2 ...
## $ INJURY : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
## $ COLLISION_WITH_1 : Factor w/ 7 levels "ANIMAL","BICYCLE",..: 7 3 3 3 7 3 7 3 7 3 ...
## $ COLLISION_WITH_2 : Factor w/ 7 levels "ANIMAL","BICYCLE",..: 5 5 3 5 5 5 5 5 5 5 ...
Change the datatype of column ACC_DATE ,ACC_TIME, VEHICLE_COUNT to the proper data type.
#ibrary(chron)
df$ACC_DATE <- as.Date(df$ACC_DATE)
df$ACC_TIME <- as.numeric(df$ACC_TIME)
df$VEHICLE_COUNT <- as.integer(df$VEHICLE_COUNT)
summary(df)
## CASE_NUMBER BARRACK ACC_DATE
## 1257000644: 3 Forestville :1875 Min. :2012-01-01
## 1266001445: 3 College Park:1455 1st Qu.:2012-04-11
## 1250005131: 2 Bel Air :1305 Median :2012-07-08
## 1251031222: 2 Rockville :1291 Mean :2012-07-07
## 1254004177: 2 Frederick :1224 3rd Qu.:2012-10-07
## 1257001053: 2 Glen Burnie :1099 Max. :2012-12-31
## (Other) :16249 (Other) :8014
## ACC_TIME ACC_TIME_CODE DAY_OF_WEEK
## Min. : 1.0 1:1467 FRIDAY :2630
## 1st Qu.: 78.0 2:2293 MONDAY :2207
## Median :122.0 3:2897 SATURDAY :2377
## Mean :140.9 4:3598 SUNDAY :2061
## 3rd Qu.:211.0 5:3966 THURSDAY :2356
## Max. :288.0 6:2042 TUESDAY :2331
## WEDNESDAY:2301
## ROAD INTERSECT_ROAD
## IS 00095 CAPITAL BELTWAY : 1130 IS 00695 BALTO BELTWAY : 157
## IS 00495 CAPITAL BELTWAY : 828 MD 00185 CONNECTICUT AVE : 144
## IS 00695 BALTO BELTWAY : 680 IS 00095 CAPITAL BELTWAY : 122
## US 00301 CRAIN HWY : 667 MD 00100 NO NAME : 122
## IS 00095 NO NAME : 573 MD 00201 KENILWORTH AVE : 109
## IS 00095 J F K MEMORIAL HWY: 483 MD 00004 PENNSYLVANIA AVE: 102
## (Other) :11902 (Other) :15507
## DIST_FROM_INTERSECT DIST_DIRECTION CITY_NAME COUNTY_CODE
## 0 :4048 E:2334 Not Applicable:16074 16 :3328
## 100 :1249 N:4072 Berlin : 19 12 :1525
## 0.25 :1187 S:4165 Westminster : 17 15 :1294
## 500 :1088 U:3468 Mount Airy : 11 3 :1281
## 0.5 :1058 W:2224 Leonardtown : 10 10 :1224
## 0.1 : 932 Grantsville : 8 2 :1081
## (Other):6701 (Other) : 124 (Other):6530
## COUNTY_AME VEHICLE_COUNT PROP_DEST INJURY
## Prince Georges:3328 Min. : 1.000 NO : 6192 NO :10624
## Harford :1525 1st Qu.: 1.000 YES:10071 YES: 5639
## Montgomery :1294 Median : 3.000
## Baltimore :1281 Mean : 2.439
## Frederick :1224 3rd Qu.: 3.000
## Anne Arundel :1081 Max. :10.000
## (Other) :6530
## COLLISION_WITH_1 COLLISION_WITH_2
## ANIMAL : 708 ANIMAL : 8
## BICYCLE : 28 BICYCLE : 2
## FIXED OBJ :3710 FIXED OBJ : 2258
## NON-COLLISION : 405 NON-COLLISION : 471
## OTHER-COLLISION:1868 OTHER-COLLISION:11911
## PED : 95 PED : 16
## VEH :9449 VEH : 1597
#remove extral space in Day_OF_WEEK columns
df$DAY_OF_WEEK <- gsub(" ","",df$DAY_OF_WEEK)
#how many accident happend on sunday
sunday_acct <- sqldf(" select count(DAY_OF_WEEK), DAY_OF_WEEK from df where DAY_OF_WEEK = 'SUNDAY' ")
sunday_acct
## count(DAY_OF_WEEK) DAY_OF_WEEK
## 1 2061 SUNDAY
#How many accidents had injuries
sqldf("select count(*) as numOfAccidentWithInjuries from df where INJURY='YES' ")
## numOfAccidentWithInjuries
## 1 5639
There 5639 accident with injuries
# List the injuries by day
injurybyday <- sqldf(" select DAY_OF_WEEK, count(DAY_OF_WEEK) as numofaccident from df where INJURY='YES' group by DAY_OF_WEEK")
injurybyday
## DAY_OF_WEEK numofaccident
## 1 FRIDAY 915
## 2 MONDAY 795
## 3 SATURDAY 827
## 4 SUNDAY 705
## 5 THURSDAY 864
## 6 TUESDAY 748
## 7 WEDNESDAY 785
Using tapply to answer the following question
#How many accidents happen on Sunday
tapply(df$CASE_NUMBER, df$DAY_OF_WEEK=='SUNDAY', length)
## FALSE TRUE
## 14202 2061
#How many accidents had injuries (might need to remove NAs from the data)
tapply(df$CASE_NUMBER, df$INJURY == 'YES', length)
## FALSE TRUE
## 10624 5639
#List the injuries by day
temp <- df[which(df$INJURY == 'YES'),]
tapply(temp$CASE_NUMBER, temp$DAY_OF_WEEK, length)
## FRIDAY MONDAY SATURDAY SUNDAY THURSDAY TUESDAY WEDNESDAY
## 915 795 827 705 864 748 785