Load data

data source : http://data.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD

Load require libary

#load library
library(jsonlite)
library(RCurl)
## Loading required package: bitops
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite

Create a function call readJsonFromURL that will read json from url and return a dataframe

#data source
url <- "http://data.maryland.gov/api/views/pdvh-tf2u/rows.json?accessType=DOWNLOAD"

readJsonFromURL <- function(url){
  raw_data <- getURL(url)
  data_fromJson <- fromJSON(raw_data)
  df <- data_fromJson$data #data_fromJson contains meta and data, data is extracted for our analysis.
  #df_meta <- data_fromJson$meta #meta data of the dataframe.information about the data
  return (df)
}

df <- readJsonFromURL(url)
summary(df)
##        V1                                           V2       
##  1      :    1   0006A909-5D07-4AFF-A282-439A725518E0:    1  
##  10     :    1   0014C7BB-405D-4613-9D08-5C1CA8A85AB4:    1  
##  100    :    1   00163FEE-2C69-4550-9892-88DAFF9C6A0A:    1  
##  1000   :    1   0016519E-9810-4D4A-9F96-41BB1E89C5C1:    1  
##  10000  :    1   001A6589-9CAC-4083-AB2F-61918B39BEAF:    1  
##  10001  :    1   001B6C93-307E-4D1B-B203-96CDE754DC04:    1  
##  (Other):18632   (Other)                             :18632  
##        V3                 V4            V5                 V6      
##  1      :    1   1367579714:1991   697617:18638   1367579714:1991  
##  10     :    1   1367579711:1880                  1367579711:1880  
##  100    :    1   1367579704:1503                  1367579704:1503  
##  1000   :    1   1367579705:1503                  1367579705:1503  
##  10000  :    1   1367579706:1503                  1367579706:1503  
##  10001  :    1   1367579707:1503                  1367579707:1503  
##  (Other):18632   (Other)   :8755                  (Other)   :8755  
##       V7          V8                  V9                  V10       
##  697617:18638   {\n}:18638   1257000644:    3   Forestville : 1911  
##                              1262006287:    3   College Park: 1536  
##                              1266001445:    3   Frederick   : 1501  
##                              1250003311:    2   Bel Air     : 1385  
##                              1250005131:    2   Rockville   : 1381  
##                              1251005139:    2   (Other)     :10194  
##                              (Other)   :18623   NA's        :  730  
##                   V11             V12        V13             V14      
##  2012-07-21T00:00:00:  113   17:11  :  160   1:1665   FRIDAY   :3014  
##  2012-01-21T00:00:00:  111   17:12  :  149   2:2645   MONDAY   :2554  
##  2012-12-09T00:00:00:   98   17:08  :  136   3:3330   SATURDAY :2732  
##  2012-06-12T00:00:00:   95   16:05  :  133   4:4109   SUNDAY   :2373  
##  2012-08-26T00:00:00:   92   16:08  :  130   5:4540   THURSDAY :2671  
##  2012-02-29T00:00:00:   86   14:06  :  129   6:2349   TUESDAY  :2676  
##  (Other)            :18043   (Other):17801            WEDNESDAY:2618  
##                           V15                              V16       
##  IS 00095 CAPITAL BELTWAY   : 1163   IS 00695 BALTO BELTWAY  :  173  
##  IS 00495 CAPITAL BELTWAY   :  874   MD 00185 CONNECTICUT AVE:  153  
##  IS 00695 BALTO BELTWAY     :  840   MD 00100 NO NAME        :  133  
##  US 00301 CRAIN HWY         :  672   IS 00095 CAPITAL BELTWAY:  129  
##  IS 00095 NO NAME           :  618   MD 00201 KENILWORTH AVE :  112  
##  IS 00095 J F K MEMORIAL HWY:  568   (Other)                 :17937  
##  (Other)                    :13903   NA's                    :    1  
##       V17         V18                   V19             V20      
##  0      :4941   E   :2656   Not Applicable:18170   16     :3453  
##  100    :1399   N   :4493   Mount Airy    :   59   12     :1659  
##  0.25   :1381   S   :4586   Westminster   :   20   3      :1597  
##  0.5    :1226   U   :3974   Berlin        :   19   10     :1502  
##  500    :1224   W   :2548   Leonardtown   :   10   15     :1387  
##  (Other):8454   NA's: 381   (Other)       :  164   (Other):9006  
##  NA's   :  13               NA's          :  196   NA's   :  34  
##              V21            V22         V23          V24       
##  Prince Georges:3453   2      :7816   NO  : 7071   NO  :12204  
##  Harford       :1659   1      :6864   YES :11566   YES : 6433  
##  Baltimore     :1597   3      :2083   NA's:    1   NA's:    1  
##  Frederick     :1502   4      : 480                            
##  Montgomery    :1387   5      : 105                            
##  (Other)       :9006   (Other):  39                            
##  NA's          :  34   NA's   :1251                            
##               V25                     V26       
##  VEH            :10675   OTHER-COLLISION:13644  
##  FIXED OBJ      : 4299   FIXED OBJ      : 2644  
##  OTHER-COLLISION: 2205   VEH            : 1759  
##  ANIMAL         :  846   NON-COLLISION  :  559  
##  NON-COLLISION  :  465   PED            :   17  
##  (Other)        :  147   (Other)        :   14  
##  NA's           :    1   NA's           :    1

Clean the data

We will be removing the first 8 columns while the columns of interest are listed from column 9 to 26.which will be rename as follow :

EK","ROAD","INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NA
ME","COUNTY_CODE","COUNTY_NAME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLI
SION_WITH_1","COLLISION_WITH_2")
namesOfColumns <- c("CASE_NUMBER","BARRACK","ACC_DATE","ACC_TIME","ACC_TIME_CODE","DAY_OF_WEEK","ROAD",
                    "INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NAME","COUNTY_CODE",
                    "COUNTY_AME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLISION_WITH_1","COLLISION_WITH_2")


cleandf <- function(df,namesOfColumns){
  df <- df[,-(1:8)] #remove the first 8 columns
  colnames(df) <- namesOfColumns
  return (df)
}

df <- cleandf(df,namesOfColumns)
summary(df)
##      CASE_NUMBER            BARRACK                     ACC_DATE    
##  1257000644:    3   Forestville : 1911   2012-07-21T00:00:00:  113  
##  1262006287:    3   College Park: 1536   2012-01-21T00:00:00:  111  
##  1266001445:    3   Frederick   : 1501   2012-12-09T00:00:00:   98  
##  1250003311:    2   Bel Air     : 1385   2012-06-12T00:00:00:   95  
##  1250005131:    2   Rockville   : 1381   2012-08-26T00:00:00:   92  
##  1251005139:    2   (Other)     :10194   2012-02-29T00:00:00:   86  
##  (Other)   :18623   NA's        :  730   (Other)            :18043  
##     ACC_TIME     ACC_TIME_CODE    DAY_OF_WEEK  
##  17:11  :  160   1:1665        FRIDAY   :3014  
##  17:12  :  149   2:2645        MONDAY   :2554  
##  17:08  :  136   3:3330        SATURDAY :2732  
##  16:05  :  133   4:4109        SUNDAY   :2373  
##  16:08  :  130   5:4540        THURSDAY :2671  
##  14:06  :  129   6:2349        TUESDAY  :2676  
##  (Other):17801                 WEDNESDAY:2618  
##                           ROAD                        INTERSECT_ROAD 
##  IS 00095 CAPITAL BELTWAY   : 1163   IS 00695 BALTO BELTWAY  :  173  
##  IS 00495 CAPITAL BELTWAY   :  874   MD 00185 CONNECTICUT AVE:  153  
##  IS 00695 BALTO BELTWAY     :  840   MD 00100 NO NAME        :  133  
##  US 00301 CRAIN HWY         :  672   IS 00095 CAPITAL BELTWAY:  129  
##  IS 00095 NO NAME           :  618   MD 00201 KENILWORTH AVE :  112  
##  IS 00095 J F K MEMORIAL HWY:  568   (Other)                 :17937  
##  (Other)                    :13903   NA's                    :    1  
##  DIST_FROM_INTERSECT DIST_DIRECTION          CITY_NAME      COUNTY_CODE  
##  0      :4941        E   :2656      Not Applicable:18170   16     :3453  
##  100    :1399        N   :4493      Mount Airy    :   59   12     :1659  
##  0.25   :1381        S   :4586      Westminster   :   20   3      :1597  
##  0.5    :1226        U   :3974      Berlin        :   19   10     :1502  
##  500    :1224        W   :2548      Leonardtown   :   10   15     :1387  
##  (Other):8454        NA's: 381      (Other)       :  164   (Other):9006  
##  NA's   :  13                       NA's          :  196   NA's   :  34  
##           COUNTY_AME   VEHICLE_COUNT  PROP_DEST     INJURY     
##  Prince Georges:3453   2      :7816   NO  : 7071   NO  :12204  
##  Harford       :1659   1      :6864   YES :11566   YES : 6433  
##  Baltimore     :1597   3      :2083   NA's:    1   NA's:    1  
##  Frederick     :1502   4      : 480                            
##  Montgomery    :1387   5      : 105                            
##  (Other)       :9006   (Other):  39                            
##  NA's          :  34   NA's   :1251                            
##         COLLISION_WITH_1        COLLISION_WITH_2
##  VEH            :10675   OTHER-COLLISION:13644  
##  FIXED OBJ      : 4299   FIXED OBJ      : 2644  
##  OTHER-COLLISION: 2205   VEH            : 1759  
##  ANIMAL         :  846   NON-COLLISION  :  559  
##  NON-COLLISION  :  465   PED            :   17  
##  (Other)        :  147   (Other)        :   14  
##  NA's           :    1   NA's           :    1

Remove NAs from the dataframe

This will make the dataframe to be easy to work with.

#function to remove NAs
remove_na <- function(df,n=0){
  df <- df[rowSums(is.na(df)) <= n,]
  df <- as.data.frame.matrix(df)
  return (df)
} 

  
df <- remove_na(df)
str(df)
## 'data.frame':    16263 obs. of  18 variables:
##  $ CASE_NUMBER        : Factor w/ 16237 levels "1056008704","1150007982",..: 16225 15849 15195 14840 13224 13223 11777 11775 11774 10682 ...
##  $ BARRACK            : Factor w/ 22 levels "Bel Air","Berlin",..: 19 2 17 14 7 7 4 4 4 10 ...
##  $ ACC_DATE           : Factor w/ 366 levels "2012-01-01T00:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ACC_TIME           : Factor w/ 288 levels "0:01","0:02",..: 145 121 253 1 13 13 241 73 229 109 ...
##  $ ACC_TIME_CODE      : Factor w/ 6 levels "1","2","3","4",..: 1 5 2 1 1 1 2 4 2 5 ...
##  $ DAY_OF_WEEK        : Factor w/ 7 levels "FRIDAY   ","MONDAY   ",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ ROAD               : Factor w/ 2048 levels "  NE Plaza parking lot",..: 740 999 1475 1510 749 626 675 1859 661 1685 ...
##  $ INTERSECT_ROAD     : Factor w/ 5885 levels "  NONE","  RP FR IS 97 TO 176",..: 2534 930 872 3057 2476 2563 87 3229 3000 2961 ...
##  $ DIST_FROM_INTERSECT: Factor w/ 133 levels "0","0.0050000000000000001",..: 1 21 46 45 46 43 21 21 21 46 ...
##  $ DIST_DIRECTION     : Factor w/ 5 levels "E","N","S","U",..: 4 5 3 1 3 3 3 1 2 5 ...
##  $ CITY_NAME          : Factor w/ 59 levels "Aberdeen","Accident",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ COUNTY_CODE        : Factor w/ 24 levels "0","1","10","11",..: 8 17 19 11 18 18 9 9 9 13 ...
##  $ COUNTY_AME         : Factor w/ 24 levels "Allegany","Anne Arundel",..: 15 24 4 20 3 3 17 17 17 2 ...
##  $ VEHICLE_COUNT      : Factor w/ 10 levels "1","10","2","3",..: 3 1 1 1 3 1 3 1 3 1 ...
##  $ PROP_DEST          : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 2 1 2 ...
##  $ INJURY             : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 2 1 1 ...
##  $ COLLISION_WITH_1   : Factor w/ 7 levels "ANIMAL","BICYCLE",..: 7 3 3 3 7 3 7 3 7 3 ...
##  $ COLLISION_WITH_2   : Factor w/ 7 levels "ANIMAL","BICYCLE",..: 5 5 3 5 5 5 5 5 5 5 ...

Change the datatype of column ACC_DATE ,ACC_TIME, VEHICLE_COUNT to the proper data type.

#ibrary(chron)

df$ACC_DATE <- as.Date(df$ACC_DATE)
df$ACC_TIME <- as.numeric(df$ACC_TIME)
df$VEHICLE_COUNT <- as.integer(df$VEHICLE_COUNT)

summary(df)
##      CASE_NUMBER            BARRACK        ACC_DATE         
##  1257000644:    3   Forestville :1875   Min.   :2012-01-01  
##  1266001445:    3   College Park:1455   1st Qu.:2012-04-11  
##  1250005131:    2   Bel Air     :1305   Median :2012-07-08  
##  1251031222:    2   Rockville   :1291   Mean   :2012-07-07  
##  1254004177:    2   Frederick   :1224   3rd Qu.:2012-10-07  
##  1257001053:    2   Glen Burnie :1099   Max.   :2012-12-31  
##  (Other)   :16249   (Other)     :8014                       
##     ACC_TIME     ACC_TIME_CODE    DAY_OF_WEEK  
##  Min.   :  1.0   1:1467        FRIDAY   :2630  
##  1st Qu.: 78.0   2:2293        MONDAY   :2207  
##  Median :122.0   3:2897        SATURDAY :2377  
##  Mean   :140.9   4:3598        SUNDAY   :2061  
##  3rd Qu.:211.0   5:3966        THURSDAY :2356  
##  Max.   :288.0   6:2042        TUESDAY  :2331  
##                                WEDNESDAY:2301  
##                           ROAD                         INTERSECT_ROAD 
##  IS 00095 CAPITAL BELTWAY   : 1130   IS 00695 BALTO BELTWAY   :  157  
##  IS 00495 CAPITAL BELTWAY   :  828   MD 00185 CONNECTICUT AVE :  144  
##  IS 00695 BALTO BELTWAY     :  680   IS 00095 CAPITAL BELTWAY :  122  
##  US 00301 CRAIN HWY         :  667   MD 00100 NO NAME         :  122  
##  IS 00095 NO NAME           :  573   MD 00201 KENILWORTH AVE  :  109  
##  IS 00095 J F K MEMORIAL HWY:  483   MD 00004 PENNSYLVANIA AVE:  102  
##  (Other)                    :11902   (Other)                  :15507  
##  DIST_FROM_INTERSECT DIST_DIRECTION          CITY_NAME      COUNTY_CODE  
##  0      :4048        E:2334         Not Applicable:16074   16     :3328  
##  100    :1249        N:4072         Berlin        :   19   12     :1525  
##  0.25   :1187        S:4165         Westminster   :   17   15     :1294  
##  500    :1088        U:3468         Mount Airy    :   11   3      :1281  
##  0.5    :1058        W:2224         Leonardtown   :   10   10     :1224  
##  0.1    : 932                       Grantsville   :    8   2      :1081  
##  (Other):6701                       (Other)       :  124   (Other):6530  
##           COUNTY_AME   VEHICLE_COUNT    PROP_DEST   INJURY     
##  Prince Georges:3328   Min.   : 1.000   NO : 6192   NO :10624  
##  Harford       :1525   1st Qu.: 1.000   YES:10071   YES: 5639  
##  Montgomery    :1294   Median : 3.000                          
##  Baltimore     :1281   Mean   : 2.439                          
##  Frederick     :1224   3rd Qu.: 3.000                          
##  Anne Arundel  :1081   Max.   :10.000                          
##  (Other)       :6530                                           
##         COLLISION_WITH_1        COLLISION_WITH_2
##  ANIMAL         : 708    ANIMAL         :    8  
##  BICYCLE        :  28    BICYCLE        :    2  
##  FIXED OBJ      :3710    FIXED OBJ      : 2258  
##  NON-COLLISION  : 405    NON-COLLISION  :  471  
##  OTHER-COLLISION:1868    OTHER-COLLISION:11911  
##  PED            :  95    PED            :   16  
##  VEH            :9449    VEH            : 1597

Understand the data using SQL (via SQLDF)

  1. How many accidents happen on SUNDAY
  2. How many accidents had injuries
  3. List the injuries by day
#remove extral space in Day_OF_WEEK columns

df$DAY_OF_WEEK <- gsub(" ","",df$DAY_OF_WEEK)
#how many accident happend on sunday
sunday_acct <- sqldf(" select count(DAY_OF_WEEK), DAY_OF_WEEK from df where DAY_OF_WEEK = 'SUNDAY' ")
sunday_acct 
##   count(DAY_OF_WEEK) DAY_OF_WEEK
## 1               2061      SUNDAY
#How many accidents had injuries
sqldf("select count(*) as numOfAccidentWithInjuries from df where INJURY='YES' ")
##   numOfAccidentWithInjuries
## 1                      5639

There 5639 accident with injuries

# List the injuries by day

injurybyday <- sqldf(" select  DAY_OF_WEEK, count(DAY_OF_WEEK) as numofaccident from df where INJURY='YES' group by DAY_OF_WEEK")

injurybyday
##   DAY_OF_WEEK numofaccident
## 1      FRIDAY           915
## 2      MONDAY           795
## 3    SATURDAY           827
## 4      SUNDAY           705
## 5    THURSDAY           864
## 6     TUESDAY           748
## 7   WEDNESDAY           785

Understand the data using tapply

Using tapply to answer the following question

  1. How many accidents happen on Sunday
  2. How many accidents had injuries (might need to remove NAs from the data)
  3. List the injuries by day
#How    many    accidents   happen  on  Sunday

tapply(df$CASE_NUMBER, df$DAY_OF_WEEK=='SUNDAY', length)
## FALSE  TRUE 
## 14202  2061
#How    many    accidents   had injuries (might need    to  remove  NAs from    the data)
tapply(df$CASE_NUMBER, df$INJURY == 'YES', length)
## FALSE  TRUE 
## 10624  5639
#List   the injuries    by  day

temp <- df[which(df$INJURY == 'YES'),]

tapply(temp$CASE_NUMBER, temp$DAY_OF_WEEK, length)
##    FRIDAY    MONDAY  SATURDAY    SUNDAY  THURSDAY   TUESDAY WEDNESDAY 
##       915       795       827       705       864       748       785