rm(list = ls())
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Crash data (crashes.csv)
# Year - The year in which the observation was taken
# Road - The name of the road being studied
# N_Crashes - The number of crashes on the road during that year
# Volume - Average annual daily traffic (AADT) volumes on the road during that year. See this site for a formal definition. [North Carolina Department of Transportation 2012]
# Road data (roads.csv)
# Road - The name of the road being studied
# District - The administrative district responsible for the road's upkeep and maintenance
# Length - Length of the road in miles
setwd('/Users/jaehochang/Desktop/Study/R/sqldf')
crashes <- read.csv("crashes.csv")
roads <- read.csv("roads.csv")
head(crashes)
## Year Road N_Crashes Volume
## 1 1991 Interstate 65 25 40000
## 2 1992 Interstate 65 37 41000
## 3 1993 Interstate 65 45 45000
## 4 1994 Interstate 65 46 45600
## 5 1995 Interstate 65 46 49000
## 6 1996 Interstate 65 59 51000
print(roads)
## Road District Length
## 1 Interstate 65 Greenfield 262
## 2 Interstate 70 Vincennes 156
## 3 US-36 Crawfordsville 139
## 4 US-40 Greenfield 150
## 5 US-52 Crawfordsville 172
join_string <-
'select crashes.*, roads.District, roads.Length
from crashes
left join roads on crashes.Road = roads.Road'
crashes_join_roads <- sqldf(join_string)
tail(crashes_join_roads) # left join Allows NA
## Year Road N_Crashes Volume District Length
## 105 2007 Interstate 275 32 21900 <NA> NA
## 106 2008 Interstate 275 21 21850 <NA> NA
## 107 2009 Interstate 275 25 22100 <NA> NA
## 108 2010 Interstate 275 24 21500 <NA> NA
## 109 2011 Interstate 275 23 20300 <NA> NA
## 110 2012 Interstate 275 22 21200 <NA> NA
join_string2 <-
'select crashes.*, roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road'
crashes_join_roads2 <- sqldf(join_string2, stringsAsFactors = FALSE)
tail(crashes_join_roads2) # inner join allows no NA
## Year Road N_Crashes Volume District Length
## 83 2007 US-36 49 24000 Crawfordsville 139
## 84 2008 US-36 52 24500 Crawfordsville 139
## 85 2009 US-36 55 24700 Crawfordsville 139
## 86 2010 US-36 35 23000 Crawfordsville 139
## 87 2011 US-36 33 21000 Crawfordsville 139
## 88 2012 US-36 31 20500 Crawfordsville 139
join_string2 <-
"select crashes.*, roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road
where crashes.Road = 'US-40'"
crashes_join_roads4 <- sqldf(join_string2,stringsAsFactors = FALSE)
head(crashes_join_roads4)
## Year Road N_Crashes Volume District Length
## 1 1991 US-40 46 21000 Greenfield 150
## 2 1992 US-40 101 21500 Greenfield 150
## 3 1993 US-40 76 23000 Greenfield 150
## 4 1994 US-40 72 21000 Greenfield 150
## 5 1995 US-40 75 24000 Greenfield 150
## 6 1996 US-40 136 23500 Greenfield 150
group_string <-
"select crashes.Road, avg(crashes.N_Crashes) as Mean_Crashes
from crashes
left join roads
on crashes.Road = roads.Road
group by 1" # 1st col.
sqldf(group_string)
## Road Mean_Crashes
## 1 Interstate 275 24.95455
## 2 Interstate 65 107.81818
## 3 Interstate 70 65.18182
## 4 US-36 48.00000
## 5 US-40 68.68182