rm(list = ls())
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Crash data (crashes.csv)

# Year - The year in which the observation was taken
# Road - The name of the road being studied
# N_Crashes - The number of crashes on the road during that year
# Volume - Average annual daily traffic (AADT) volumes on the road during that year. See this site for a formal definition. [North Carolina Department of Transportation 2012]
# Road data (roads.csv)
 
# Road - The name of the road being studied
# District - The administrative district responsible for the road's upkeep and maintenance
# Length - Length of the road in miles

setwd('/Users/jaehochang/Desktop/Study/R/sqldf')
crashes <- read.csv("crashes.csv")
roads <- read.csv("roads.csv")
head(crashes)
##   Year          Road N_Crashes Volume
## 1 1991 Interstate 65        25  40000
## 2 1992 Interstate 65        37  41000
## 3 1993 Interstate 65        45  45000
## 4 1994 Interstate 65        46  45600
## 5 1995 Interstate 65        46  49000
## 6 1996 Interstate 65        59  51000
print(roads)
##            Road       District Length
## 1 Interstate 65     Greenfield    262
## 2 Interstate 70      Vincennes    156
## 3         US-36 Crawfordsville    139
## 4         US-40     Greenfield    150
## 5         US-52 Crawfordsville    172
join_string <- 
'select crashes.*, roads.District, roads.Length 
from crashes 
left join roads on crashes.Road = roads.Road'

crashes_join_roads <- sqldf(join_string)
tail(crashes_join_roads) # left join Allows NA
##     Year           Road N_Crashes Volume District Length
## 105 2007 Interstate 275        32  21900     <NA>     NA
## 106 2008 Interstate 275        21  21850     <NA>     NA
## 107 2009 Interstate 275        25  22100     <NA>     NA
## 108 2010 Interstate 275        24  21500     <NA>     NA
## 109 2011 Interstate 275        23  20300     <NA>     NA
## 110 2012 Interstate 275        22  21200     <NA>     NA
join_string2 <- 
'select crashes.*, roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road'

crashes_join_roads2 <- sqldf(join_string2, stringsAsFactors = FALSE)
tail(crashes_join_roads2) # inner join allows no NA
##    Year  Road N_Crashes Volume       District Length
## 83 2007 US-36        49  24000 Crawfordsville    139
## 84 2008 US-36        52  24500 Crawfordsville    139
## 85 2009 US-36        55  24700 Crawfordsville    139
## 86 2010 US-36        35  23000 Crawfordsville    139
## 87 2011 US-36        33  21000 Crawfordsville    139
## 88 2012 US-36        31  20500 Crawfordsville    139
join_string2 <- 
"select crashes.*, roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road
where crashes.Road = 'US-40'"            
crashes_join_roads4 <- sqldf(join_string2,stringsAsFactors = FALSE)
head(crashes_join_roads4)
##   Year  Road N_Crashes Volume   District Length
## 1 1991 US-40        46  21000 Greenfield    150
## 2 1992 US-40       101  21500 Greenfield    150
## 3 1993 US-40        76  23000 Greenfield    150
## 4 1994 US-40        72  21000 Greenfield    150
## 5 1995 US-40        75  24000 Greenfield    150
## 6 1996 US-40       136  23500 Greenfield    150
group_string <- 
"select crashes.Road, avg(crashes.N_Crashes) as Mean_Crashes
from crashes
left join roads
on crashes.Road = roads.Road
group by 1" # 1st col.
sqldf(group_string)
##             Road Mean_Crashes
## 1 Interstate 275     24.95455
## 2  Interstate 65    107.81818
## 3  Interstate 70     65.18182
## 4          US-36     48.00000
## 5          US-40     68.68182