library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Introduction

Coming from a SQL background and being able to use SQL to do EDA was deinitely a help in R. Although with using packages such as dplyr you can do much the same thing. Sqldf is another great tool in your arsenal.

Some example commands

sqldf supports the SQLite database backend by default. To check the version run the below.

sqldf("select sqlite_version()")
##   sqlite_version()
## 1           3.30.1

Get data

crash <- read_csv("data/crash_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   Crash_Ref_Number = col_double(),
##   Crash_Year = col_double(),
##   Crash_Hour = col_double(),
##   Crash_Longitude_GDA94 = col_double(),
##   Crash_Latitude_GDA94 = col_double(),
##   Count_Casualty_Fatality = col_double(),
##   Count_Casualty_Hospitalised = col_double(),
##   Count_Casualty_MedicallyTreated = col_double(),
##   Count_Casualty_MinorInjury = col_double(),
##   Count_Casualty_Total = col_double(),
##   Count_Unit_Car = col_double(),
##   Count_Unit_Motorcycle_Moped = col_double(),
##   Count_Unit_Truck = col_double(),
##   Count_Unit_Bus = col_double(),
##   Count_Unit_Bicycle = col_double(),
##   Count_Unit_Pedestrian = col_double(),
##   Count_Unit_Other = col_double()
## )
## See spec(...) for full column specifications.

sqldf also enables reading in data using read.csv.sql :

crash <- read.csv.sql(file=“data/crash.csv”,sql=“select * from file”)

We can start of with a staight select where columns are selected and the data is filtered with the where clause.

crash_data_2017 <-  sqldf("select Count_Unit_Motorcycle_Moped,Crash_Month,Crash_Severity from crash where Crash_Year = 2017" )
#display just a subset of rows
sqldf("select * from crash_data_2017 limit 10")
##    Count_Unit_Motorcycle_Moped Crash_Month    Crash_Severity
## 1                            0     January   Hospitalisation
## 2                            0     January Medical treatment
## 3                            0     January Medical treatment
## 4                            1     January      Minor injury
## 5                            0     January   Hospitalisation
## 6                            0     January   Hospitalisation
## 7                            0     January      Minor injury
## 8                            0     January Medical treatment
## 9                            0     January   Hospitalisation
## 10                           0     January Medical treatment

Next we can try an aggregate to and group by month. AVG, COUNT, MIN, MAX, SUM are all supported.

crash_data_sum_month <- sqldf("select sum(Count_Unit_Motorcycle_Moped) as Num_Crashes,Crash_Month  from crash_data_2017 group by Crash_Month order by 1 DESC  "  )

crash_data_sum_month
##    Num_Crashes Crash_Month
## 1          155      August
## 2          146         May
## 3          143   September
## 4          143        July
## 5          143       April
## 6          129     October
## 7          125        June
## 8          120    December
## 9          117    February
## 10         110     January
## 11         105    November
## 12         104       March

Below we use a window function to create a running total. For more info take a look at : https://www.sqlitetutorial.net/sqlite-window-functions/

sqldf("select 
       Crash_Month
      ,Num_Crashes
      ,sum(Num_Crashes) OVER (
        ORDER BY Crash_Month  -- Note this is window ordering
        
        ) AS Running_Total
      from crash_data_sum_month ORDER by Crash_Month "  )
##    Crash_Month Num_Crashes Running_Total
## 1        April         143           143
## 2       August         155           298
## 3     December         120           418
## 4     February         117           535
## 5      January         110           645
## 6         July         143           788
## 7         June         125           913
## 8        March         104          1017
## 9          May         146          1163
## 10    November         105          1268
## 11     October         129          1397
## 12   September         143          1540

We can select the top 5 rows ordered by our sum of crashes descending

sqldf("select * from crash_data_sum_month order by Num_crashes desc Limit 5") 
##   Num_Crashes Crash_Month
## 1         155      August
## 2         146         May
## 3         143   September
## 4         143        July
## 5         143       April

We can take a look at the distinct values

sqldf("select distinct(crash_severity) from crash_data_2017" )
##      Crash_Severity
## 1   Hospitalisation
## 2 Medical treatment
## 3      Minor injury
## 4             Fatal

We can also concatenate values and cast them

sqldf("select Crash_Month ||'-'|| cast(Crash_Year as text) from crash limit 10" )
##    Crash_Month ||'-'|| cast(Crash_Year as text)
## 1                                January-2001.0
## 2                                January-2001.0
## 3                                January-2001.0
## 4                                January-2001.0
## 5                                January-2001.0
## 6                                January-2001.0
## 7                                January-2001.0
## 8                                January-2001.0
## 9                                January-2001.0
## 10                               January-2001.0

We can also pipe our results out to other packages. In this case ggplot

sqldf("select count(Crash_Severity) Count_Accidents, Crash_Severity from crash_data_2017 group by Crash_Severity ") %>% ggplot(aes(x=Crash_Severity,y=Count_Accidents)) + geom_col()

One thing that I have not been able to get to work is delete , update , insert statements. But everthing you can do in select statements in SQLite you can do with sqldf.