library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Coming from a SQL background and being able to use SQL to do EDA was deinitely a help in R. Although with using packages such as dplyr you can do much the same thing. Sqldf is another great tool in your arsenal.
sqldf supports the SQLite database backend by default. To check the version run the below.
sqldf("select sqlite_version()")
## sqlite_version()
## 1 3.30.1
Get data
crash <- read_csv("data/crash_data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_double(),
## Crash_Ref_Number = col_double(),
## Crash_Year = col_double(),
## Crash_Hour = col_double(),
## Crash_Longitude_GDA94 = col_double(),
## Crash_Latitude_GDA94 = col_double(),
## Count_Casualty_Fatality = col_double(),
## Count_Casualty_Hospitalised = col_double(),
## Count_Casualty_MedicallyTreated = col_double(),
## Count_Casualty_MinorInjury = col_double(),
## Count_Casualty_Total = col_double(),
## Count_Unit_Car = col_double(),
## Count_Unit_Motorcycle_Moped = col_double(),
## Count_Unit_Truck = col_double(),
## Count_Unit_Bus = col_double(),
## Count_Unit_Bicycle = col_double(),
## Count_Unit_Pedestrian = col_double(),
## Count_Unit_Other = col_double()
## )
## See spec(...) for full column specifications.
sqldf also enables reading in data using read.csv.sql :
crash <- read.csv.sql(file=“data/crash.csv”,sql=“select * from file”)
We can start of with a staight select where columns are selected and the data is filtered with the where clause.
crash_data_2017 <- sqldf("select Count_Unit_Motorcycle_Moped,Crash_Month,Crash_Severity from crash where Crash_Year = 2017" )
#display just a subset of rows
sqldf("select * from crash_data_2017 limit 10")
## Count_Unit_Motorcycle_Moped Crash_Month Crash_Severity
## 1 0 January Hospitalisation
## 2 0 January Medical treatment
## 3 0 January Medical treatment
## 4 1 January Minor injury
## 5 0 January Hospitalisation
## 6 0 January Hospitalisation
## 7 0 January Minor injury
## 8 0 January Medical treatment
## 9 0 January Hospitalisation
## 10 0 January Medical treatment
Next we can try an aggregate to and group by month. AVG, COUNT, MIN, MAX, SUM are all supported.
crash_data_sum_month <- sqldf("select sum(Count_Unit_Motorcycle_Moped) as Num_Crashes,Crash_Month from crash_data_2017 group by Crash_Month order by 1 DESC " )
crash_data_sum_month
## Num_Crashes Crash_Month
## 1 155 August
## 2 146 May
## 3 143 September
## 4 143 July
## 5 143 April
## 6 129 October
## 7 125 June
## 8 120 December
## 9 117 February
## 10 110 January
## 11 105 November
## 12 104 March
Below we use a window function to create a running total. For more info take a look at : https://www.sqlitetutorial.net/sqlite-window-functions/
sqldf("select
Crash_Month
,Num_Crashes
,sum(Num_Crashes) OVER (
ORDER BY Crash_Month -- Note this is window ordering
) AS Running_Total
from crash_data_sum_month ORDER by Crash_Month " )
## Crash_Month Num_Crashes Running_Total
## 1 April 143 143
## 2 August 155 298
## 3 December 120 418
## 4 February 117 535
## 5 January 110 645
## 6 July 143 788
## 7 June 125 913
## 8 March 104 1017
## 9 May 146 1163
## 10 November 105 1268
## 11 October 129 1397
## 12 September 143 1540
We can select the top 5 rows ordered by our sum of crashes descending
sqldf("select * from crash_data_sum_month order by Num_crashes desc Limit 5")
## Num_Crashes Crash_Month
## 1 155 August
## 2 146 May
## 3 143 September
## 4 143 July
## 5 143 April
We can take a look at the distinct values
sqldf("select distinct(crash_severity) from crash_data_2017" )
## Crash_Severity
## 1 Hospitalisation
## 2 Medical treatment
## 3 Minor injury
## 4 Fatal
We can also concatenate values and cast them
sqldf("select Crash_Month ||'-'|| cast(Crash_Year as text) from crash limit 10" )
## Crash_Month ||'-'|| cast(Crash_Year as text)
## 1 January-2001.0
## 2 January-2001.0
## 3 January-2001.0
## 4 January-2001.0
## 5 January-2001.0
## 6 January-2001.0
## 7 January-2001.0
## 8 January-2001.0
## 9 January-2001.0
## 10 January-2001.0
We can also pipe our results out to other packages. In this case ggplot
sqldf("select count(Crash_Severity) Count_Accidents, Crash_Severity from crash_data_2017 group by Crash_Severity ") %>% ggplot(aes(x=Crash_Severity,y=Count_Accidents)) + geom_col()
One thing that I have not been able to get to work is delete , update , insert statements. But everthing you can do in select statements in SQLite you can do with sqldf.