Today we will take a look at:

  1. the iris data
  2. the diamonds data in the ggplot2 library
  3. the gapminder data in the gapminder library
  4. the nycflights13 data in the nyclights13 library

A good website to look at to learn more about sql is w3school.com

library(ggplot2)
library(gapminder)
library(nycflights13)
head(iris)
head(diamonds)
head(gapminder)
head(airlines)
head(flights)
head(planes)
head(weather)
  1. Look at the iris data.
knitr::include_graphics("E:/University/classes/2016-2017/Spring2017/Stat6620/sqldf/thUFAY9IC0.jpg")

Try aggregate() function

str(iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
species.means <- with(iris, aggregate(iris[-5], by=list(Species), FUN=mean, na.rm=TRUE))
print(species.means)
  1. Start by looking at the diamond data.

Here is a very nice blog post about using sqldf.

https://www.r-bloggers.com/manipulating-data-frames-using-sqldf-a-brief-overview/

knitr::include_graphics("E:/University/classes/2016-2017/Spring2017/Stat6620/sqldf/diamond.png")

data("diamonds")
head(diamonds)
tail(diamonds)
print(diamonds)
str(diamonds)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   53940 obs. of  10 variables:
 $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
 $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
 $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
 $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
 $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
 $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
 $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
 $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
 $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
 $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
diamonds.cut <- with(diamonds, aggregate(cbind(carat,price) ~ cut, FUN=mean))
print(diamonds.cut)
diamonds.color <- with(diamonds, aggregate(cbind(carat,price) ~ color, FUN=mean))
print(diamonds.color)
diamonds.clarity <- with(diamonds, aggregate(cbind(carat,price) ~ clarity, FUN=mean))
print(diamonds.clarity)

Try subset() function

diamonds.good <- subset(diamonds, cut=='Good')
print(diamonds.good)
diamonds.good.means <- with(diamonds.good, aggregate(cbind(carat,price) ~ color, FUN=mean))
print(diamonds.good.means)
library(sqldf)
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Loading required package: DBI
goodDiamonds <- sqldf("select * from diamonds where cut='Good'")
Loading required package: tcltk
print(goodDiamonds)
library(sqldf)
goodDiamonds <- sqldf("select * from diamonds where cut='Good' ")
print(goodDiamonds)
goodDiamonds <- sqldf("select carat from diamonds where cut='Good' and color='E' ")
print(goodDiamonds)
goodDiamonds <- sqldf("select * from diamonds where cut='Good' and color='E' ")
print(goodDiamonds)
  1. Look at the gapmider data.
library(tibble)
library(ggplot2)
head(gapminder)
ggplot(gapminder, aes(x = log10(gdpPercap), y = lifeExp)) +
  geom_point() + scale_x_log10() + geom_point(aes(color = continent))

print(gapminder)
Americas <- sqldf("SELECT * FROM gapminder 
                  WHERE continent='Asia' AND year = '2007' AND lifeExp > 75 
                  ORDER BY lifeExp DESC")
print(Americas)
ggplot(Americas, aes(x=gdpPercap, y=lifeExp)) +
    geom_point(shape=1) +    # Use hollow circles
    geom_smooth(method=lm)   # Add linear regression line 

                             #  (by default includes 95% confidence region)
  1. Look at the nycflights13 data. Here the data file contains many files and can be used to merge substes of the data.
head(airlines)
head(flights)
head(planes)
head(weather)

Try merge() function

flights_planes.data <- merge(x = flights, y = planes, by = "tailnum", all.x = TRUE)
head(flights_planes.data)
dim(flights_planes.data)
[1] 336776     27
flights.data <- sqldf("SELECT * FROM flights ")
head(flights.data)
dim(flights.data)
[1] 336776     19
flights.data <- sqldf("SELECT * FROM flights WHERE carrier = 'UA'")
head(flights.data)
dim(flights.data)
[1] 58665    19
flights.data <- sqldf("SELECT * FROM flights WHERE carrier <> 'UA'")
head(flights.data)
dim(flights.data)
[1] 278111     19
flights_planes.data <- sqldf("SELECT * FROM flights LEFT JOIN planes USING(tailnum)")
head(flights_planes.data)
dim(flights_planes.data)
[1] 336776     27

Save the file to an sqlite dataset.

library(dplyr)

Attaching package: <U+393C><U+3E31>dplyr<U+393C><U+3E32>

The following objects are masked from <U+393C><U+3E31>package:plyr<U+393C><U+3E32>:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following object is masked from <U+393C><U+3E31>package:randomForest<U+393C><U+3E32>:

    combine

The following object is masked from <U+393C><U+3E31>package:MASS<U+393C><U+3E32>:

    select

The following objects are masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:

    filter, lag

The following objects are masked from <U+393C><U+3E31>package:base<U+393C><U+3E32>:

    intersect, setdiff, setequal, union
nycflights13_sqlite(path = "C:/Users/Eric A. Suess/Documents")
Caching nycflights db at C:/Users/Eric A. Suess/Documents/nycflights13.sqlite
src:  sqlite 3.8.6 [C:/Users/Eric A. Suess/Documents/nycflights13.sqlite]
tbls: airlines, airports, flights, planes, sqlite_stat1, weather

Other topics.

  1. Top 7 SQL interview questions for tech professionals
  2. Interesting document about loading and saving data files in different data formats.
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
 [1] tcltk     parallel  splines   stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] dplyr_0.5.0          tibble_1.2           sqldf_0.4-10         RSQLite_1.0.0        DBI_0.5-1           
 [6] gsubfn_0.6-6         proto_1.0.0          nycflights13_0.2.0   gapminder_0.2.0      plyr_1.8.4          
[11] gbm_2.1.1            survival_2.39-5      caTools_1.17.1       nnet_7.3-12          randomForest_4.6-12 
[16] caretEnsemble_2.0.0  rpart_4.1-10         pROC_1.8             mlbench_2.1-1        pscl_1.4.9          
[21] MASS_7.3-45          caret_6.0-71         ggplot2_2.1.0        lattice_0.20-34      RevoUtilsMath_10.0.0

loaded via a namespace (and not attached):
 [1] RevoUtils_10.0.2   reshape2_1.4.2     pbapply_1.3-1      colorspace_1.2-7   htmltools_0.3.5   
 [6] stats4_3.3.2       yaml_2.1.13        mgcv_1.8-15        base64enc_0.1-3    chron_2.3-47      
[11] nloptr_1.0.4       foreach_1.4.3      stringr_1.1.0      MatrixModels_0.4-1 munsell_0.4.3     
[16] gtable_0.2.0       codetools_0.2-15   evaluate_0.10      labeling_0.3       knitr_1.14        
[21] SparseM_1.72       quantreg_5.29      pbkrtest_0.4-6     Rcpp_0.12.7        backports_1.0.4   
[26] scales_0.4.0       formatR_1.4        jsonlite_1.1       lme4_1.1-12        gridExtra_2.2.1   
[31] digest_0.6.10      stringi_1.1.2      grid_3.3.2         rprojroot_1.1      tools_3.3.2       
[36] bitops_1.0-6       magrittr_1.5       car_2.1-3          Matrix_1.2-7.1     rsconnect_0.7.0-2 
[41] data.table_1.9.6   assertthat_0.1     minqa_1.2.4        rmarkdown_1.3      iterators_1.0.8   
[46] R6_2.2.0           nlme_3.1-128       compiler_3.3.2    
LS0tDQp0aXRsZTogInNxbGRmIGV4YW1wbGVzIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KVG9kYXkgd2Ugd2lsbCB0YWtlIGEgbG9vayBhdDoNCg0KIy4gdGhlIGlyaXMgZGF0YQ0KIy4gdGhlIGRpYW1vbmRzIGRhdGEgaW4gdGhlIGdncGxvdDIgbGlicmFyeQ0KIy4gdGhlIGdhcG1pbmRlciBkYXRhIGluIHRoZSBnYXBtaW5kZXIgbGlicmFyeQ0KIy4gdGhlIG55Y2ZsaWdodHMxMyBkYXRhIGluIHRoZSBueWNsaWdodHMxMyBsaWJyYXJ5DQoNCkEgZ29vZCB3ZWJzaXRlIHRvIGxvb2sgYXQgdG8gbGVhcm4gbW9yZSBhYm91dCBzcWwgaXMgW3czc2Nob29sLmNvbV0oaHR0cHM6Ly93d3cudzNzY2hvb2xzLmNvbS9zcWwvZGVmYXVsdC5hc3ApDQoNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShnYXBtaW5kZXIpDQpsaWJyYXJ5KG55Y2ZsaWdodHMxMykNCmBgYA0KDQpgYGB7cn0NCmhlYWQoaXJpcykNCmBgYA0KDQoNCmBgYHtyfQ0KaGVhZChkaWFtb25kcykNCmBgYA0KDQpgYGB7cn0NCmhlYWQoZ2FwbWluZGVyKQ0KYGBgDQoNCg0KYGBge3J9DQpoZWFkKGFpcmxpbmVzKQ0KaGVhZChmbGlnaHRzKQ0KaGVhZChwbGFuZXMpDQpoZWFkKHdlYXRoZXIpDQpgYGANCg0KMS4gTG9vayBhdCB0aGUgaXJpcyBkYXRhLg0KDQpgYGB7ciwgb3V0LndpZHRoID0gIjQwMHB4In0NCmtuaXRyOjppbmNsdWRlX2dyYXBoaWNzKCJFOi9Vbml2ZXJzaXR5L2NsYXNzZXMvMjAxNi0yMDE3L1NwcmluZzIwMTcvU3RhdDY2MjAvc3FsZGYvdGhVRkFZOUlDMC5qcGciKQ0KYGBgDQoNCiMgVHJ5IGFnZ3JlZ2F0ZSgpIGZ1bmN0aW9uDQoNCmBgYHtyfQ0Kc3RyKGlyaXMpDQoNCnNwZWNpZXMubWVhbnMgPC0gd2l0aChpcmlzLCBhZ2dyZWdhdGUoaXJpc1stNV0sIGJ5PWxpc3QoU3BlY2llcyksIEZVTj1tZWFuLCBuYS5ybT1UUlVFKSkNCg0KcHJpbnQoc3BlY2llcy5tZWFucykNCg0KYGBgDQoNCjIuIFN0YXJ0IGJ5IGxvb2tpbmcgYXQgdGhlICoqZGlhbW9uZCoqIGRhdGEuDQoNCkhlcmUgaXMgYSB2ZXJ5IG5pY2UgYmxvZyBwb3N0IGFib3V0IHVzaW5nICoqc3FsZGYqKi4NCg0KaHR0cHM6Ly93d3cuci1ibG9nZ2Vycy5jb20vbWFuaXB1bGF0aW5nLWRhdGEtZnJhbWVzLXVzaW5nLXNxbGRmLWEtYnJpZWYtb3ZlcnZpZXcvDQoNCg0KYGBge3IsIG91dC53aWR0aCA9ICI0MDBweCJ9DQprbml0cjo6aW5jbHVkZV9ncmFwaGljcygiRTovVW5pdmVyc2l0eS9jbGFzc2VzLzIwMTYtMjAxNy9TcHJpbmcyMDE3L1N0YXQ2NjIwL3NxbGRmL2RpYW1vbmQucG5nIikNCmBgYA0KDQpgYGB7cn0NCmRhdGEoImRpYW1vbmRzIikNCg0KaGVhZChkaWFtb25kcykNCg0KdGFpbChkaWFtb25kcykNCg0KYGBgDQoNCmBgYHtyfQ0KcHJpbnQoZGlhbW9uZHMpDQoNCnN0cihkaWFtb25kcykNCmBgYA0KDQoNCmBgYHtyfQ0KZGlhbW9uZHMuY3V0IDwtIHdpdGgoZGlhbW9uZHMsIGFnZ3JlZ2F0ZShjYmluZChjYXJhdCxwcmljZSkgfiBjdXQsIEZVTj1tZWFuKSkNCg0KcHJpbnQoZGlhbW9uZHMuY3V0KQ0KYGBgDQoNCmBgYHtyfQ0KZGlhbW9uZHMuY29sb3IgPC0gd2l0aChkaWFtb25kcywgYWdncmVnYXRlKGNiaW5kKGNhcmF0LHByaWNlKSB+IGNvbG9yLCBGVU49bWVhbikpDQoNCnByaW50KGRpYW1vbmRzLmNvbG9yKQ0KYGBgDQoNCmBgYHtyfQ0KZGlhbW9uZHMuY2xhcml0eSA8LSB3aXRoKGRpYW1vbmRzLCBhZ2dyZWdhdGUoY2JpbmQoY2FyYXQscHJpY2UpIH4gY2xhcml0eSwgRlVOPW1lYW4pKQ0KDQpwcmludChkaWFtb25kcy5jbGFyaXR5KQ0KYGBgDQoNCiMgVHJ5IHN1YnNldCgpIGZ1bmN0aW9uDQoNCmBgYHtyfQ0KZGlhbW9uZHMuZ29vZCA8LSBzdWJzZXQoZGlhbW9uZHMsIGN1dD09J0dvb2QnKQ0KDQpwcmludChkaWFtb25kcy5nb29kKQ0KDQpkaWFtb25kcy5nb29kLm1lYW5zIDwtIHdpdGgoZGlhbW9uZHMuZ29vZCwgYWdncmVnYXRlKGNiaW5kKGNhcmF0LHByaWNlKSB+IGNvbG9yLCBGVU49bWVhbikpDQoNCnByaW50KGRpYW1vbmRzLmdvb2QubWVhbnMpDQoNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShzcWxkZikNCmdvb2REaWFtb25kcyA8LSBzcWxkZigic2VsZWN0ICogZnJvbSBkaWFtb25kcyB3aGVyZSBjdXQ9J0dvb2QnIikNCg0KcHJpbnQoZ29vZERpYW1vbmRzKQ0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeShzcWxkZikNCg0KZ29vZERpYW1vbmRzIDwtIHNxbGRmKCJzZWxlY3QgKiBmcm9tIGRpYW1vbmRzIHdoZXJlIGN1dD0nR29vZCcgIikNCg0KcHJpbnQoZ29vZERpYW1vbmRzKQ0KYGBgDQoNCmBgYHtyfQ0KDQpnb29kRGlhbW9uZHMgPC0gc3FsZGYoInNlbGVjdCBjYXJhdCBmcm9tIGRpYW1vbmRzIHdoZXJlIGN1dD0nR29vZCcgYW5kIGNvbG9yPSdFJyAiKQ0KDQpwcmludChnb29kRGlhbW9uZHMpDQpgYGANCg0KDQpgYGB7cn0NCmdvb2REaWFtb25kcyA8LSBzcWxkZigic2VsZWN0ICogZnJvbSBkaWFtb25kcyB3aGVyZSBjdXQ9J0dvb2QnIGFuZCBjb2xvcj0nRScgIikNCg0KcHJpbnQoZ29vZERpYW1vbmRzKQ0KYGBgDQoNCjMuIExvb2sgYXQgdGhlIGdhcG1pZGVyIGRhdGEuDQoNCmBgYHtyfQ0KDQpsaWJyYXJ5KHRpYmJsZSkNCmxpYnJhcnkoZ2dwbG90MikNCg0KaGVhZChnYXBtaW5kZXIpDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoZ2FwbWluZGVyLCBhZXMoeCA9IGxvZzEwKGdkcFBlcmNhcCksIHkgPSBsaWZlRXhwKSkgKw0KICBnZW9tX3BvaW50KCkgKyBzY2FsZV94X2xvZzEwKCkgKyBnZW9tX3BvaW50KGFlcyhjb2xvciA9IGNvbnRpbmVudCkpDQpgYGANCg0KYGBge3J9DQpwcmludChnYXBtaW5kZXIpDQoNCkFtZXJpY2FzIDwtIHNxbGRmKCJTRUxFQ1QgKiBGUk9NIGdhcG1pbmRlciANCiAgICAgICAgICAgICAgICAgIFdIRVJFIGNvbnRpbmVudD0nQXNpYScgQU5EIHllYXIgPSAnMjAwNycgQU5EIGxpZmVFeHAgPiA3NSANCiAgICAgICAgICAgICAgICAgIE9SREVSIEJZIGxpZmVFeHAgREVTQyIpDQoNCnByaW50KEFtZXJpY2FzKQ0KDQpnZ3Bsb3QoQW1lcmljYXMsIGFlcyh4PWdkcFBlcmNhcCwgeT1saWZlRXhwKSkgKw0KICAgIGdlb21fcG9pbnQoc2hhcGU9MSkgKyAgICAjIFVzZSBob2xsb3cgY2lyY2xlcw0KICAgIGdlb21fc21vb3RoKG1ldGhvZD1sbSkgICAjIEFkZCBsaW5lYXIgcmVncmVzc2lvbiBsaW5lIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjICAoYnkgZGVmYXVsdCBpbmNsdWRlcyA5NSUgY29uZmlkZW5jZSByZWdpb24pDQoNCmBgYA0KDQoNCjQuIExvb2sgYXQgdGhlIG55Y2ZsaWdodHMxMyBkYXRhLiAgSGVyZSB0aGUgZGF0YSBmaWxlIGNvbnRhaW5zIG1hbnkgZmlsZXMgYW5kIGNhbiBiZSB1c2VkIHRvIG1lcmdlIHN1YnN0ZXMgb2YgdGhlIGRhdGEuDQoNCmBgYHtyfQ0KaGVhZChhaXJsaW5lcykNCmhlYWQoZmxpZ2h0cykNCmhlYWQocGxhbmVzKQ0KaGVhZCh3ZWF0aGVyKQ0KYGBgDQoNCiMgVHJ5IG1lcmdlKCkgZnVuY3Rpb24NCg0KYGBge3J9DQpmbGlnaHRzX3BsYW5lcy5kYXRhIDwtIG1lcmdlKHggPSBmbGlnaHRzLCB5ID0gcGxhbmVzLCBieSA9ICJ0YWlsbnVtIiwgYWxsLnggPSBUUlVFKSAjIExlZnQgDQpoZWFkKGZsaWdodHNfcGxhbmVzLmRhdGEpDQpkaW0oZmxpZ2h0c19wbGFuZXMuZGF0YSkNCmBgYA0KDQoNCg0KYGBge3J9DQpmbGlnaHRzLmRhdGEgPC0gc3FsZGYoIlNFTEVDVCAqIEZST00gZmxpZ2h0cyAiKQ0KDQpoZWFkKGZsaWdodHMuZGF0YSkNCmRpbShmbGlnaHRzLmRhdGEpDQoNCmZsaWdodHMuZGF0YSA8LSBzcWxkZigiU0VMRUNUICogRlJPTSBmbGlnaHRzIFdIRVJFIGNhcnJpZXIgPSAnVUEnIikNCmhlYWQoZmxpZ2h0cy5kYXRhKQ0KZGltKGZsaWdodHMuZGF0YSkNCg0KZmxpZ2h0cy5kYXRhIDwtIHNxbGRmKCJTRUxFQ1QgKiBGUk9NIGZsaWdodHMgV0hFUkUgY2FycmllciA8PiAnVUEnIikNCmhlYWQoZmxpZ2h0cy5kYXRhKQ0KZGltKGZsaWdodHMuZGF0YSkNCg0KZmxpZ2h0c19wbGFuZXMuZGF0YSA8LSBzcWxkZigiU0VMRUNUICogRlJPTSBmbGlnaHRzIExFRlQgSk9JTiBwbGFuZXMgVVNJTkcodGFpbG51bSkiKQ0KaGVhZChmbGlnaHRzX3BsYW5lcy5kYXRhKQ0KZGltKGZsaWdodHNfcGxhbmVzLmRhdGEpDQoNCmBgYA0KDQoNCg0KDQpTYXZlIHRoZSBmaWxlIHRvIGFuIHNxbGl0ZSBkYXRhc2V0Lg0KDQpgYGB7cn0NCmxpYnJhcnkoZHBseXIpDQoNCm55Y2ZsaWdodHMxM19zcWxpdGUocGF0aCA9ICJDOi9Vc2Vycy9FcmljIEEuIFN1ZXNzL0RvY3VtZW50cyIpDQpgYGANCg0KDQpPdGhlciB0b3BpY3MuDQoNCjEuIFtUb3AgNyBTUUwgaW50ZXJ2aWV3IHF1ZXN0aW9ucyBmb3IgdGVjaCBwcm9mZXNzaW9uYWxzXShodHRwOi8vYmxvZy55aGF0LmNvbS9wb3N0cy9TUUwtaW50ZXJ2aWV3LXF1ZXN0aW9ucy5odG1sKQ0KMi4gSW50ZXJlc3RpbmcgW2RvY3VtZW50XShodHRwczovL2NyYW4ucnN0dWRpby5jb20vd2ViL3BhY2thZ2VzL3NhdmVzL3ZpZ25ldHRlcy9zYXZlcy5wZGYpIGFib3V0IGxvYWRpbmcgYW5kIHNhdmluZyBkYXRhIGZpbGVzIGluIGRpZmZlcmVudCBkYXRhIGZvcm1hdHMuDQoNCmBgYHtyfQ0Kc2Vzc2lvbkluZm8oKQ0KYGBgDQoNCg0K