Today we will take a look at:
A good website to look at to learn more about sql is w3school.com
library(ggplot2)
library(gapminder)
library(nycflights13)
head(iris)
head(diamonds)
head(gapminder)
head(airlines)
head(flights)
head(planes)
head(weather)
knitr::include_graphics("E:/University/classes/2016-2017/Spring2017/Stat6620/sqldf/thUFAY9IC0.jpg")
str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
species.means <- with(iris, aggregate(iris[-5], by=list(Species), FUN=mean, na.rm=TRUE))
print(species.means)
Here is a very nice blog post about using sqldf.
https://www.r-bloggers.com/manipulating-data-frames-using-sqldf-a-brief-overview/
knitr::include_graphics("E:/University/classes/2016-2017/Spring2017/Stat6620/sqldf/diamond.png")
data("diamonds")
head(diamonds)
tail(diamonds)
print(diamonds)
str(diamonds)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 53940 obs. of 10 variables:
$ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num 55 61 65 58 58 57 57 55 61 61 ...
$ price : int 326 326 327 334 335 336 336 337 337 338 ...
$ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
diamonds.cut <- with(diamonds, aggregate(cbind(carat,price) ~ cut, FUN=mean))
print(diamonds.cut)
diamonds.color <- with(diamonds, aggregate(cbind(carat,price) ~ color, FUN=mean))
print(diamonds.color)
diamonds.clarity <- with(diamonds, aggregate(cbind(carat,price) ~ clarity, FUN=mean))
print(diamonds.clarity)
diamonds.good <- subset(diamonds, cut=='Good')
print(diamonds.good)
diamonds.good.means <- with(diamonds.good, aggregate(cbind(carat,price) ~ color, FUN=mean))
print(diamonds.good.means)
library(sqldf)
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Loading required package: DBI
goodDiamonds <- sqldf("select * from diamonds where cut='Good'")
Loading required package: tcltk
print(goodDiamonds)
library(sqldf)
goodDiamonds <- sqldf("select * from diamonds where cut='Good' ")
print(goodDiamonds)
goodDiamonds <- sqldf("select carat from diamonds where cut='Good' and color='E' ")
print(goodDiamonds)
goodDiamonds <- sqldf("select * from diamonds where cut='Good' and color='E' ")
print(goodDiamonds)
library(tibble)
library(ggplot2)
head(gapminder)
ggplot(gapminder, aes(x = log10(gdpPercap), y = lifeExp)) +
geom_point() + scale_x_log10() + geom_point(aes(color = continent))
print(gapminder)
Americas <- sqldf("SELECT * FROM gapminder
WHERE continent='Asia' AND year = '2007' AND lifeExp > 75
ORDER BY lifeExp DESC")
print(Americas)
ggplot(Americas, aes(x=gdpPercap, y=lifeExp)) +
geom_point(shape=1) + # Use hollow circles
geom_smooth(method=lm) # Add linear regression line
# (by default includes 95% confidence region)
head(airlines)
head(flights)
head(planes)
head(weather)
flights_planes.data <- merge(x = flights, y = planes, by = "tailnum", all.x = TRUE)
head(flights_planes.data)
dim(flights_planes.data)
[1] 336776 27
flights.data <- sqldf("SELECT * FROM flights ")
head(flights.data)
dim(flights.data)
[1] 336776 19
flights.data <- sqldf("SELECT * FROM flights WHERE carrier = 'UA'")
head(flights.data)
dim(flights.data)
[1] 58665 19
flights.data <- sqldf("SELECT * FROM flights WHERE carrier <> 'UA'")
head(flights.data)
dim(flights.data)
[1] 278111 19
flights_planes.data <- sqldf("SELECT * FROM flights LEFT JOIN planes USING(tailnum)")
head(flights_planes.data)
dim(flights_planes.data)
[1] 336776 27
Save the file to an sqlite dataset.
library(dplyr)
Attaching package: <U+393C><U+3E31>dplyr<U+393C><U+3E32>
The following objects are masked from <U+393C><U+3E31>package:plyr<U+393C><U+3E32>:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following object is masked from <U+393C><U+3E31>package:randomForest<U+393C><U+3E32>:
combine
The following object is masked from <U+393C><U+3E31>package:MASS<U+393C><U+3E32>:
select
The following objects are masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:
filter, lag
The following objects are masked from <U+393C><U+3E31>package:base<U+393C><U+3E32>:
intersect, setdiff, setequal, union
nycflights13_sqlite(path = "C:/Users/Eric A. Suess/Documents")
Caching nycflights db at C:/Users/Eric A. Suess/Documents/nycflights13.sqlite
src: sqlite 3.8.6 [C:/Users/Eric A. Suess/Documents/nycflights13.sqlite]
tbls: airlines, airports, flights, planes, sqlite_stat1, weather
Other topics.
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] tcltk parallel splines stats graphics grDevices utils datasets methods base
other attached packages:
[1] dplyr_0.5.0 tibble_1.2 sqldf_0.4-10 RSQLite_1.0.0 DBI_0.5-1
[6] gsubfn_0.6-6 proto_1.0.0 nycflights13_0.2.0 gapminder_0.2.0 plyr_1.8.4
[11] gbm_2.1.1 survival_2.39-5 caTools_1.17.1 nnet_7.3-12 randomForest_4.6-12
[16] caretEnsemble_2.0.0 rpart_4.1-10 pROC_1.8 mlbench_2.1-1 pscl_1.4.9
[21] MASS_7.3-45 caret_6.0-71 ggplot2_2.1.0 lattice_0.20-34 RevoUtilsMath_10.0.0
loaded via a namespace (and not attached):
[1] RevoUtils_10.0.2 reshape2_1.4.2 pbapply_1.3-1 colorspace_1.2-7 htmltools_0.3.5
[6] stats4_3.3.2 yaml_2.1.13 mgcv_1.8-15 base64enc_0.1-3 chron_2.3-47
[11] nloptr_1.0.4 foreach_1.4.3 stringr_1.1.0 MatrixModels_0.4-1 munsell_0.4.3
[16] gtable_0.2.0 codetools_0.2-15 evaluate_0.10 labeling_0.3 knitr_1.14
[21] SparseM_1.72 quantreg_5.29 pbkrtest_0.4-6 Rcpp_0.12.7 backports_1.0.4
[26] scales_0.4.0 formatR_1.4 jsonlite_1.1 lme4_1.1-12 gridExtra_2.2.1
[31] digest_0.6.10 stringi_1.1.2 grid_3.3.2 rprojroot_1.1 tools_3.3.2
[36] bitops_1.0-6 magrittr_1.5 car_2.1-3 Matrix_1.2-7.1 rsconnect_0.7.0-2
[41] data.table_1.9.6 assertthat_0.1 minqa_1.2.4 rmarkdown_1.3 iterators_1.0.8
[46] R6_2.2.0 nlme_3.1-128 compiler_3.3.2