CONFIGURAR DBPLYR
library(dplyr)
library(dbplyr)
## read csv
argencon<-readr::read_csv("~/argencon.csv")
## Create SQLite connection `in-memory`
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
## copy argencon data frame to SQLite
copy_to(con, argencon,name = "argencon_db")
## crate table
argencon_tbl <- tbl(con, "argencon_db")
Queries
Número de observaciones por subtipo de dominio DGA
argencon_tbl %>% group_by(label) %>% summarise(n=n()) %>% show_query()
<SQL>
SELECT `label`, COUNT(*) AS `n`
FROM `argencon_db`
GROUP BY `label`
Número de observaciones por subtipo de dominio DGA
argencon_tbl %>% filter(label=="dga") %>% group_by(source) %>% summarise(n=n()) %>% show_query()
<SQL>
SELECT `source`, COUNT(*) AS `n`
FROM `argencon_db`
WHERE (`label` = 'dga')
GROUP BY `source`
Número de observaciones por subtipo de dominio NORMAL
argencon_tbl %>% filter(label=="normal") %>% group_by(source) %>% summarise(n=n()) %>% show_query()
<SQL>
SELECT `source`, COUNT(*) AS `n`
FROM `argencon_db`
WHERE (`label` = 'normal')
GROUP BY `source`
Longitud promedio de los dominios DGA y NORMAL
argencon_tbl %>% group_by(label) %>% summarise(len=mean(length(domain)) ) %>% show_query()
<SQL>
SELECT `label`, AVG(length(`domain`)) AS `len`
FROM `argencon_db`
GROUP BY `label`
Seleccionar de manera aleatoria un 20% del conjunto de datos.
sample_tbl %>% filter( row_number() < total *.2) %>% show_query()
<SQL>
SELECT `domain`, `label`, `source`, `date`, `id`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`, ROW_NUMBER() OVER (ORDER BY random()) AS `q01`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`, ROW_NUMBER() OVER (ORDER BY random()) AS `q01`
FROM `argencon_db`)
WHERE (`q01` <= 2918496)))
WHERE (`q01` < 2918496 * 0.2)
Seleccionar de manera aleatoria un 10% del conjunto de datos distinto y excluyente al 20% anterior.
sample_tbl %>% filter( row_number() > total *.2 & row_number() < total *.3) %>% show_query()
<SQL>
SELECT `domain`, `label`, `source`, `date`, `id`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`, ROW_NUMBER() OVER (ORDER BY random()) AS `q01`, ROW_NUMBER() OVER (ORDER BY random()) AS `q02`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`
FROM (SELECT `domain`, `label`, `source`, `date`, `id`, ROW_NUMBER() OVER (ORDER BY random()) AS `q01`
FROM `argencon_db`)
WHERE (`q01` <= 2918496)))
WHERE (`q01` > 2918496 * 0.2 AND `q02` < 2918496 * 0.3)
LS0tCnRpdGxlOiAiREJQTFlSIGV4YW1wbGVzIgpvdXRwdXQ6IAogIGh0bWxfbm90ZWJvb2s6IAogICAgdGhlbWU6IGNlcnVsZWFuCi0tLQoKIyBDT05GSUdVUkFSIERCUExZUgoKYGBge3J9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZGJwbHlyKQojIyByZWFkIGNzdgphcmdlbmNvbjwtcmVhZHI6OnJlYWRfY3N2KCJ+L2FyZ2VuY29uLmNzdiIpIAojIyBDcmVhdGUgU1FMaXRlIGNvbm5lY3Rpb24gYGluLW1lbW9yeWAKY29uIDwtIERCSTo6ZGJDb25uZWN0KFJTUUxpdGU6OlNRTGl0ZSgpLCAiOm1lbW9yeToiKQojIyBjb3B5IGFyZ2VuY29uIGRhdGEgZnJhbWUgdG8gU1FMaXRlCmNvcHlfdG8oY29uLCBhcmdlbmNvbixuYW1lID0gImFyZ2VuY29uX2RiIikKIyMgY3JhdGUgdGFibGUKYXJnZW5jb25fdGJsIDwtIHRibChjb24sICJhcmdlbmNvbl9kYiIpCmBgYAojIFF1ZXJpZXMKCiMjIE7Dum1lcm8gZGUgb2JzZXJ2YWNpb25lcyBwb3Igc3VidGlwbyBkZSBkb21pbmlvIERHQQpgYGB7cn0KYXJnZW5jb25fdGJsICU+JSBncm91cF9ieShsYWJlbCkgJT4lIHN1bW1hcmlzZShuPW4oKSkgJT4lIHNob3dfcXVlcnkoKQpgYGAKCiMjIE7Dum1lcm8gZGUgb2JzZXJ2YWNpb25lcyBwb3Igc3VidGlwbyBkZSBkb21pbmlvIERHQQpgYGB7cn0KYXJnZW5jb25fdGJsICU+JSBmaWx0ZXIobGFiZWw9PSJkZ2EiKSAlPiUgZ3JvdXBfYnkoc291cmNlKSAlPiUgc3VtbWFyaXNlKG49bigpKSAlPiUgc2hvd19xdWVyeSgpCmBgYAoKIyMgTsO6bWVybyBkZSBvYnNlcnZhY2lvbmVzIHBvciBzdWJ0aXBvIGRlIGRvbWluaW8gTk9STUFMCmBgYHtyfQphcmdlbmNvbl90YmwgJT4lIGZpbHRlcihsYWJlbD09Im5vcm1hbCIpICU+JSBncm91cF9ieShzb3VyY2UpICU+JSBzdW1tYXJpc2Uobj1uKCkpICU+JSBzaG93X3F1ZXJ5KCkKYGBgCgojIyBMb25naXR1ZCBwcm9tZWRpbyBkZSBsb3MgZG9taW5pb3MgREdBIHkgTk9STUFMCmBgYHtyfQphcmdlbmNvbl90YmwgJT4lIGdyb3VwX2J5KGxhYmVsKSAlPiUgc3VtbWFyaXNlKGxlbj1tZWFuKGxlbmd0aChkb21haW4pKSApICU+JSBzaG93X3F1ZXJ5KCkKYGBgCgojIyBTZWxlY2Npb25hciBkZSBtYW5lcmEgYWxlYXRvcmlhIHVuIDIwJSBkZWwgY29uanVudG8gZGUgZGF0b3MuCmBgYHtyfQp0b3RhbCA8LSBhcmdlbmNvbl90YmwgJT4lIGNvdW50KCkgJT4lIHB1bGwobikKIyBhIG5ldyBkYiB3aXRoICBzaHVmZmxlZCBkYXRhCnNhbXBsZV90Ymw8LWFyZ2VuY29uX3RibCAlPiUgc2xpY2Vfc2FtcGxlKCBuPSB0b3RhbCkgCgpzYW1wbGVfdGJsICU+JSBmaWx0ZXIoIHJvd19udW1iZXIoKSA8IHRvdGFsICouMikgJT4lIHNob3dfcXVlcnkoKQpgYGAKCiMjIFNlbGVjY2lvbmFyIGRlIG1hbmVyYSBhbGVhdG9yaWEgIHVuIDEwJSBkZWwgY29uanVudG8gZGUgZGF0b3MgZGlzdGludG8geSBleGNsdXllbnRlIGFsIDIwJSBhbnRlcmlvci4KYGBge3J9CnNhbXBsZV90YmwgJT4lIGZpbHRlciggcm93X251bWJlcigpID4gdG90YWwgKi4yICYgcm93X251bWJlcigpIDwgdG90YWwgKi4zKSAlPiUgc2hvd19xdWVyeSgpCmBgYAoK