Praktikum 3 - Pengolahan Database Menggunakan R

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RSQLite)
library(DBI)

Koneksi ke SQlite

chinook<-dbConnect(SQLite(), "Chinook_Sqlite.sqlite")
dbListTables(chinook)
##  [1] "Album"         "Artist"        "Customer"      "Employee"     
##  [5] "Genre"         "Invoice"       "InvoiceLine"   "MediaType"    
##  [9] "Playlist"      "PlaylistTrack" "Track"

Untuk mengakses tabel pada database dapat digunakan fungsi tbl dari package dplyr. Fungsi tbl berguna untuk melakukan konversi tabel ke bentuk data.frame dalam R. Berikut ilustrasinya

customer <- tbl(src=chinook,"Customer")
class(customer)
## [1] "tbl_SQLiteConnection" "tbl_dbi"              "tbl_sql"             
## [4] "tbl_lazy"             "tbl"
glimpse(customer)
## Rows: ??
## Columns: 13
## Database: sqlite 3.41.2 [D:\Kuliah\S2\Sains Data\Responsi\P3\Chinook_Sqlite.sqlite]
## $ CustomerId   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ FirstName    <chr> "Luís", "Leonie", "François", "Bjørn", "František", "Hele…
## $ LastName     <chr> "Gonçalves", "Köhler", "Tremblay", "Hansen", "Wichterlová…
## $ Company      <chr> "Embraer - Empresa Brasileira de Aeronáutica S.A.", NA, N…
## $ Address      <chr> "Av. Brigadeiro Faria Lima, 2170", "Theodor-Heuss-Straße …
## $ City         <chr> "São José dos Campos", "Stuttgart", "Montréal", "Oslo", "…
## $ State        <chr> "SP", NA, "QC", NA, NA, NA, NA, NA, NA, "SP", "SP", "RJ",…
## $ Country      <chr> "Brazil", "Germany", "Canada", "Norway", "Czech Republic"…
## $ PostalCode   <chr> "12227-000", "70174", "H2G 1A7", "0171", "14700", "14300"…
## $ Phone        <chr> "+55 (12) 3923-5555", "+49 0711 2842222", "+1 (514) 721-4…
## $ Fax          <chr> "+55 (12) 3923-5566", NA, NA, NA, "+420 2 4172 5555", NA,…
## $ Email        <chr> "luisg@embraer.com.br", "leonekohler@surfeu.de", "ftrembl…
## $ SupportRepId <int> 3, 5, 3, 4, 4, 5, 5, 4, 4, 4, 5, 3, 4, 5, 3, 4, 5, 3, 3, …
object.size(customer)
## 6192 bytes
customer2 <- as.data.frame(customer)
glimpse(customer2)
## Rows: 59
## Columns: 13
## $ CustomerId   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ FirstName    <chr> "Luís", "Leonie", "François", "Bjørn", "František", "Hele…
## $ LastName     <chr> "Gonçalves", "Köhler", "Tremblay", "Hansen", "Wichterlová…
## $ Company      <chr> "Embraer - Empresa Brasileira de Aeronáutica S.A.", NA, N…
## $ Address      <chr> "Av. Brigadeiro Faria Lima, 2170", "Theodor-Heuss-Straße …
## $ City         <chr> "São José dos Campos", "Stuttgart", "Montréal", "Oslo", "…
## $ State        <chr> "SP", NA, "QC", NA, NA, NA, NA, NA, NA, "SP", "SP", "RJ",…
## $ Country      <chr> "Brazil", "Germany", "Canada", "Norway", "Czech Republic"…
## $ PostalCode   <chr> "12227-000", "70174", "H2G 1A7", "0171", "14700", "14300"…
## $ Phone        <chr> "+55 (12) 3923-5555", "+49 0711 2842222", "+1 (514) 721-4…
## $ Fax          <chr> "+55 (12) 3923-5566", NA, NA, NA, "+420 2 4172 5555", NA,…
## $ Email        <chr> "luisg@embraer.com.br", "leonekohler@surfeu.de", "ftrembl…
## $ SupportRepId <int> 3, 5, 3, 4, 4, 5, 5, 4, 4, 4, 5, 3, 4, 5, 3, 4, 5, 3, 3, …
object.size(customer2)
## 39176 bytes

SQL Querry menggunakan dplyr

invoice <- tbl(src=chinook,"Invoice")
invoice_country <- invoice %>% 
  group_by(BillingCountry) %>% 
  summarize(mean_invoice=mean(Total))
show_query(invoice_country)
## Warning: Missing values are always removed in SQL aggregation functions.
## Use `na.rm = TRUE` to silence this warning
## This warning is displayed once every 8 hours.

## <SQL>
## SELECT `BillingCountry`, AVG(`Total`) AS `mean_invoice`
## FROM `Invoice`
## GROUP BY `BillingCountry`
invoice_country %>% as.data.frame() %>% View()

Penggabungan Tabel atau dataset

  1. Inner Join: penggabungan ini menghasilkan hanya baris-baris di mana ada kesesuaian di kedua tabel.
  2. Left Join: penggabungan ini menghasilkan semua baris dari tabel kiri dan baris yang bersesuaian dari tabel kanan. Jika tidak ada kesesuaian di tabel kanan, kita tetap akan mendapatkan data dari tabel kiri.
  3. Right Join: Ini seperti kebalikan dari left join. Kita akan mendapatkan semua baris dari tabel kanan dan dan baris yang bersesuaian dari tabel kiri. Jika tidak ada kesesuaian di tabel kiri, kita tetap akan mendapatkan data dari tabel kanan.
  4. Full Join: penggabungan ini menghasilkan semua baris dari tabel kiri dan kanan.
  5. Anti Join: penggabungan ini menghasilkan baris dari tabel kiri yang tidak bersesuaian dengan baris dari tabel kiri.
  6. Semi Join: penggabungan ini menghasilkan baris dari tabel kiri yang bersesuaian dengan baris dari tabel kanan. ## Left Join
res <- tbl(src=chinook,"customer") %>% 
  select(CustomerId,FirstName,LastName,Country) %>% 
  left_join(y = tbl(src=chinook,"Invoice") %>% select(CustomerId,Total) ,
            by = "CustomerId")
  
glimpse(res)
## Rows: ??
## Columns: 5
## Database: sqlite 3.41.2 [D:\Kuliah\S2\Sains Data\Responsi\P3\Chinook_Sqlite.sqlite]
## $ CustomerId <int> 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,…
## $ FirstName  <chr> "Luís", "Luís", "Luís", "Luís", "Luís", "Luís", "Luís", "Le…
## $ LastName   <chr> "Gonçalves", "Gonçalves", "Gonçalves", "Gonçalves", "Gonçal…
## $ Country    <chr> "Brazil", "Brazil", "Brazil", "Brazil", "Brazil", "Brazil",…
## $ Total      <dbl> 3.98, 3.96, 5.94, 0.99, 1.98, 13.86, 8.91, 1.98, 13.86, 8.9…
show_query(res)
## <SQL>
## SELECT
##   `customer`.`CustomerId` AS `CustomerId`,
##   `FirstName`,
##   `LastName`,
##   `Country`,
##   `Total`
## FROM `customer`
## LEFT JOIN `Invoice`
##   ON (`customer`.`CustomerId` = `Invoice`.`CustomerId`)
dbDisconnect(chinook)

Data Band Membership dari package dplyr

data("band_members",package = "dplyr")
band_members
## # A tibble: 3 × 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles
data("band_instruments",package = "dplyr")
band_instruments
## # A tibble: 3 × 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar
data("band_instruments2",package = "dplyr")
band_instruments2
## # A tibble: 3 × 2
##   artist plays 
##   <chr>  <chr> 
## 1 John   guitar
## 2 Paul   bass  
## 3 Keith  guitar

Fungsi Inner_Join()

inner_join(band_members,band_instruments,by="name")
## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass
band_members %>% 
  inner_join(band_instruments,by = "name")
## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

Mengambil bagian yang sama pada kolom nama

Fungsi left_join()

band_members %>% left_join(band_instruments)
## Joining with `by = join_by(name)`

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

Fungsi right_join

band_members %>% right_join(band_instruments)
## Joining with `by = join_by(name)`

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

Fungsi full_join()

band_members %>%  full_join(band_instruments)
## Joining with `by = join_by(name)`

## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Fungsi semi_join()

band_members %>%  semi_join(band_instruments)
## Joining with `by = join_by(name)`

## # A tibble: 2 × 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles

Fungsi anti_join()

band_members %>% anti_join(band_instruments)
## Joining with `by = join_by(name)`

## # A tibble: 1 × 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones