How to use MongoDB with R
Berikut ini adalah Materi Praktikum 12 dari Mata Kuliah STA562-Manajemen Data Statistika Mahasiswa Magister Statistika dan Sains Data untuk Peminatan Big Data Analytics
Persiapan
Beberapa hal yang harus disiapkan sebelum memulai praktikum ini adalah
- R dan atau RStudio
- MongoDB Atlas free-tier cluster (sudah dilakukan pada praktikum 11)
- Load Sample Dataset pada Cluster yang sudah Anda buat (secara default dinamakan Cluster0 )
RStudio
Pada RStudio, Anda perlu untuk melakukan instalasi package mongolite
install.packages("mongolite")
library(mongolite)
MongoDB Atlas Free Tier
Pada bagian ini, jika Anda sudah membuat Cluster0
(atau nama lain) Anda perlu melakukan 2 hal,
- Membuat Username dan Password pada Menu Security>Database Access.
Khusus Password, jika Anda menggunakan karakter spesial seperti @ dan teman-temannya, Anda perlu merubahnya menjadi URL Encoded.
Password : p@ssword
Menjadi : p%40ssword
- Pada Network Access, bagian IP Addresses masukkan 0.0.0.0/0 (Allow Access from anywhere –> ini Optional).
Load Sample Dataset
Mengingat Proses Load Sample Dataset ini cukup memakan waktu, Anda diharapkan sudah melakukan ini sebelum dilakukan Praktikum dimulai.
MongoDB dengan R
Menyambungkan MongoDB Database ke R
Untuk mendapatkan cluster-name
dan code
, Anda bisa membuka Menu Deployment dan pilih Opsi Connect, lalu pilih Connect Your Application.
# This is the connection_string. You can get the exact url from your MongoDB cluster screen
= 'mongodb+srv://<username>:<password>@<cluster-name>.<code>.mongodb.net/sample_training'
connection_string = mongo(collection="trips",
trips_collection db="sample_training",
url=connection_string)
Untuk melihat username
dan password
, Anda bisa buka di Menu Security>Database Access.
Anda bisa melakukan pemeriksaan apakah R Anda sudah tersambung dengan MongoDB database sample_training
dengan memeriksa jumlah dokumen pada trips_collection
$count() trips_collection
## [1] 10000
Memeriksa Struktur Data
Berikut ini adalah contoh salah satu dokumen dari trips_collection
$iterate()$one() trips_collection
## $tripduration
## [1] 240
##
## $`start station id`
## [1] 3165
##
## $`start station name`
## [1] "Central Park West & W 72 St"
##
## $`end station id`
## [1] 3168
##
## $`end station name`
## [1] "Central Park West & W 85 St"
##
## $bikeid
## [1] 22174
##
## $usertype
## [1] "Subscriber"
##
## $`birth year`
## [1] 1976
##
## $`start station location`
## $`start station location`$type
## [1] "Point"
##
## $`start station location`$coordinates
## $`start station location`$coordinates[[1]]
## [1] -73.97621
##
## $`start station location`$coordinates[[2]]
## [1] 40.77579
##
##
##
## $`end station location`
## $`end station location`$type
## [1] "Point"
##
## $`end station location`$coordinates
## $`end station location`$coordinates[[1]]
## [1] -73.96962
##
## $`end station location`$coordinates[[2]]
## [1] 40.78473
##
##
##
## $`start time`
## [1] "2016-01-01 00:06:35 UTC"
##
## $`stop time`
## [1] "2016-01-01 00:10:36 UTC"
Anda sebenarnya bisa melihatnya dari MongoDB Atlas, ataupun RStudio
Beberapa Contoh Query
Durasi Perjalanan Terpanjang
$find(sort = '{"tripduration" : -1}' ,
trips_collectionlimit = 10,
fields = '{"_id" : true, "tripduration" : true}')
## _id tripduration
## 1 572bb8222b288919b68ac07c 326222
## 2 572bb8232b288919b68b0f0d 279620
## 3 572bb8232b288919b68b0593 173357
## 4 572bb8232b288919b68ae9ee 152023
## 5 572bb8222b288919b68ac1f0 146099
## 6 572bb8222b288919b68ac34d 142890
## 7 572bb8222b288919b68ad078 95221
## 8 572bb8222b288919b68acf5a 89241
## 9 572bb8222b288919b68ad221 86715
## 10 572bb8222b288919b68acf62 81807
Query Lebih Lanjut
Berikut ini adalah contoh Query, dimana dicari Dokumen dengan - usertype= Subscriber
,
- tripduration > 500 detik,
- dengan Start dan End Stasiun namanya adalah sama.
= trips_collection$find('{"usertype":"Subscriber",
query "tripduration":{"$gt":500},
"$expr": {"$eq": ["$start station name",
"$end station name"]}}')
Beberapa Operasi Dasar di R
class(query)
## [1] "data.frame"
nrow(query)
## [1] 97
rbind(head(query,2),tail(query,2))
## tripduration start station id start station name end station id
## 1 2494 504 1 Ave & E 15 St 504
## 2 1480 2006 Central Park S & 6 Ave 2006
## 96 2397 3160 Central Park West & W 76 St 3160
## 97 928 478 11 Ave & W 41 St 478
## end station name bikeid usertype birth year
## 1 1 Ave & E 15 St 22584 Subscriber 1959
## 2 Central Park S & 6 Ave 14562 Subscriber 1952
## 96 Central Park West & W 76 St 17682 Subscriber 1988
## 97 11 Ave & W 41 St 17747 Subscriber 1994
## start station location.type start station location.coordinates
## 1 Point -73.98166, 40.73222
## 2 Point -73.97634, 40.76591
## 96 Point -73.97375, 40.77897
## 97 Point -73.99884, 40.76030
## end station location.type end station location.coordinates
## 1 Point -73.98166, 40.73222
## 2 Point -73.97634, 40.76591
## 96 Point -73.97375, 40.77897
## 97 Point -73.99884, 40.76030
## start time stop time
## 1 2016-01-01 00:20:05 2016-01-01 01:01:40
## 2 2016-01-01 00:01:12 2016-01-01 00:25:52
## 96 2016-01-02 16:47:56 2016-01-02 17:27:53
## 97 2016-01-02 18:04:58 2016-01-02 18:20:27
Visualisasi
More Common User
= trips_collection$aggregate('[{"$group":{"_id":"$usertype", "Count": {"$sum":1}}}]') user_types
library(tidyverse)
library(lubridate)
library(ggplot2)
ggplot(user_types,
aes(x=reorder(`_id`,Count),
y=Count))+
geom_bar(stat="identity",
color='yellow',
fill='#FFC300')+
geom_text(aes(label = Count),
color = "red")+
coord_flip()+
xlab("User Type")
Failed Inspections
Pada Visualisasi ini, tidak lagi digunakan trips_collection, melainkan inspections_collection.
= mongo(collection="inspections",
inspections_collection db="sample_training",
url=connection_string)
= inspections_collection$aggregate('[{"$addFields": {"format_year":{"$year":{"$toDate":"$date"}}}},
year_failures {"$match":{"result":"Fail"}},
{"$group":{"_id":"$format_year", "Failed": {"$sum":1}}}]')
ggplot(year_failures,
aes(x=reorder(`_id`,Failed),
y=Failed))+
geom_bar(stat="identity",
width=0.4,
color='skyblue',
fill='skyblue')+
geom_text(aes(label = Failed),
color = "black")+
coord_flip()+
xlab("Year")
Companies Trend
Pada Visualisasi ini, menggunakan companies_collection.
= mongo(collection="companies",
companies_collection db="sample_training",
url=connection_string)
= companies_collection$aggregate('[
consulting_companies_year_wise {"$match":{"category_code":"consulting","founded_year":{"$gt":2003}}},
{"$group":{"_id":"$founded_year", "Count": {"$sum":1}}},
{"$sort":{"_id": 1}}
]')
ggplot(consulting_companies_year_wise,
aes(x=`_id`,y=Count))+
geom_line(size=2,
color="blue")+
geom_point(size=4,
color="red")+
ylab("Number of consulting companies")+
ggtitle("Year-wise (2004 onwards) companies founded in the category 'consulting'")+
xlab("Year")
Companies Location
Pada visualisasi ini masih digunakan companies_collection.
# Get the location array objects
= companies_collection$aggregate('[{"$match":{"name":"Facebook"}},
fb_locs {"$unwind":{"path":"$offices"}}]')
# Get individual fields from each array object
<- fb_locs$offices$longitude
loc_long <- fb_locs$offices$latitude
loc_lat <- fb_locs$offices$city
loc_city
# Plot the map
# install.packages("maps")
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
map("world",
fill=TRUE,
col="white",
bg="lightblue",
ylim=c(-60, 90),
mar=c(0,0,0,0))
points(loc_long,
loc_lat,col="red",
pch=16)
text(loc_long,
y = loc_lat,
loc_city,pos = 4,
col="red")
Referensi
..https://www.mongodb.com/languages/mongodb-and-r-example
Badan Informasi Geospasial, abdul.aziz@big.go.id↩︎