Dr. Markus Schmidberger
October 14th, 2013 Munich, Germany
Email: markus@mongosoup.de
Twitter: @cloudHPC
tomorrow: forward-looking predictive analysis
more complex methods, more data available, more processing time required
Check my Strata London Tutorial “Big Data Analyses with R”
(5+5) - 1 * 3
[1] 7
x <- 3
x
[1] 3
x^2 + 4
[1] 13
y <- c(1,2,3)
y
[1] 1 2 3
x <- 1:10
x
[1] 1 2 3 4 5 6 7 8 9 10
x < 5
[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
x[3:7]
[1] 3 4 5 6 7
mean(x)
[1] 5.5
help("mean")
?mean
library(onion)
data(bunny)
head(bunny, n=3)
x y z
[1,] -0.03783 0.1279 0.004475
[2,] -0.04478 0.1289 0.001905
[3,] -0.06801 0.1512 0.037195
p3d(bunny,theta=3,
phi=104,box=FALSE)
# Cluster `dat` into 4 groups and keep the fit in `cl`: the plotting calls
# further down read cl$cluster and cl$centers. The surrounding parentheses
# make R print the result in addition to assigning it.
# NOTE(review): `dat` is not created anywhere in the visible code; it must
# already exist in the session (presumably a two-column numeric matrix).
(cl <- kmeans(dat, 4))
K-means clustering with 4 clusters of sizes 21, 18, 30, 31
Cluster means:
[,1] [,2]
1 0.7755 0.8509
2 -0.1557 -0.2305
3 1.2299 1.1472
4 0.1510 0.1507
Clustering vector:
[1] 4 2 4 4 2 4 4 4 2 4 4 4 2 2 4 4 1 4 2 2 2 4 4 4 2 4 2 4 4 2 4 2 2 4 4
[36] 4 4 4 4 4 4 4 4 2 4 2 2 4 2 2 1 1 1 1 3 1 3 3 3 1 1 3 3 3 3 1 3 1 3 3
[71] 1 3 1 1 3 3 3 3 1 1 3 3 1 1 1 3 3 3 3 1 3 1 3 3 3 3 1 3 3 3
Within cluster sum of squares by cluster:
[1] 3.318 1.166 4.019 3.195
(between_SS / total_SS = 83.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss"
[5] "tot.withinss" "betweenss" "size"
# Draw the observations, using each point's cluster id as its colour.
plot(
  dat,
  pch = 16,
  cex = 2,
  col = cl$cluster
)
# Overlay the cluster centres with a larger marker, colours 1 to 4.
points(
  cl$centers,
  pch = 13,
  cex = 4,
  col = 1:4
)
data(iris)
head(iris, n=3)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
class(iris)
[1] "data.frame"
running SQL statements on R data frames
library(sqldf)
sqldf("select * from iris limit 2")
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
sqldf("select count(*) from iris")
count(*)
1 150
on CRAN there are two packages to connect R with MongoDB
# RMongo session: connect, authenticate, list collections, run one query,
# insert one document, disconnect.
# The CRAN package is called "RMongo"; package names are case-sensitive, so
# library(Rmongo) fails with "there is no package called 'Rmongo'".
library(RMongo)
mongo <- mongoDbConnect("cc_JwQcDLJSYQJb", "dbs001.mongosoup.de", 27017)
# NOTE(review): credentials are hard-coded here; in real code read them from
# the environment (e.g. Sys.getenv()) instead of committing them.
dbAuthenticate(mongo, username="JwQcDLJSYQJb", password="RSXPkUkXXXXX")
dbShowCollections(mongo)
# The query is passed as a JSON string: all documents with state == "AL".
dbGetQuery(mongo, "zips","{'state':'AL'}")
dbInsertDocument(mongo, "test_data", '{"foo": "bar", "size": 5 }')
dbDisconnect(mongo)
library(rmongodb)
mongo <- mongo.create(host="dbs001.mongosoup.de", db="cc_JwQcDLJSYQJb", username="JwQcDLJSYQJb", password="RSXPkUkXXXXX")
mongo
[1] 0
attr(,"mongo")
<pointer: 0x105a1de80>
attr(,"class")
[1] "mongo"
attr(,"host")
[1] "dbs001.mongosoup.de"
attr(,"name")
[1] ""
attr(,"username")
[1] "JwQcDLJSYQJb"
attr(,"password")
[1] "RSXPkUkXXXXX"
attr(,"db")
[1] "cc_JwQcDLJSYQJb"
attr(,"timeout")
[1] 0
mongo.get.database.collections(mongo, "cc_JwQcDLJSYQJb")
[1] "cc_JwQcDLJSYQJb.zips" "cc_JwQcDLJSYQJb.ccp" "cc_JwQcDLJSYQJb.test"
mongo <- mongo.disconnect(mongo)
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "state", "AL")
[1] TRUE
query <- mongo.bson.from.buffer(buf)
query
state : 2 AL
res <- mongo.find.one(mongo, "cc_JwQcDLJSYQJb.zips", query)
res
city : 2 ACMAR
loc : 4
0 : 1 -86.515570
1 : 1 33.584132
pop : 16 6055
state : 2 AL
_id : 2 35004
out <- mongo.bson.to.list(res)
out$loc
[1] -86.52 33.58
typeof(out$loc)
[1] "double"
out$pop
[1] 6055
out$state
[1] "AL"
# Fetch every document matching `query` from the zips collection and stack
# the converted records row by row into `res`.
res <- NULL
cursor <- mongo.find(mongo, "cc_JwQcDLJSYQJb.zips", query)
repeat {
  # Advance the cursor; stop once there is no further document.
  if (!mongo.cursor.next(cursor)) {
    break
  }
  value <- mongo.cursor.value(cursor)
  # Convert the BSON document to an R list. The name `Rvalue` is kept on
  # purpose: rbind() uses it as the row label of the appended row.
  Rvalue <- mongo.bson.to.list(value)
  res <- rbind(res, Rvalue)
}
# Release the cursor now that iteration is finished.
err <- mongo.cursor.destroy(cursor)
head(res, n=4)
city loc pop state _id
Rvalue "ACMAR" Numeric,2 6055 "AL" "35004"
Rvalue "ADAMSVILLE" Numeric,2 10616 "AL" "35005"
Rvalue "ADGER" Numeric,2 3205 "AL" "35006"
Rvalue "KEYSTONE" Numeric,2 14218 "AL" "35007"
b <- mongo.bson.from.list(
list(name="Fred", age=29, city="Boston"))
b
name : 2 Fred
age : 1 29.000000
city : 2 Boston
mongo.bson.to.list(b)
$name
[1] "Fred"
$age
[1] 29
$city
[1] "Boston"
?mongo.bson
?mongo.bson.buffer.append
?mongo.bson.buffer.start.array
?mongo.bson.buffer.start.object
# Build the BSON document for an aggregation command on the zips collection:
#   { aggregate: "zips",
#     pipeline: [ { $group: { _id: "$state", totalPop: { $sum: "$pop" } } },
#                 { $match: { totalPop: { $gte: 10000 } } } ] }
# Each pipeline stage has to be its own document inside the array, so the
# stages are wrapped in array elements named "0" and "1". The $gte threshold
# is appended as a number; a string "10000" would never match a numeric sum.
buf <- mongo.bson.buffer.create()
mongo.bson.buffer.append(buf, "aggregate", "zips")
mongo.bson.buffer.start.array(buf, "pipeline")
# Stage 0: total population per state.
mongo.bson.buffer.start.object(buf, "0")
mongo.bson.buffer.start.object(buf, "$group")
mongo.bson.buffer.append(buf, "_id", "$state")
mongo.bson.buffer.start.object(buf, "totalPop")
mongo.bson.buffer.append(buf, "$sum", "$pop")
mongo.bson.buffer.finish.object(buf)  # closes totalPop
mongo.bson.buffer.finish.object(buf)  # closes $group
mongo.bson.buffer.finish.object(buf)  # closes stage 0
# Stage 1: keep only the groups whose total reaches the threshold.
mongo.bson.buffer.start.object(buf, "1")
mongo.bson.buffer.start.object(buf, "$match")
mongo.bson.buffer.start.object(buf, "totalPop")
mongo.bson.buffer.append(buf, "$gte", 10000)
mongo.bson.buffer.finish.object(buf)  # closes totalPop
mongo.bson.buffer.finish.object(buf)  # closes $match
mongo.bson.buffer.finish.object(buf)  # closes stage 1
mongo.bson.buffer.finish.object(buf)  # closes the pipeline array
query <- mongo.bson.from.buffer(buf)
# Query the ccp collection with an empty filter, requesting only the "user"
# and "type" fields, collect up to 1000 records and plot a summary.

# An empty buffer yields an empty query document (no filter conditions).
buf <- mongo.bson.buffer.create()
query <- mongo.bson.from.buffer(buf)

# Field selector: a value of 1 marks a field to be returned.
buf <- mongo.bson.buffer.create()
err <- mongo.bson.buffer.append(buf, "user", 1)
err <- mongo.bson.buffer.append(buf, "type", 1)
field <- mongo.bson.from.buffer(buf)

out <- mongo.find(mongo, "cc_JwQcDLJSYQJb.ccp", query, fields=field, limit=1000)
res <- NULL
while (mongo.cursor.next(out)){
  value <- mongo.cursor.value(out)
  Rvalue <- mongo.bson.to.list(value)
  res <- rbind(res, Rvalue)
}
# Release the cursor once iteration is finished; the original never
# destroyed it.
err <- mongo.cursor.destroy(out)

# Count how often each distinct value of column 2 occurs and show the
# distribution of those counts.
# NOTE(review): presumably column 2 holds the user field; confirm against the
# field order the server actually returns.
boxplot( as.integer(table(unlist(res[,2])) ), cex=4, horizontal=TRUE, main="Number of actions per user")
Email: markus@mongosoup.de
Twitter: @cloudHPC