opts_chunk$set(eval = FALSE)
Some preliminaries to get the data in order.
# Oceans2
rm(list = ls())
require(ggplot2)
require(plyr)
require(lubridate)
source("ICOADS parsing.R")
source("../Map Functions.R")
Pulling in the Maury data
data = loadInData("~/shipping/ICOADS/maury.txt")
Doing the split voyage level.
data = splitDataByVoyage(data)
Save the R data.frame into a directory that MALLET will be able to interpret as a text, and run a topic model on it:
MALLEABLE = data.frame(text = data$voyageid, word = paste(round(data$lat, 1),
round(data$long, 1), sep = "X"))
# I'm p system('mkdir /tmp/MALLET') system('rm /tmp/MALLET/*')
# Write one text file for each voyage with all the points it passed
# through.
ddply(MALLEABLE, .(text), function(text) {
write.table(text$word, paste("/tmp/MALLET/", text$text[1], ".txt", sep = ""),
row.names = F, col.names = F, quote = F)
}, .progress = "text")
# To get it to take numbers, I need to change the token-regex; that was
# the only major hiccup I found here.
system("cd ~/mallet-2.0.7/; bin/mallet import-dir --input /tmp/MALLET --output ships.mallet --keep-sequence --token-regex \"\\S+\";")
# I ran this several times with topic topic sizes; only including in the
# post 9 and 25
system("cd ~/mallet-2.0.7/; bin/mallet train-topics --input ships.mallet --output-topic-keys shipKeys.txt --num-topics 16 --output-doc-topics shipTopics.txt --output-state state.txt.gz;")
# This is the data I want to plot.
system("cd ~/mallet-2.0.7/; gunzip -f state.txt.gz")
Read in those MALLET results and clean them up for analysis.
malletresults = read.table("~/mallet-2.0.7/state.txt", sep = " ", col.names = c("?",
"file", "loc", "total", "point", "topic"), colClasses = c("NULL", "factor",
"NULL", "NULL", "factor", "factor"))
nrow(malletresults)
length(unique(malletresults$file))
levels(malletresults$file) = gsub(".*/", "", levels(malletresults$file))
levels(malletresults$file) = gsub(".txt", "", levels(malletresults$file))
words = (strsplit(as.character(malletresults$point), "x"))
plottable = ldply(words, as.numeric, progress = "text")
names(plottable) = c("lat", "long")
plottable$topic = malletresults$topic
plottable$long[plottable$long > 180] = plottable$long[plottable$long > 180] -
360
Make some plots:
ggplot(plottable, aes(x = as.numeric(long), y = as.numeric(lat))) + geom_point(alpha = 0.01,
size = 0.3) + facet_wrap(~topic, scales = "free", ncol = 4) + annotation_map(map = world,
fill = "#F3E0BD") + theme_nothing + labs(title = "Distribution of points across a 9-topic model")
ggplot(plottable[plottable$topic == 8, ], aes(x = as.numeric(long), y = as.numeric(lat))) +
geom_point(alpha = 0.04, size = 0.5) + facet_wrap(~topic, scales = "free",
ncol = 4) + annotation_map(map = world, fill = "#F3E0BD") + theme_nothing +
labs(title = "Topic 8 is a chimera of a sort text-based topic modelling analysis wouldn't uncover")
counts = table(plottable$word)
countz = as.data.frame(counts)
countz$rank = rank(-countz$Freq)
ggplot(countz, aes(x = rank, y = Freq)) + geom_point() + scale_x_log10() + scale_y_log10()
Try K-means clustering on the voyages in topic space (note: this didn't work–most ended up in a single k-means cluster–so I didn't include it in the blog post). Might work better with a higher number of topics, a couple hundred or something, but that would take better priors.
topicdists = table(malletresults$file, malletresults$topic)
head(topicdists)
topicdists = apply(topicdists, 2, function(z) {
z/sum(z, na.rm = T)
})
class(topicdists) = "matrix"
f = kmeans(topicdists, centers = 9)
data$cluster = f$cluster[match(data$voyageid, names(f$cluster))]
offset = 220
plotWorld = Recenter(world, offset, idfield = "group")
plotData = Recenter(data, offset, shapeType = "segment", idfield = "voyageid")
head(plotData)
ggplot(plotData[!is.na(plotData$cluster), ], aes(x = as.numeric(long), y = as.numeric(lat))) +
geom_path(aes(group = group), alpha = 0.02) + annotation_map(map = plotWorld,
fill = "beige") + facet_wrap(~cluster, ncol = 3) + theme_nothign