opts_chunk$set(eval = FALSE)

Some preliminaries to get the data in order.

# Oceans2
rm(list = ls())
require(ggplot2)
require(plyr)
require(lubridate)
source("ICOADS parsing.R")
source("../Map Functions.R")

Pulling in the Maury data

data = loadInData("~/shipping/ICOADS/maury.txt")

Doing the split voyage level.

data = splitDataByVoyage(data)

Save the R data.frame into a directory that MALLET will be able to interpret as a text, and run a topic model on it:

MALLEABLE = data.frame(text = data$voyageid, word = paste(round(data$lat, 1), 
    round(data$long, 1), sep = "X"))

# I'm p system('mkdir /tmp/MALLET') system('rm /tmp/MALLET/*')

# Write one text file for each voyage with all the points it passed
# through.
ddply(MALLEABLE, .(text), function(text) {
    write.table(text$word, paste("/tmp/MALLET/", text$text[1], ".txt", sep = ""), 
        row.names = F, col.names = F, quote = F)
}, .progress = "text")


# To get it to take numbers, I need to change the token-regex; that was
# the only major hiccup I found here.
system("cd ~/mallet-2.0.7/; bin/mallet import-dir --input /tmp/MALLET --output ships.mallet --keep-sequence --token-regex \"\\S+\";")
# I ran this several times with topic topic sizes; only including in the
# post 9 and 25
system("cd ~/mallet-2.0.7/; bin/mallet train-topics --input ships.mallet --output-topic-keys shipKeys.txt --num-topics 16 --output-doc-topics shipTopics.txt --output-state state.txt.gz;")

# This is the data I want to plot.
system("cd ~/mallet-2.0.7/; gunzip -f state.txt.gz")

Read in those MALLET results and clean them up for analysis.

malletresults = read.table("~/mallet-2.0.7/state.txt", sep = " ", col.names = c("?", 
    "file", "loc", "total", "point", "topic"), colClasses = c("NULL", "factor", 
    "NULL", "NULL", "factor", "factor"))
nrow(malletresults)
length(unique(malletresults$file))
levels(malletresults$file) = gsub(".*/", "", levels(malletresults$file))
levels(malletresults$file) = gsub(".txt", "", levels(malletresults$file))

words = (strsplit(as.character(malletresults$point), "x"))
plottable = ldply(words, as.numeric, progress = "text")

names(plottable) = c("lat", "long")
plottable$topic = malletresults$topic
plottable$long[plottable$long > 180] = plottable$long[plottable$long > 180] - 
    360

Make some plots:

ggplot(plottable, aes(x = as.numeric(long), y = as.numeric(lat))) + geom_point(alpha = 0.01, 
    size = 0.3) + facet_wrap(~topic, scales = "free", ncol = 4) + annotation_map(map = world, 
    fill = "#F3E0BD") + theme_nothing + labs(title = "Distribution of points across a 9-topic model")

ggplot(plottable[plottable$topic == 8, ], aes(x = as.numeric(long), y = as.numeric(lat))) + 
    geom_point(alpha = 0.04, size = 0.5) + facet_wrap(~topic, scales = "free", 
    ncol = 4) + annotation_map(map = world, fill = "#F3E0BD") + theme_nothing + 
    labs(title = "Topic 8 is a chimera of a sort text-based topic modelling analysis wouldn't uncover")

counts = table(plottable$word)
countz = as.data.frame(counts)
countz$rank = rank(-countz$Freq)
ggplot(countz, aes(x = rank, y = Freq)) + geom_point() + scale_x_log10() + scale_y_log10()

Try K-means clustering on the voyages in topic space (note: this didn't work–most ended up in a single k-means cluster–so I didn't include it in the blog post). Might work better with a higher number of topics, a couple hundred or something, but that would take better priors.

topicdists = table(malletresults$file, malletresults$topic)
head(topicdists)
topicdists = apply(topicdists, 2, function(z) {
    z/sum(z, na.rm = T)
})
class(topicdists) = "matrix"
f = kmeans(topicdists, centers = 9)
data$cluster = f$cluster[match(data$voyageid, names(f$cluster))]

offset = 220
plotWorld = Recenter(world, offset, idfield = "group")
plotData = Recenter(data, offset, shapeType = "segment", idfield = "voyageid")
head(plotData)
ggplot(plotData[!is.na(plotData$cluster), ], aes(x = as.numeric(long), y = as.numeric(lat))) + 
    geom_path(aes(group = group), alpha = 0.02) + annotation_map(map = plotWorld, 
    fill = "beige") + facet_wrap(~cluster, ncol = 3) + theme_nothign