if(file.exists("/dsvm"))
{
# Set environment variables for the Data Science VM
Sys.setenv(SPARK_HOME = "/dsvm/tools/spark/current",
HADOOP_HOME = "/opt/hadoop/current",
YARN_CONF_DIR = "/opt/hadoop/current/etc/hadoop",
PATH = paste0(Sys.getenv("PATH"), ":/opt/hadoop/current/bin"),
JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-amd64"
)
} else {
Sys.setenv(SPARK_HOME="/usr/hdp/current/spark2-client")
}
USE_SPARK <- TRUE
USE_CLUSTER <- USE_SPARK & FALSE # TRUE
USE_HDFS <- USE_SPARK
LOCAL_DATA_DIR <- "."
LOCAL_DATA_FILE <- "unique_name_training_data.xdf"
TEST_DATA_FILE <- "unique_name_test_data.xdf"
HDFS_DATA_DIR <- "uniquenamedata"
HDFS_TEST_DIR <- "uniquenametest"
local_data_file <- file.path(LOCAL_DATA_DIR, LOCAL_DATA_FILE)
local_test_file <- file.path(LOCAL_DATA_DIR, TEST_DATA_FILE)
if (USE_HDFS){
HDFS_DATA_PATH <- if (USE_CLUSTER){
"/user/RevoShare/sshuser/learning_curves" # HDInsight
} else {
"/user/RevoShare/remoteuser/Data/learning_curves" # single node
}
hdfs_xdf <- RxXdfData(file.path(HDFS_DATA_PATH, HDFS_DATA_DIR),
fileSystem=RxHdfsFileSystem(),
createCompositeSet=TRUE,
blocksPerCompositeFile = 1)
hdfs_test_xdf <- RxXdfData(file.path(HDFS_DATA_PATH, HDFS_TEST_DIR),
fileSystem=RxHdfsFileSystem(),
createCompositeSet=TRUE,
blocksPerCompositeFile = 1)
if (!rxHadoopFileExists(hdfs_test_xdf@file)){
print("Creating HDFS directories and copying data")
dataPath <- strsplit(HDFS_DATA_PATH, "/", fixed=TRUE)[[1]]
for (depth in 2:length(dataPath)){
subPath <- paste(dataPath[1:depth], collapse='/')
rxHadoopMakeDir(subPath)
}
imported_data_table <- file.path(HDFS_DATA_PATH, "imported_data_table.xdf")
imported_test_set <- file.path(HDFS_DATA_PATH, "imported_test_set.xdf")
if (!rxHadoopFileExists(imported_data_table))
rxHadoopCopyFromLocal(local_data_file, imported_data_table)
if (!rxHadoopFileExists(imported_test_set))
rxHadoopCopyFromLocal(local_test_file, imported_test_set)
rxDataStep(RxXdfData(imported_data_table,
fileSystem=RxHdfsFileSystem()),
outFile=hdfs_xdf,
blocksPerRead=1e5, overwrite=TRUE)
rxDataStep(RxXdfData(imported_test_set,
fileSystem=RxHdfsFileSystem()),
outFile=hdfs_test_xdf,
blocksPerRead=1e5, overwrite=TRUE)
}
data_table <- hdfs_xdf
test_set <- hdfs_test_xdf
} else {
data_table <- RxXdfData(local_data_file)
test_set <- RxXdfData(local_test_file)
}
# collect metadata
xdf_info <- rxGetInfo(data_table)
# total number of cases in input dataset
N <- xdf_info$numRows
# Start time
t0 <- Sys.time()
# Use the featurizer to generate chargrams, and fit a linear model
fit <- rxFastLinear(
is_real ~ chargrams,
data_table,
type="binary", normalize="no",
l1Weight=0, l2Weight=1e-8,
mlTransforms = featurizeText(
vars = c(chargrams="name"),
case='lower',
keepNumbers=FALSE,
keepDiacritics=FALSE,
keepPunctuations=FALSE,
charFeatureExtractor=ngramCount(
ngramLength=3,
weighting="tf",
maxNumTerms=1e8
),
wordFeatureExtractor=NULL
)
)
## Not adding a normalizer.
## Using 2 threads to train.
## Automatically choosing a check frequency of 2.
## Auto-tuning parameters: maxIterations = 2.
## Using model from last iteration.
## Not training a calibrator because it is not needed.
## Elapsed time: 00:00:14.9120760
## Elapsed time: 00:00:00.2415402
t1 <- Sys.time()
difftime(t1, t0)
## Time difference of 15.89469 secs
# Use the model to score the test cases.
predictions <- rxPredict(fit, test_set, "predictions.xdf", overwrite=TRUE,
extraVarsToWrite=c("name", "category", "is_real"))
## Elapsed time: 00:00:06.7336246
# Cross-tabulate by predicted and actual category.
rxCrossTabs(~ F(PredictedLabel) : category, predictions)$counts
## $`F(PredictedLabel):category`
## category
## F_PredictedLabel pseudogibberish random real
## 0 237545 236087 2793
## 1 80 1468 20909
# Build an ROC curve from the `predictions` XDF file.
rxRocCurve("is_real", "Probability", predictions)
# This function pulls the best- (or worst-) scoring examples from a given category.
# In the first pass, it extracts the category members into their own XDF file. We
# take this opportunity to take the logarithm of the probability, because these values
# are more spread out and it is easier to use them for approximate quantiles.
# Then we load the top (or bottom) 1 percent of cases into memory, and only
# sort these.
get_extremes_from_category <- function(xdf, category="real", cmp=">", percent=99){
category_file <- sprintf("predictions_%s.xdf", category)
category_xdf <- rxDataStep(predictions,
category_file,
rowSelection=category==cat,
transforms=list(log10Probability=log10(Probability)),
transformObjects=list(cat=category),
overwrite=TRUE)
xformObj <- list(q100=rxQuantile("log10Probability",
category_xdf,
probs = (0:100)/100),
cmp_fun=get(cmp),
percentile=sprintf("%d%%",
if(cmp=="<")
percent
else
100 - percent))
pct_df <- rxDataStep(category_xdf,
rowSelection=cmp_fun(log10Probability, q100[[percentile]]),
transformObjects = xformObj)
pct_df[order(pct_df[["Probability"]], decreasing = TRUE),]
}
# Best-scoring real names:
head(get_extremes_from_category(predictions,
category="real",
cmp=">", percent=99))
## name category is_real PredictedLabel Score Probability
## 6262 Lisamarie real TRUE TRUE 10.74977 0.9999785
## 22064 Juliannamarie real TRUE TRUE 10.69621 0.9999774
## 3676 Marilyne real TRUE TRUE 10.60427 0.9999752
## 16248 Jessamarie real TRUE TRUE 10.53534 0.9999734
## 5854 Marisha real TRUE TRUE 10.44404 0.9999709
## 15172 Marleisha real TRUE TRUE 10.10326 0.9999591
## log10Probability
## 6262 -9.319049e-06
## 22064 -9.836779e-06
## 3676 -1.076870e-05
## 16248 -1.154530e-05
## 5854 -1.265842e-05
## 15172 -1.778402e-05
# Worst-scoring real names:
tail(get_extremes_from_category(predictions,
category="real",
cmp="<", percent=1))
## name category is_real PredictedLabel Score Probability
## 159 Ikponmwosa real TRUE FALSE -9.229097 9.813215e-05
## 89 Vu real TRUE FALSE -9.455945 7.821702e-05
## 171 Eygpt real TRUE FALSE -10.003417 4.524301e-05
## 50 Lg real TRUE FALSE -10.068079 4.241021e-05
## 204 Zzyzx real TRUE FALSE -10.181203 3.787419e-05
## 47 Lj real TRUE FALSE -10.338304 3.236814e-05
## log10Probability
## 159 -4.008189
## 89 -4.106699
## 171 -4.344448
## 50 -4.372530
## 204 -4.421657
## 47 -4.489882
# Best-scoring random names:
head(get_extremes_from_category(predictions,
category="random",
cmp=">", percent=99))
## name category is_real PredictedLabel Score Probability
## 13258 kellis random FALSE TRUE 6.233860 0.9980420
## 90424 ralie random FALSE TRUE 5.903892 0.9972786
## 4912 nerian random FALSE TRUE 5.561465 0.9961716
## 63470 tabion random FALSE TRUE 5.559600 0.9961644
## 155783 miria random FALSE TRUE 5.366848 0.9953529
## 5588 tyneema random FALSE TRUE 5.282112 0.9949440
## log10Probability
## 13258 -0.0008511876
## 90424 -0.0011834867
## 4912 -0.0016658466
## 63470 -0.0016689648
## 155783 -0.0020229295
## 5588 -0.0022013729
# Best-scoring pseudogibberish names:
head(get_extremes_from_category(predictions,
category="pseudogibberish",
cmp=">", percent=99))
## name category is_real PredictedLabel Score
## 83922 Ja\\][ pseudogibberish FALSE TRUE 3.225237
## 178696 Ja\\][ pseudogibberish FALSE TRUE 3.225237
## 21467 m,.777a pseudogibberish FALSE TRUE 2.800683
## 144302 m,./a\\][ pseudogibberish FALSE TRUE 2.800683
## 153836 m,a\\][ pseudogibberish FALSE TRUE 2.800683
## 106255 sa\\][?NA pseudogibberish FALSE TRUE 2.678231
## Probability log10Probability
## 83922 0.9617730 -0.01692740
## 178696 0.9617730 -0.01692740
## 21467 0.9427127 -0.02562063
## 144302 0.9427127 -0.02562063
## 153836 0.9427127 -0.02562063
## 106255 0.9357298 -0.02884954
# Examine the first few coefficients from the model
knitr::kable(data.frame(head(coef(fit), n=12)))
head.coef.fit…n…12. | |
---|---|
(Bias) | -6.966318 |
q | -14.922252 |
r|e|r | -14.612547 |
q|u | 13.826286 |
n|a|n | -13.708379 |
i|i | -12.873913 |
x | -11.916374 |
s|a|s | -11.900106 |
a|a | -11.502841 |
v | -11.345049 |
<␂>|a|<␃> | -11.299555 |
b|e|r | 10.852152 |
# How many cases in each category do we have in the test set?
# This is equivalent to: table(predictions$category)
rxCrossTabs(~ category, predictions)$counts
## $category
##
## pseudogibberish 237625
## random 237555
## real 23702
# How do they break down if we look at only names containing 'sas'?
# With in-memory data, you could do the same thing like this:
# xtabs(~ category, predictions[grep("sas", predictions$name, ignore.case=TRUE),])
rxCrossTabs(~ category, predictions,
rowSelection=grepl("sas", name, ignore.case=TRUE))$counts
## $category
##
## pseudogibberish 3354
## random 62
## real 8
# Real names are more likely to contain the letter 'i'
rxCrossTabs(~ category, predictions,
rowSelection=grepl("i", name, ignore.case=TRUE))$counts
## $category
##
## pseudogibberish 32142
## random 53583
## real 11735
# ... but less likely to contain 'ii'
rxCrossTabs(~ category, predictions,
rowSelection=grepl("ii", name, ignore.case=TRUE))$counts
## $category
##
## pseudogibberish 1106
## random 1884
## real 50
# Names rarely contain 'f'
rxCrossTabs(~ category, predictions,
rowSelection=grepl("f", name, ignore.case=TRUE))$counts
## $category
##
## pseudogibberish 74990
## random 53259
## real 652
# ... but 'ff' is more common than you would expect based on the frequency of 'f'
rxCrossTabs(~ category, predictions,
rowSelection=grepl("ff", name, ignore.case=TRUE))$counts
## $category
##
## pseudogibberish 2524
## random 1887
## real 64
# Load helper functions.
source("learning_curve_lib.R")
K_FOLDS <- 3 # number of cross-validation groups
SALT <- 1 # add to pseudorandom number generator seed
MAX_TSS <- (1 - 1/K_FOLDS) * N # approximate number of training cases.
# Define parameter ranges to scan over; we'll be more ambitious
# if we can run on a cluster.
if (USE_CLUSTER){
L1_WEIGHTS <- c(0, 10^(c(-8, -6)))
L2_WEIGHTS <- 10^c(-9, -6, -4)
# L2_WEIGHTS cannot be smaller than 9.9999999*10^-10
N_GRAM_LENGTHS <- 1:4
NUM_TSS <- 10
TEST_SET_KFOLD_IDS <- 1 # 1:3
} else {
L1_WEIGHTS <- 0
L2_WEIGHTS <- 10^c(-9, -5)
N_GRAM_LENGTHS <- 1:3
NUM_TSS <- 8
TEST_SET_KFOLD_IDS <- 1
}
names(N_GRAM_LENGTHS) <- N_GRAM_LENGTHS
# Calculate the proportions of the training set that we will actually use.
# These values are evenly spaced on a log scale.
training_fractions <- get_training_set_fractions(1000, MAX_TSS, NUM_TSS)
# rxFastLinear is one of the algorithms in MicrosoftML
library(MicrosoftML)
# Provide additional arguments specific for each learner, as needed.
# This approach is useful if you want to compare different learners.
LEARNERS <- list(
rxFastLinear=list(convergenceTolerance = 0.1,
normalize="No",
lossFunction=logLoss())
)
# Generate a set of text featurizer specifications;
# here each uses a different ngramLength.
ML_TRANSFORMS <- lapply(N_GRAM_LENGTHS, function(ngl)
featurizeText(vars = c(chargrams = "name"),
case='lower',
keepNumbers=FALSE,
keepDiacritics = FALSE,
keepPunctuations = FALSE,
charFeatureExtractor = ngramCount(
ngramLength=ngl, weighting = "tf", maxNumTerms=1e8),
wordFeatureExtractor = NULL))
# Collect the values for the parameter grid ...
grid_dimensions <- list( model_class=names(LEARNERS),
training_fraction=training_fractions,
with_formula="is_real ~ chargrams",
test_set_kfold_id=TEST_SET_KFOLD_IDS,
KFOLDS=K_FOLDS,
mlTransforms=ML_TRANSFORMS,
l1Weight = L1_WEIGHTS,
l2Weight = L2_WEIGHTS
)
# ... and expand the grid into a dataframe.
parameter_table <- do.call(expand.grid, c(grid_dimensions, stringsAsFactors=FALSE))
# Examine the resulting parameter table.
head(parameter_table)
## model_class training_fraction with_formula test_set_kfold_id
## 1 rxFastLinear 0.001002243 is_real ~ chargrams 1
## 2 rxFastLinear 0.002687853 is_real ~ chargrams 1
## 3 rxFastLinear 0.007208384 is_real ~ chargrams 1
## 4 rxFastLinear 0.019331712 is_real ~ chargrams 1
## 5 rxFastLinear 0.051844505 is_real ~ chargrams 1
## 6 rxFastLinear 0.139038526 is_real ~ chargrams 1
## KFOLDS
## 1 3
## 2 3
## 3 3
## 4 3
## 5 3
## 6 3
## mlTransforms
## 1 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 2 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 3 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 4 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 5 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 6 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## l1Weight l2Weight
## 1 0 1e-09
## 2 0 1e-09
## 3 0 1e-09
## 4 0 1e-09
## 5 0 1e-09
## 6 0 1e-09
# Generate a list of parameter sets. Each row in the parameter table becomes
# an element in this list, plus we add other settings that are not part of the grid.
parameter_list <- lapply(1:nrow(parameter_table), function(i){
par <- parameter_table[i,]
par <- as.list(c(data_table=data_table,
par,
LEARNERS[[par$model_class]],
type="binary"))
par
})
# Set the compute context to determine where the jobs will be sent.
if (USE_CLUSTER){
cc <- rxSparkConnect(
consoleOutput=TRUE,
numExecutors=4,
executorCores=8,
executorOverheadMemory = "20000m")
} else if (USE_SPARK){
cc <- rxSparkConnect(consoleOutput = TRUE,
executorCores = 4,
driverMem = "2g",
executorMem = "2g",
executorOverheadMem = "4g")
} else {
rxSetComputeContext("localpar")
}
# Cache the results manually, so that RMarkdown can use the results
# we get by stepping through the code.
TRAINING_RESULTS_FILE <- "training_results.Rds"
if (file.exists(TRAINING_RESULTS_FILE)){
training_results <- readRDS(TRAINING_RESULTS_FILE)
} else {
t3 <- Sys.time()
# Farm out the sets of parameters as jobs sent to the current
# compute context.
# data_table <- data_table@file
training_results <- rxExec(run_training_fraction,
elemArgs = parameter_list,
execObjects = c("data_table", "SALT")) #
print(difftime(Sys.time(), t3))
# Cache the results
saveRDS(training_results, TRAINING_RESULTS_FILE)
}
if (USE_SPARK) rxSparkDisconnect(cc)
Since the results of all these calculations fit easily into memory, we can use open source R from here on. This is a tidyverse
pipeline
library(dplyr)
library(tidyr)
library(ggplot2)
# Note that we need to pull the value of ngramLength out of the mltransform
# specification using a regular expression
tidy_results <- training_results %>%
lapply(function(tr){
names(tr)[10] <- "mltransforms"
tr
}) %>%
bind_rows %>%
gather(set, AUC, training, test) %>%
mutate(kfold = factor(kfold),
l1Weight=factor(l1Weight),
l2Weight=factor(l2Weight),
ngramLength=gsub(".*ngramLength=([0-9]+) .*", "\\1", mltransforms))
# Plot the grid of results
tidy_results %>%
ggplot(aes(x=log10(tss), y=AUC, col=ngramLength, linetype=set)) +
geom_line(size=1.0) +
facet_grid(l1Weight ~ l2Weight) +
ggtitle("faceting by regularization weights")
# Zoom in on the best parameter set
max_auc <- tidy_results %>%
filter(set=="test") %>%
select(AUC) %>%
max
max_p <- tidy_results %>% filter(set=="test" & AUC==max_auc)
tidy_results %>%
filter(l1Weight==max_p$l1Weight & l2Weight==max_p$l2Weight) %>%
ggplot(aes(x=log10(tss), y=AUC, col=ngramLength, linetype=set,
group=interaction(set, kfold, ngramLength))) +
geom_line(size=1.0) +
coord_cartesian(ylim=c(0.995, 1.0)) +
facet_grid(l1Weight ~ l2Weight) +
ggtitle("zoom in on best test results")