if(file.exists("/dsvm"))
{
  # Set environment variables for the Data Science VM
  Sys.setenv(SPARK_HOME = "/dsvm/tools/spark/current",
             HADOOP_HOME = "/opt/hadoop/current",
             YARN_CONF_DIR = "/opt/hadoop/current/etc/hadoop", 
             PATH = paste0(Sys.getenv("PATH"), ":/opt/hadoop/current/bin"),
             JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-amd64"
  )
} else {
  Sys.setenv(SPARK_HOME="/usr/hdp/current/spark2-client")
}
USE_SPARK <- TRUE
USE_CLUSTER <- USE_SPARK & FALSE # TRUE

USE_HDFS <- USE_SPARK

LOCAL_DATA_DIR <- "."
LOCAL_DATA_FILE <- "unique_name_training_data.xdf"

TEST_DATA_FILE <- "unique_name_test_data.xdf"

HDFS_DATA_DIR <- "uniquenamedata"
HDFS_TEST_DIR <- "uniquenametest"

local_data_file <- file.path(LOCAL_DATA_DIR, LOCAL_DATA_FILE)
local_test_file <- file.path(LOCAL_DATA_DIR, TEST_DATA_FILE)

if (USE_HDFS){
  HDFS_DATA_PATH <-  if (USE_CLUSTER){
    "/user/RevoShare/sshuser/learning_curves" # HDInsight
  } else {
    "/user/RevoShare/remoteuser/Data/learning_curves"  # single node
  }
  
  hdfs_xdf <- RxXdfData(file.path(HDFS_DATA_PATH, HDFS_DATA_DIR),
                          fileSystem=RxHdfsFileSystem(), 
                          createCompositeSet=TRUE,
                          blocksPerCompositeFile = 1)
  
  hdfs_test_xdf <- RxXdfData(file.path(HDFS_DATA_PATH, HDFS_TEST_DIR),
                          fileSystem=RxHdfsFileSystem(), 
                          createCompositeSet=TRUE,
                          blocksPerCompositeFile = 1)

  if (!rxHadoopFileExists(hdfs_test_xdf@file)){
    print("Creating HDFS directories and copying data")
    
    dataPath <- strsplit(HDFS_DATA_PATH, "/", fixed=TRUE)[[1]]
    
    for (depth in 2:length(dataPath)){
      subPath <- paste(dataPath[1:depth], collapse='/')
      rxHadoopMakeDir(subPath)
    }
    
    imported_data_table <- file.path(HDFS_DATA_PATH, "imported_data_table.xdf")
    imported_test_set <- file.path(HDFS_DATA_PATH, "imported_test_set.xdf")
    
    if (!rxHadoopFileExists(imported_data_table))
      rxHadoopCopyFromLocal(local_data_file, imported_data_table)
    
    if (!rxHadoopFileExists(imported_test_set))
      rxHadoopCopyFromLocal(local_test_file, imported_test_set)
    
    rxDataStep(RxXdfData(imported_data_table,
                         fileSystem=RxHdfsFileSystem()),
               outFile=hdfs_xdf, 
               blocksPerRead=1e5, overwrite=TRUE)
    rxDataStep(RxXdfData(imported_test_set,
                         fileSystem=RxHdfsFileSystem()),
               outFile=hdfs_test_xdf,
               blocksPerRead=1e5, overwrite=TRUE)
  }
  
  data_table <- hdfs_xdf
  test_set <- hdfs_test_xdf
  
} else {
  
  data_table <- RxXdfData(local_data_file)
  test_set <- RxXdfData(local_test_file)
  
}

# collect metadata
xdf_info <- rxGetInfo(data_table)

# total number of cases in input dataset
N <- xdf_info$numRows

Train a model

# Start time
t0 <- Sys.time()

# Use the featurizer to generate chargrams, and fit a linear model
fit <- rxFastLinear(
    is_real ~ chargrams, 
    data_table,
    type="binary", normalize="no",
    l1Weight=0, l2Weight=1e-8,
    mlTransforms = featurizeText(
        vars = c(chargrams="name"),
        case='lower',
        keepNumbers=FALSE, 
        keepDiacritics=FALSE, 
        keepPunctuations=FALSE,
        charFeatureExtractor=ngramCount(
            ngramLength=3, 
            weighting="tf", 
            maxNumTerms=1e8
        ),
        wordFeatureExtractor=NULL
    )
)
## Not adding a normalizer.
## Using 2 threads to train.
## Automatically choosing a check frequency of 2.
## Auto-tuning parameters: maxIterations = 2.
## Using model from last iteration.
## Not training a calibrator because it is not needed.
## Elapsed time: 00:00:14.9120760
## Elapsed time: 00:00:00.2415402
t1 <- Sys.time()

difftime(t1, t0)
## Time difference of 15.89469 secs

Examine model predictions

# Use the model to score the test cases.
predictions <- rxPredict(fit, test_set, "predictions.xdf", overwrite=TRUE,
                         extraVarsToWrite=c("name", "category", "is_real"))
## Elapsed time: 00:00:06.7336246
# Cross-tabulate by predicted and actual category.
rxCrossTabs(~ F(PredictedLabel) : category, predictions)$counts
## $`F(PredictedLabel):category`
##                 category
## F_PredictedLabel pseudogibberish random  real
##                0          237545 236087  2793
##                1              80   1468 20909
# Build an ROC curve from the `predictions` XDF file.
rxRocCurve("is_real", "Probability", predictions)

# This function pulls the best- (or worst-) scoring examples from a given category.
# In the first pass, it extracts the category members into their own XDF file. We
# take this opportunity to take the logarithm of the probability, because these values
# are more spread out and it is easier to use them for approximate quantiles.
# Then we load the top (or bottom) 1 percent of cases into memory, and only
# sort these.
get_extremes_from_category <- function(xdf, category="real", cmp=">", percent=99){
  category_file <- sprintf("predictions_%s.xdf", category)
  category_xdf <- rxDataStep(predictions, 
                             category_file, 
                             rowSelection=category==cat, 
                             transforms=list(log10Probability=log10(Probability)),
                             transformObjects=list(cat=category),
                             overwrite=TRUE)
  
  xformObj <- list(q100=rxQuantile("log10Probability", 
                                   category_xdf, 
                                   probs = (0:100)/100),
                   cmp_fun=get(cmp),
                   percentile=sprintf("%d%%", 
                                      if(cmp=="<") 
                                        percent 
                                      else 
                                        100 - percent))
  pct_df <- rxDataStep(category_xdf,
                       rowSelection=cmp_fun(log10Probability, q100[[percentile]]),
                       transformObjects = xformObj)
  pct_df[order(pct_df[["Probability"]], decreasing = TRUE),]
}


# Best-scoring real names:
head(get_extremes_from_category(predictions, 
                                category="real", 
                                cmp=">", percent=99))
##                name category is_real PredictedLabel    Score Probability
## 6262      Lisamarie     real    TRUE           TRUE 10.74977   0.9999785
## 22064 Juliannamarie     real    TRUE           TRUE 10.69621   0.9999774
## 3676       Marilyne     real    TRUE           TRUE 10.60427   0.9999752
## 16248    Jessamarie     real    TRUE           TRUE 10.53534   0.9999734
## 5854        Marisha     real    TRUE           TRUE 10.44404   0.9999709
## 15172     Marleisha     real    TRUE           TRUE 10.10326   0.9999591
##       log10Probability
## 6262     -9.319049e-06
## 22064    -9.836779e-06
## 3676     -1.076870e-05
## 16248    -1.154530e-05
## 5854     -1.265842e-05
## 15172    -1.778402e-05
# Worst-scoring real names:
tail(get_extremes_from_category(predictions, 
                                category="real", 
                                cmp="<", percent=1))
##           name category is_real PredictedLabel      Score  Probability
## 159 Ikponmwosa     real    TRUE          FALSE  -9.229097 9.813215e-05
## 89          Vu     real    TRUE          FALSE  -9.455945 7.821702e-05
## 171      Eygpt     real    TRUE          FALSE -10.003417 4.524301e-05
## 50          Lg     real    TRUE          FALSE -10.068079 4.241021e-05
## 204      Zzyzx     real    TRUE          FALSE -10.181203 3.787419e-05
## 47          Lj     real    TRUE          FALSE -10.338304 3.236814e-05
##     log10Probability
## 159        -4.008189
## 89         -4.106699
## 171        -4.344448
## 50         -4.372530
## 204        -4.421657
## 47         -4.489882
# Best-scoring random names:
head(get_extremes_from_category(predictions, 
                                category="random", 
                                cmp=">", percent=99))
##           name category is_real PredictedLabel    Score Probability
## 13258   kellis   random   FALSE           TRUE 6.233860   0.9980420
## 90424    ralie   random   FALSE           TRUE 5.903892   0.9972786
## 4912    nerian   random   FALSE           TRUE 5.561465   0.9961716
## 63470   tabion   random   FALSE           TRUE 5.559600   0.9961644
## 155783   miria   random   FALSE           TRUE 5.366848   0.9953529
## 5588   tyneema   random   FALSE           TRUE 5.282112   0.9949440
##        log10Probability
## 13258     -0.0008511876
## 90424     -0.0011834867
## 4912      -0.0016658466
## 63470     -0.0016689648
## 155783    -0.0020229295
## 5588      -0.0022013729
# Best-scoring pseudogibberish names:
head(get_extremes_from_category(predictions, 
                                category="pseudogibberish", 
                                cmp=">", percent=99))
##             name        category is_real PredictedLabel    Score
## 83922     Ja\\][ pseudogibberish   FALSE           TRUE 3.225237
## 178696    Ja\\][ pseudogibberish   FALSE           TRUE 3.225237
## 21467    m,.777a pseudogibberish   FALSE           TRUE 2.800683
## 144302 m,./a\\][ pseudogibberish   FALSE           TRUE 2.800683
## 153836   m,a\\][ pseudogibberish   FALSE           TRUE 2.800683
## 106255 sa\\][?NA pseudogibberish   FALSE           TRUE 2.678231
##        Probability log10Probability
## 83922    0.9617730      -0.01692740
## 178696   0.9617730      -0.01692740
## 21467    0.9427127      -0.02562063
## 144302   0.9427127      -0.02562063
## 153836   0.9427127      -0.02562063
## 106255   0.9357298      -0.02884954

Examine model coefficients

# Examine the first few coefficients from the model
knitr::kable(data.frame(head(coef(fit), n=12)))
head.coef.fit…n…12.
(Bias) -6.966318
q -14.922252
r|e|r -14.612547
q|u 13.826286
n|a|n -13.708379
i|i -12.873913
x -11.916374
s|a|s -11.900106
a|a -11.502841
v -11.345049
<␂>|a|<␃> -11.299555
b|e|r 10.852152
# How many cases in each category do we have in the test set?
# This is equivalent to: table(predictions$category)
rxCrossTabs(~ category, predictions)$counts
## $category
##                       
## pseudogibberish 237625
## random          237555
## real             23702
# How do they break down if we look at only names containing 'sas'?
# With in-memory data, you could do the same thing like this:
# xtabs(~ category, predictions[grep("sas", predictions$name, ignore.case=TRUE),])
rxCrossTabs(~ category, predictions, 
            rowSelection=grepl("sas", name, ignore.case=TRUE))$counts
## $category
##                     
## pseudogibberish 3354
## random            62
## real               8
# Real names are more likely to contain the letter 'i'
rxCrossTabs(~ category, predictions, 
            rowSelection=grepl("i", name, ignore.case=TRUE))$counts
## $category
##                      
## pseudogibberish 32142
## random          53583
## real            11735
# ... but less likely to contain 'ii'
rxCrossTabs(~ category, predictions, 
            rowSelection=grepl("ii", name, ignore.case=TRUE))$counts
## $category
##                     
## pseudogibberish 1106
## random          1884
## real              50
# Names rarely contain 'f'
rxCrossTabs(~ category, predictions, 
            rowSelection=grepl("f", name, ignore.case=TRUE))$counts
## $category
##                      
## pseudogibberish 74990
## random          53259
## real              652
# ... but 'ff' is more common than you would expect based on the frequency of 'f'
rxCrossTabs(~ category, predictions, 
            rowSelection=grepl("ff", name, ignore.case=TRUE))$counts
## $category
##                     
## pseudogibberish 2524
## random          1887
## real              64

Learning Curves

Define a grid of parameter values

# Load helper functions.
source("learning_curve_lib.R")

K_FOLDS <- 3  # number of cross-validation groups
SALT <- 1     # add to pseudorandom number generator seed
MAX_TSS <- (1 - 1/K_FOLDS) * N # approximate number of training cases.

# Define parameter ranges to scan over; we'll be more ambitious 
# if we can run on a cluster.
if (USE_CLUSTER){
  L1_WEIGHTS <- c(0, 10^(c(-8, -6)))
  L2_WEIGHTS <- 10^c(-9, -6, -4)
  # L2_WEIGHTS cannot be smaller than 9.9999999*10^-10
  N_GRAM_LENGTHS <- 1:4
  NUM_TSS <- 10
  TEST_SET_KFOLD_IDS <- 1 # 1:3
} else {
  L1_WEIGHTS <- 0
  L2_WEIGHTS <- 10^c(-9, -5)
  N_GRAM_LENGTHS <- 1:3
  NUM_TSS <- 8
  TEST_SET_KFOLD_IDS <- 1
}
names(N_GRAM_LENGTHS) <- N_GRAM_LENGTHS

# Calculate the proportions of the training set that we will actually use.
# These values are evenly spaced on a log scale.
training_fractions <- get_training_set_fractions(1000, MAX_TSS, NUM_TSS)

# rxFastLinear is one of the algorithms in MicrosoftML
library(MicrosoftML)

# Provide additional arguments specific for each learner, as needed.
# This approach is useful if you want to compare different learners.
LEARNERS <- list(
  rxFastLinear=list(convergenceTolerance = 0.1, 
                    normalize="No", 
                    lossFunction=logLoss())
)

# Generate a set of text featurizer specifications;
# here each uses a different ngramLength.
ML_TRANSFORMS <- lapply(N_GRAM_LENGTHS, function(ngl)
  featurizeText(vars = c(chargrams = "name"),
                case='lower',
                keepNumbers=FALSE, 
                keepDiacritics = FALSE, 
                keepPunctuations = FALSE,
                charFeatureExtractor = ngramCount(
                  ngramLength=ngl, weighting = "tf", maxNumTerms=1e8),
                wordFeatureExtractor = NULL))

# Collect the values for the parameter grid ...
grid_dimensions <- list( model_class=names(LEARNERS),
                         training_fraction=training_fractions,
                         with_formula="is_real ~ chargrams",
                         test_set_kfold_id=TEST_SET_KFOLD_IDS,
                         KFOLDS=K_FOLDS,
                         mlTransforms=ML_TRANSFORMS,
                         l1Weight = L1_WEIGHTS,
                         l2Weight = L2_WEIGHTS
                   )

# ... and expand the grid into a dataframe.
parameter_table <- do.call(expand.grid, c(grid_dimensions, stringsAsFactors=FALSE))

# Examine the resulting parameter table.
head(parameter_table)
##    model_class training_fraction        with_formula test_set_kfold_id
## 1 rxFastLinear       0.001002243 is_real ~ chargrams                 1
## 2 rxFastLinear       0.002687853 is_real ~ chargrams                 1
## 3 rxFastLinear       0.007208384 is_real ~ chargrams                 1
## 4 rxFastLinear       0.019331712 is_real ~ chargrams                 1
## 5 rxFastLinear       0.051844505 is_real ~ chargrams                 1
## 6 rxFastLinear       0.139038526 is_real ~ chargrams                 1
##   KFOLDS
## 1      3
## 2      3
## 3      3
## 4      3
## 5      3
## 6      3
##                                                                                                                                                                                                                                                                   mlTransforms
## 1 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 2 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 3 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 4 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 5 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
## 6 Text{column=chargrams:name language=English case=lower keepDiacritics=- keepPunctuations=- keepNumbers=- vectorNormalizer=l2 highPrecisionLanguageDetection=+ wordExtractor={} charExtractor=NgramExtractor{ ngramLength=1 skipLength=0 maxNumTerms=100000000 weighting=tf}}
##   l1Weight l2Weight
## 1        0    1e-09
## 2        0    1e-09
## 3        0    1e-09
## 4        0    1e-09
## 5        0    1e-09
## 6        0    1e-09

Break the table into jobs

# Generate a list of parameter sets. Each row in the parameter table becomes
# an element in this list, plus we add other settings that are not part of the grid.
parameter_list <- lapply(1:nrow(parameter_table), function(i){
  par <- parameter_table[i,]
  par <- as.list(c(data_table=data_table,
                   par,
                   LEARNERS[[par$model_class]],
                 type="binary"))
  par
})


# Set the compute context to determine where the jobs will be sent.
if (USE_CLUSTER){
  cc <- rxSparkConnect(
      consoleOutput=TRUE,
      numExecutors=4,
      executorCores=8,
      executorOverheadMemory = "20000m")
} else if (USE_SPARK){
  cc <- rxSparkConnect(consoleOutput = TRUE,
                       executorCores = 4,
                       driverMem = "2g",
                       executorMem = "2g",
                       executorOverheadMem = "4g")
} else {
  rxSetComputeContext("localpar")
}

Run the jobs

# Cache the results manually, so that RMarkdown can use the results
# we get by stepping through the code.
TRAINING_RESULTS_FILE <- "training_results.Rds"

if (file.exists(TRAINING_RESULTS_FILE)){
  training_results <- readRDS(TRAINING_RESULTS_FILE) 
} else {
  t3 <- Sys.time()
  # Farm out the sets of parameters as jobs sent to the current
  # compute context.
  # data_table <- data_table@file

  training_results <- rxExec(run_training_fraction,
                             elemArgs = parameter_list,
                             execObjects = c("data_table", "SALT")) # 
  
  print(difftime(Sys.time(), t3))
  
  # Cache the results
  saveRDS(training_results, TRAINING_RESULTS_FILE)
}

if (USE_SPARK) rxSparkDisconnect(cc)

Visualize Results

Since the results of all these calculations fit easily into memory, we can use open source R from here on. This is a tidyverse pipeline

library(dplyr)
library(tidyr)
library(ggplot2)

# Note that we need to pull the value of ngramLength out of the mltransform 
# specification using a regular expression

tidy_results <- training_results %>% 
  lapply(function(tr){
    names(tr)[10] <- "mltransforms"
    tr
  }) %>%
  bind_rows %>%
  gather(set, AUC, training, test) %>%
  mutate(kfold = factor(kfold), 
         l1Weight=factor(l1Weight), 
         l2Weight=factor(l2Weight),
         ngramLength=gsub(".*ngramLength=([0-9]+) .*", "\\1", mltransforms))

# Plot the grid of results
tidy_results %>%
  ggplot(aes(x=log10(tss), y=AUC, col=ngramLength, linetype=set)) + 
  geom_line(size=1.0) + 
  facet_grid(l1Weight ~ l2Weight) +
    ggtitle("faceting by regularization weights")

# Zoom in on the best parameter set
max_auc <- tidy_results %>% 
  filter(set=="test") %>% 
  select(AUC) %>% 
  max

max_p <- tidy_results %>% filter(set=="test" & AUC==max_auc)
  
tidy_results %>%
  filter(l1Weight==max_p$l1Weight & l2Weight==max_p$l2Weight) %>%
  ggplot(aes(x=log10(tss), y=AUC, col=ngramLength, linetype=set,
             group=interaction(set, kfold, ngramLength))) + 
  geom_line(size=1.0) + 
  coord_cartesian(ylim=c(0.995, 1.0)) +
  facet_grid(l1Weight ~ l2Weight) +
  ggtitle("zoom in on best test results")