This script exercises the SmartType model and library and calculates performance statistics.

Initialize Environment

Set execution environment.

# Run from the project's production folder; the relative paths used below
# (library script, raw data folder) are resolved against it.
setwd("~/Academic/DataScience/Capstone/prod")

# Fixed seed so the result is repeatable, but different from the training set.
set.seed(9035768)

# Load the SmartType prediction library routines.
source("SmartTypeLibrary.R")

# Location of the raw data and the data files inside it.
raw.data.folder <- file.path("..", "data", "en_US")
file.list <- paste0("en_US.", c("blogs", "news", "twitter"), ".txt")

# Fraction of the raw data to include in the test sample.
test.data.fraction <- 1e-04

# Training fraction identifying which trained model is evaluated.
train.data.fraction <- 0.1

Initialize SmartType library.

Load the trained model into the SmartType library.

# Build the path of the saved model from the training percentage
# (models/<percent>/ngram/model.rds) and hand it to the library.
model.file <- file.path(
        "models",
        toString(train.data.fraction * 100),
        "ngram",
        "model.rds"
)
SmartType.initialize(model.file)

Verify predict() Method

Perform spot checks to verify most recent changes.

# Spot check with a three-word context. Across this call and the three that
# follow, the second argument steps 4, 3, 2, 1 while the context loses one
# word each time -- presumably the n-gram order to start from; TODO confirm
# against SmartTypeLibrary.R.
SmartType.predict("thanks for the",4,showwc=FALSE)
##  [1] "follow"         "rt"             "mention"        "ff"            
##  [5] "shout"          "retweet"        "rts"            "support"       
##  [9] "tweet"          "heads"          "kind"           "love"          
## [13] "shoutout"       "tip"            "great"          "info"          
## [17] "link"           "recommendation" "invite"         "add"           
## [21] "follows"        "so"             "help"           "reminder"      
## [25] "suggestion"     "update"         "feedback"       "mentions"      
## [29] "response"       "retweets"       "awesome"        "birthday"      
## [33] "memories"       "post"           "tweets"         "advice"        
## [37] "article"        "good"           "props"          "share"
# Spot check with a two-word context and second argument 3.
SmartType.predict("for the",3,showwc=FALSE)
##  [1] "first"   "follow"  "rt"      "next"    "rest"    "last"    "new"    
##  [8] "past"    "day"     "most"    "mention" "ff"      "second"  "best"   
## [15] "shout"   "weekend" "retweet" "future"  "support" "city"    "season" 
## [22] "sake"    "year"    "night"   "love"    "same"    "people"  "state"  
## [29] "rts"     "us"      "record"  "th"      "show"    "week"    "game"   
## [36] "summer"  "team"    "big"     "job"     "tweet"   "whole"   "kids"   
## [43] "third"   "great"   "right"   "entire"  "kind"    "good"    "time"   
## [50] "better"
# Spot check with a single-word context and second argument 2.
SmartType.predict("the",2,showwc=FALSE)
##  [1] "first"    "same"     "best"     "new"      "most"     "world"   
##  [7] "way"      "last"     "other"    "time"     "next"     "only"    
## [13] "end"      "day"      "state"    "city"     "us"       "right"   
## [19] "past"     "game"     "second"   "top"      "one"      "rest"    
## [25] "house"    "th"       "year"     "two"      "whole"    "people"  
## [31] "fact"     "big"      "company"  "team"     "season"   "book"    
## [37] "country"  "s"        "story"    "show"     "middle"   "united"  
## [43] "case"     "follow"   "final"    "national" "word"     "future"  
## [49] "night"    "more"
# Spot check with an empty context string and second argument 1: no preceding
# words are supplied to the predictor.
SmartType.predict("",1,showwc=FALSE)
##  [1] "the"   "and"   "for"   "that"  "you"   "with"  "was"   "this" 
##  [9] "have"  "are"   "but"   "not"   "from"  "all"   "they"  "will" 
## [17] "its"   "said"  "just"  "your"  "his"   "out"   "about" "one"  
## [25] "what"  "like"  "when"  "has"   "who"   "can"   "more"  "had"  
## [33] "get"   "were"  "time"  "there" "her"   "their" "would" "some" 
## [41] "new"   "she"   "our"   "been"  "good"  "now"   "how"   "day"  
## [49] "know"  "them"
# Quiz 2
# These calls pass showwc=TRUE, unlike the spot checks above -- presumably
# this turns on a word-cloud display; TODO confirm in SmartTypeLibrary.R.
SmartType.predict("with his little",4,showwc=TRUE)

##  [1] "bit"       "more"      "girl"      "too"       "to"       
##  [6] "time"      "things"    "boy"       "while"     "and"      
## [11] "of"        "ones"      "guy"       "over"      "better"   
## [16] "thing"     "one"       "as"        "or"        "brother"  
## [21] "girls"     "different" "less"      "sister"    "about"    
## [26] "in"        "something" "late"      "help"      "longer"   
## [31] "i"         "kid"       "like"      "extra"     "league"   
## [36] "man"       "people"    "red"       "kids"      "piece"    
## [41] "did"       "baby"      "italy"     "town"      "boys"     
## [46] "break"     "but"       "later"     "nervous"   "miss"
# Second Quiz 2 phrase, again with a three-word context.
SmartType.predict("you must be",4,showwc=TRUE)

##  [1] "a"         "the"       "in"        "done"      "able"     
##  [6] "at"        "on"        "so"        "willing"   "kept"     
## [11] "one"       "an"        "approved"  "nice"      "taken"    
## [16] "very"      "following" "made"      "paid"      "prepared" 
## [21] "something" "to"        "doing"     "good"      "like"     
## [26] "more"
# Quiz 3
# NOTE(review): "artic" looks like a misspelling of "arctic"; it is left as
# is because the string is input to the model, not commentary.
SmartType.predict("artic monkeys this",4,showwc=TRUE)

##  [1] "is"        "year"      "week"      "morning"   "was"      
##  [6] "weekend"   "time"      "one"       "season"    "month"    
## [11] "point"     "summer"    "will"      "book"      "blog"     
## [16] "and"       "post"      "day"       "way"       "game"     
## [21] "years"     "i"         "to"        "world"     "new"      
## [26] "has"       "case"      "afternoon" "past"      "in"       
## [31] "guy"       "would"     "for"       "a"         "story"    
## [36] "movie"     "the"       "country"   "as"        "but"      
## [41] "place"     "weeks"     "little"    "spring"    "project"  
## [46] "series"    "evening"   "song"      "saturday"  "means"
# Second Quiz 3 phrase, with a three-word context.
SmartType.predict("to take a",4,showwc=TRUE)

##  [1] "look"    "picture" "few"     "break"   "lead"    "nap"     "shower" 
##  [8] "photo"   "moment"  "stand"   "closer"  "trip"    "chance"  "step"   
## [15] "test"
# Third Quiz 3 phrase. The output recorded below is word-for-word the same as
# the output recorded for the empty-context call SmartType.predict("",1,...),
# which suggests no longer context matched here -- TODO confirm.
SmartType.predict("of adam sandlers",4,showwc=TRUE)

##  [1] "the"   "and"   "for"   "that"  "you"   "with"  "was"   "this" 
##  [9] "have"  "are"   "but"   "not"   "from"  "all"   "they"  "will" 
## [17] "its"   "said"  "just"  "your"  "his"   "out"   "about" "one"  
## [25] "what"  "like"  "when"  "has"   "who"   "can"   "more"  "had"  
## [33] "get"   "were"  "time"  "there" "her"   "their" "would" "some" 
## [41] "new"   "she"   "our"   "been"  "good"  "now"   "how"   "day"  
## [49] "know"  "them"

Verify evaluate() Method

# Single sentence containing a capital letter and a trailing period.
SmartType.evaluate("Could be added to the gallery wall.")
##   words matches effective chars saved benefit predictions avg.ms
## 1     7       4      0.57    35    10    0.29          22  0.291
# Variant with an upper-case word and a digit; the recorded result still
# reports 7 words and 4 matches.
SmartType.evaluate("could be added to THE 9 gallery wall")
##   words matches effective chars saved benefit predictions avg.ms
## 1     7       4      0.57    36    10    0.28          22  0.275
# Character vector of two sentences: the recorded result has one row per
# element.
SmartType.evaluate(c("could be added to the gallery wall","Could be added to the gallery wall."))
##   words matches effective chars saved benefit predictions avg.ms
## 1     7       4      0.57    34    10    0.29          22  0.280
## 2     7       4      0.57    35    10    0.29          22  0.276

Evaluate Model and Algorithm Effectiveness

Perform more rigorous validation. Use a sample of the original data as a test set. Sample documents from blogs, news, and tweets.

# Draw a random sample of lines from every raw data file to form the test set.
#
# Each line is kept with probability `test.data.fraction`. Exactly one uniform
# draw is made per line, in file order, so for a given seed the selection is
# the same as drawing runif(1) once per line.
#
# Lines are read in fixed-size chunks and the kept lines are gathered in a
# list, which avoids one readLines() call per line and avoids re-copying the
# result vector every time a line is sampled.
chunk.size <- 10000L
sampled <- list()
for (file.name in file.list) {          # Iterating values is safe for an empty list.
        pathname <- file.path(raw.data.folder, file.name)

        # Read raw data.
        f <- file(pathname, "rb")
        tryCatch({
                repeat {
                        chunk <- readLines(f, n = chunk.size, warn = FALSE, skipNul = TRUE)
                        if (length(chunk) == 0) {       # End of file reached.
                                break
                        }
                        # Randomly include/discard each line of the chunk.
                        keep <- runif(length(chunk)) <= test.data.fraction
                        sampled[[length(sampled) + 1L]] <- chunk[keep]
                }
        }, finally = close(f))          # Release the connection even if reading fails.
}

# Flatten to a character vector; this is character(0) when nothing was sampled.
docs <- as.character(unlist(sampled, use.names = FALSE))

Pre-process the validation samples in the same way that the training data was pre-processed.

# Clean the sampled documents: strip digits, strip punctuation, lower-case,
# then collapse whitespace, in that order.
#
# vapply() pins the result to a character vector, one string per document.
# sapply() would instead return an empty list when `docs` is empty.
# NOTE(review): removeNumbers, removePunctuation and stripWhitespace are not
# defined in this script; presumably they come in via SmartTypeLibrary.R.
docs <- vapply(docs, removeNumbers, character(1), USE.NAMES = FALSE)
docs <- vapply(docs, removePunctuation, character(1), USE.NAMES = FALSE)
docs <- tolower(docs)           # Base R tolower() is already vectorised.
docs <- vapply(docs, stripWhitespace, character(1), USE.NAMES = FALSE)

Run evaluate() method on test data set.

# Evaluate every sampled document and keep the per-document statistics in `s`.
# The surrounding parentheses make the assigned value visible, so it is
# displayed just as a bare `s` on the next line would be.
(s <- SmartType.evaluate(docs))
##     words matches effective chars saved benefit predictions avg.ms
## 1      35      19      0.54   195    48    0.25         132  0.565
## 2       6       1      0.17    46     1    0.02          41  0.392
## 3      29      15      0.52   171    43    0.25         115  0.537
## 4       3       0      0.00    22     0    0.00          22  0.521
## 5      12       3      0.25    64     9    0.14          49  0.558
## 6       5       3      0.60    19     6    0.32          12  0.456
## 7      12       6      0.50    59    12    0.20          42  0.515
## 8      11       4      0.36    59     9    0.15          44  0.524
## 9       3       0      0.00    23     0    0.00          21  0.284
## 10      5       0      0.00    35     0    0.00          31  0.496
## 11      7       6      0.86    31    19    0.61          12  0.517
## 12      3       1      0.33    12     3    0.25          10  0.541
## 13      5       3      0.60    23    11    0.48          11  0.203
## 14     74      28      0.38   371    68    0.18         258  0.639
## 15      4       0      0.00    37     0    0.00          34  0.555
## 16     33      17      0.52   159    41    0.26         103  0.601
## 17      6       1      0.17    39     3    0.08          32  0.617
## 18      6       3      0.50    33     8    0.24          23  0.234
## 19     37      19      0.51   197    57    0.29         123  0.566
## 20     40      28      0.70   171    76    0.44          84  0.582
## 21      8       2      0.25    35     5    0.14          25  0.656
## 22     11       5      0.45    48    12    0.25          31  0.494
## 23     14       6      0.43    77    14    0.18          56  0.537
## 24     58      19      0.33   401    52    0.13         311  0.711
## 25     71      34      0.48   405   104    0.26         265  0.646
## 26      7       5      0.71    35    15    0.43          19  0.304
## 27     50      19      0.38   298    63    0.21         205  0.706
## 28    161      81      0.50   875   237    0.27         559  0.688
## 29     44      19      0.43   244    46    0.19         174  0.687
## 30     15       9      0.60    71    22    0.31          44  0.516
## 31     89      35      0.39   500    92    0.18         355  0.674
## 32     45      20      0.44   252    53    0.21         175  0.635
## 33     35      15      0.43   223    45    0.20         159  0.656
## 34    108      57      0.53   548   156    0.28         342  0.627
## 35      2       0      0.00     5     0    0.00           6  0.272
## 36     77      31      0.40   439   103    0.23         293  0.717
## 37     25      18      0.72   123    48    0.39          69  0.614
## 38      2       1      0.50    11     5    0.45           6  0.070
## 39      7       2      0.29    31     3    0.10          24  0.453
## 40     46      18      0.39   287    50    0.17         210  0.654
## 41     11       7      0.64    40    15    0.38          24  0.477
## 42     33      22      0.67   158    63    0.40          85  0.498
## 43     34      20      0.59   168    46    0.27         108  0.570
## 44     89      35      0.39   556   101    0.18         402  0.689
## 45      1       0      0.00     3     0    0.00           3  0.000
## 46     11       1      0.09    58     1    0.02          48  0.583
## 47     58      24      0.41   362    63    0.17         266  0.640
## 48     59      18      0.31   353    54    0.15         259  0.701
## 49     13       2      0.15    81     4    0.05          66  0.650
## 50     38      15      0.39   211    39    0.18         150  0.638
## 51      9       3      0.33    49     8    0.16          36  0.556
## 52     41      22      0.54   234    59    0.25         157  0.592
## 53     28      18      0.64   134    46    0.34          79  0.481
## 54     46      21      0.46   265    54    0.20         187  0.635
## 55      4       1      0.25    16     1    0.06          13  0.370
## 56      3       2      0.67    17    12    0.71           5  0.140
## 57      9       4      0.44    44    13    0.30          27  0.570
## 58     43      20      0.47   199    63    0.32         114  0.652
## 59     62      22      0.35   308    52    0.17         217  0.659
## 60      4       0      0.00    27     0    0.00          24  0.533
## 61     26      13      0.50   144    41    0.28          91  0.581
## 62      1       0      0.00     7     0    0.00           6  0.000
## 63     23      10      0.43   147    29    0.20         106  0.664
## 64    203      87      0.43  1174   220    0.19         839  0.682
## 65     19       7      0.37   118    15    0.13          92  0.558
## 66     43      16      0.37   263    40    0.15         197  0.671
## 67      4       2      0.50    19     5    0.26          13  0.112
## 68     25      13      0.52   134    43    0.32          82  0.689
## 69     36      17      0.47   211    53    0.25         140  0.662
## 70      5       2      0.40    17     5    0.29          12  0.526
## 71      2       0      0.00    14     0    0.00          13  0.095
## 72     61      31      0.51   346    80    0.23         237  0.634
## 73      6       2      0.33    37     5    0.14          29  0.383
## 74     18      11      0.61    81    28    0.35          47  0.446
## 75     13       5      0.38    62    11    0.18          44  0.570
## 76      4       0      0.00    26     0    0.00          22  0.282
## 77     28      12      0.43   156    32    0.21         109  0.641
## 78      5       0      0.00    32     0    0.00          28  0.481
## 79    139      59      0.42   730   134    0.18         517  0.664
## 80     20       6      0.30   120    22    0.18          87  0.653
## 81    113      66      0.58   605   188    0.31         371  0.634
## 82      7       5      0.71    30    15    0.50          14  0.321
## 83     18      10      0.56    78    28    0.36          43  0.545
## 84     15       9      0.60    78    32    0.41          41  0.380
## 85     24       8      0.33   128    18    0.14          95  0.539
## 86    143      78      0.55   716   209    0.29         443  0.611
## 87     45      16      0.36   281    47    0.17         206  0.719
## 88     39      20      0.51   197    62    0.31         117  0.531
## 89      3       0      0.00     8     0    0.00           8  0.486
## 90     76      26      0.34   452    67    0.15         336  0.658
## 91    113      35      0.31   696    99    0.14         520  0.736
## 92     51      17      0.33   366    56    0.15         277  0.688
## 93     26       9      0.35   169    21    0.12         132  0.707
## 94     14       3      0.21    83    14    0.17          59  0.682
## 95     31      17      0.55   153    48    0.31          92  0.482
## 96     53      20      0.38   338    69    0.20         237  0.650
## 97    111      56      0.50   622   159    0.26         409  0.716
## 98     13       9      0.69    69    27    0.39          39  0.379
## 99     37      18      0.49   202    47    0.23         137  0.663
## 100    54      28      0.52   274    76    0.28         173  0.627
## 101    10       4      0.40    48     6    0.12          37  0.443
## 102    39      19      0.49   201    53    0.26         129  0.584
## 103    44      25      0.57   250    79    0.32         153  0.688
## 104    49      26      0.53   242    70    0.29         150  0.610
## 105    21      10      0.48   118    28    0.24          80  0.608
## 106    70      35      0.50   391   104    0.27         253  0.638
## 107    22      11      0.50   120    31    0.26          79  0.634
## 108    32      13      0.41   209    38    0.18         152  0.665
## 109     3       0      0.00    23     0    0.00          20  0.386
## 110    12       5      0.42    80    19    0.24          55  0.672
## 111     2       1      0.50     8     1    0.12           7  0.057
## 112     5       0      0.00    27     0    0.00          25  0.580
## 113    11       3      0.27    79    17    0.22          55  0.619
## 114     6       1      0.17    27     1    0.04          22  0.572
## 115    36      15      0.42   193    56    0.29         117  0.632
## 116    40      17      0.42   246    42    0.17         182  0.678
## 117     1       0      0.00    15     0    0.00          15  0.000
## 118    34      14      0.41   203    44    0.22         140  0.633
## 119    27      18      0.67   154    58    0.38          88  0.544
## 120    26      11      0.42   153    32    0.21         107  0.659
## 121    28       7      0.25   198    20    0.10         158  0.696
## 122    45      19      0.42   258    56    0.22         177  0.709
## 123    20       4      0.20   131    11    0.08         104  0.671
## 124    21       9      0.43   121    25    0.21          85  0.577
## 125    29      12      0.41   174    33    0.19         125  0.604
## 126    22       8      0.36   129    28    0.22          88  0.629
## 127     3       0      0.00    12     0    0.00          10  0.353
## 128    33      16      0.48   198    50    0.25         132  0.611
## 129    54      27      0.50   296    70    0.24         200  0.616
## 130    12       3      0.25    69     7    0.10          54  0.542
## 131    49      21      0.43   303    79    0.26         197  0.627
## 132    47      22      0.47   277    72    0.26         181  0.698
## 133    38       9      0.24   231    31    0.13         172  0.704
## 134    38      13      0.34   207    33    0.16         152  0.690
## 135    23      13      0.57   125    35    0.28          81  0.497
## 136    13       5      0.38    79    20    0.25          54  0.705
## 137    39      16      0.41   232    38    0.16         172  0.650
## 138    59      32      0.54   342    99    0.29         217  0.704
## 139    66      25      0.38   402    71    0.18         291  0.720
## 140    42      22      0.52   199    54    0.27         126  0.612
## 141    44      21      0.48   244    52    0.21         170  0.592
## 142    14       8      0.57    77    29    0.38          43  0.564
## 143     6       1      0.17    38     3    0.08          31  0.518
## 144     2       0      0.00     8     0    0.00           7  0.059
## 145    33      13      0.39   208    46    0.22         143  0.740
## 146    72      32      0.44   414    86    0.21         289  0.707
## 147    32      12      0.38   193    38    0.20         136  0.684
## 148    12       3      0.25    88    19    0.22          61  0.592
## 149    61      26      0.43   339    73    0.22         232  0.652
## 150    28      17      0.61   160    52    0.32          98  0.644
## 151     9       5      0.56    59    28    0.47          28  0.651
## 152    31      12      0.39   174    26    0.15         130  0.617
## 153    35      17      0.49   180    42    0.23         121  0.636
## 154    41      19      0.46   247    46    0.19         180  0.657
## 155    60      25      0.42   367    69    0.19         264  0.643
## 156    30      18      0.60   174    69    0.40          94  0.554
## 157    27      11      0.41   160    35    0.22         110  0.581
## 158    89      40      0.45   493   115    0.23         330  0.674
## 159    22      15      0.68   106    44    0.42          56  0.499
## 160    38      28      0.74   168    82    0.49          77  0.506
## 161    22       9      0.41   116    26    0.22          80  0.679
## 162    26       7      0.27   156    19    0.12         119  0.689
## 163    37      13      0.35   217    44    0.20         150  0.681
## 164    11       9      0.82    50    21    0.42          28  0.443
## 165    34      14      0.41   171    33    0.19         119  0.581
## 166    52      25      0.48   296    76    0.26         194  0.615
## 167    38      17      0.45   215    51    0.24         144  0.630
## 168    42      16      0.38   246    46    0.19         175  0.670
## 169     4       0      0.00    33     0    0.00          30  0.473
## 170    30      13      0.43   169    32    0.19         121  0.652
## 171    20       8      0.40   115    21    0.18          83  0.560
## 172   117      39      0.33   747   137    0.18         533  0.757
## 173    33      19      0.58   159    56    0.35          90  0.539
## 174    75      24      0.32   410    72    0.18         288  0.737
## 175    53      19      0.36   321    60    0.19         228  0.729
## 176    49      36      0.73   232   109    0.47         111  0.461
## 177    27       6      0.22   153    12    0.08         120  0.724
## 178    39      21      0.54   224    68    0.30         139  0.683
## 179    50      29      0.58   295   112    0.38         163  0.663
## 180    10       4      0.40    67    14    0.21          48  0.641
## 181    38      19      0.50   201    46    0.23         137  0.687
## 182    44      27      0.61   223    79    0.35         127  0.533
## 183    99      62      0.63   499   166    0.33         297  0.603
## 184    24      11      0.46   151    45    0.30          94  0.672
## 185    76      29      0.38   414    81    0.20         287  0.621
## 186   109      51      0.47   624   163    0.26         404  0.679
## 187    43      13      0.30   248    45    0.18         174  0.689
## 188    41      23      0.56   240    63    0.26         159  0.652
## 189    24       7      0.29   157    21    0.13         120  0.719
## 190    23      11      0.48   140    35    0.25          94  0.562
## 191    34      18      0.53   204    73    0.36         116  0.595
## 192    22       5      0.23   120    14    0.12          90  0.700
## 193    63      20      0.32   357    50    0.14         265  0.716
## 194    77      25      0.32   497    66    0.13         380  0.692
## 195    59      27      0.46   308    67    0.22         210  0.639
## 196    32      12      0.38   182    43    0.24         120  0.594
## 197    99      51      0.52   568   151    0.27         370  0.662
## 198    83      44      0.53   453   129    0.28         286  0.625
## 199    44      19      0.43   275    58    0.21         193  0.723
## 200    45      20      0.44   253    60    0.24         169  0.685
## 201    18      10      0.56   112    35    0.31          70  0.597
## 202     5       0      0.00    26     0    0.00          22  0.603
## 203    12       2      0.17    96     3    0.03          84  0.728
## 204     8       4      0.50    51    17    0.33          31  0.429
## 205    18       9      0.50    89    31    0.35          50  0.606
## 206    11       5      0.45    53    11    0.21          36  0.589
## 207    13       8      0.62    57    23    0.40          30  0.609
## 208     8       5      0.62    36    12    0.33          22  0.363
## 209    26      12      0.46   126    24    0.19          89  0.660
## 210    10       2      0.20    63     7    0.11          49  0.683
## 211    10       5      0.50    52    23    0.44          25  0.626
## 212    10       4      0.40    55    11    0.20          39  0.572
## 213     4       2      0.50    17     7    0.41           9  0.392
## 214     6       4      0.67    27    10    0.37          16  0.191
## 215    11       4      0.36    55     8    0.15          40  0.562
## 216    13       9      0.69    75    35    0.47          36  0.447
## 217    20      11      0.55    85    29    0.34          48  0.551
## 218     8       4      0.50    45    12    0.27          30  0.430
## 219    13       8      0.62    66    33    0.50          28  0.494
## 220     5       2      0.40    22     4    0.18          16  0.285
## 221    11       4      0.36    55    11    0.20          38  0.463
## 222    10       3      0.30    52     4    0.08          42  0.596
## 223     4       3      0.75    16     6    0.38          10  0.189
## 224    23       8      0.35   130    25    0.19          91  0.610
## 225     5       2      0.40    32     4    0.12          26  0.272
## 226    13       4      0.31    65    15    0.23          42  0.685
## 227     9       4      0.44    46     8    0.17          34  0.576
## 228    20       6      0.30   111    17    0.15          81  0.669
## 229     6       1      0.17    31     1    0.03          26  0.472
## 230    15      10      0.67    74    31    0.42          39  0.588
## 231     6       5      0.83    26    13    0.50          13  0.235
## 232    13       6      0.46    56    14    0.25          36  0.434
## 233    15       5      0.33    58    12    0.21          39  0.615
## 234    21       5      0.24   115    14    0.12          86  0.635
## 235     5       2      0.40    31     9    0.29          20  0.316
## 236    23      12      0.52   111    29    0.26          72  0.505
## 237    14       8      0.57    69    27    0.39          37  0.561
## 238    11       7      0.64    53    23    0.43          27  0.474
## 239    15       6      0.40    75    19    0.25          48  0.508
## 240     8       4      0.50    33     8    0.24          22  0.404
## 241     9       3      0.33    46     6    0.13          35  0.535
## 242     5       3      0.60    30     7    0.23          22  0.415
## 243    21      13      0.62   110    48    0.44          55  0.696
## 244    14       3      0.21    66     9    0.14          47  0.609
## 245    10       3      0.30    40     5    0.12          29  0.473
## 246    10       6      0.60    49    17    0.35          29  0.516
## 247     2       0      0.00    10     0    0.00           9  0.047
## 248    19       8      0.42    94    18    0.19          66  0.608
## 249    20      13      0.65    89    36    0.40          47  0.527
## 250    10       5      0.50    53    14    0.26          34  0.611
## 251    13       6      0.46    61    15    0.25          40  0.554
## 252     7       1      0.14    40     4    0.10          31  0.455
## 253     8       5      0.62    35    10    0.29          23  0.299
## 254    19      15      0.79    83    42    0.51          37  0.445
## 255     2       0      0.00     8     0    0.00           7  0.060
## 256     6       1      0.17    39     3    0.08          32  0.740
## 257    12       5      0.42    65    17    0.26          42  0.477
## 258    17       7      0.41    87    16    0.18          62  0.670
## 259     7       4      0.57    35    13    0.37          19  0.313
## 260     8       1      0.12    42     4    0.10          32  0.581
## 261    10       8      0.80    35    20    0.57          14  0.259
## 262     4       3      0.75    23    12    0.52          11  0.073
## 263     6       2      0.33    32     3    0.09          26  0.343
## 264     5       1      0.20    30     2    0.07          25  0.634
## 265     7       4      0.57    36     8    0.22          26  0.350
## 266    17      10      0.59    84    34    0.40          44  0.606
## 267    19       9      0.47   114    26    0.23          79  0.582
## 268    13       4      0.31    62    15    0.24          39  0.617
## 269    12       7      0.58    53    17    0.32          32  0.495
## 270    16       4      0.25    93     9    0.10          73  0.546
## 271     5       1      0.20    47     3    0.06          41  0.347
## 272    16      13      0.81    67    34    0.51          31  0.414
## 273    28      19      0.68   137    61    0.45          68  0.585
## 274    23      11      0.48   112    28    0.25          73  0.665
## 275    16       8      0.50    73    18    0.25          48  0.468
## 276    12       5      0.42    62    12    0.19          44  0.577
## 277    16      10      0.62    81    34    0.42          42  0.628
## 278    12       7      0.58    69    26    0.38          39  0.673
## 279     8       1      0.12    43     2    0.05          34  0.797
## 280     9       4      0.44    44    10    0.23          30  0.617
## 281     6       2      0.33    37     5    0.14          29  0.576
## 282    27      13      0.48   132    38    0.29          81  0.847
## 283    17       7      0.41   101    22    0.22          70  0.687
## 284     7       4      0.57    43    13    0.30          28  0.565
## 285     2       1      0.50    13     6    0.46           7  0.059
## 286    23      13      0.57   118    38    0.32          71  0.597
## 287    19       7      0.37   100    19    0.19          70  0.678
## 288    11       6      0.55    60    24    0.40          32  0.563
## 289    19       8      0.42    99    20    0.20          69  0.649
## 290    23       8      0.35   130    22    0.17          94  0.623
## 291     5       3      0.60    20     7    0.35          12  0.345
## 292    10       4      0.40    46    13    0.28          28  0.625
## 293    18       7      0.39    99    12    0.12          77  0.627
## 294    16       8      0.50    76    27    0.36          42  0.351
## 295    17       9      0.53    87    25    0.29          55  0.562
## 296    10       4      0.40    63    15    0.24          43  0.403
## 297    16      10      0.62    70    29    0.41          36  0.413
## 298     9       5      0.56    46    22    0.48          21  0.630
## 299    12       7      0.58    51    17    0.33          30  0.512
## 300     2       1      0.50    13     7    0.54           6  0.068
## 301    18       5      0.28   103    18    0.17          72  0.659
## 302    18      10      0.56    87    26    0.30          54  0.617
## 303     7       2      0.29    36     9    0.25          23  0.369
## 304    12       6      0.50    50    12    0.24          33  0.518
## 305     7       3      0.43    32     9    0.28          20  0.552
## 306     7       1      0.14    48     4    0.08          39  0.556
## 307    17       5      0.29    83    11    0.13          60  0.660
## 308    11       5      0.45    55    15    0.27          35  0.501
## 309    14       7      0.50    60    19    0.32          35  0.607
## 310    12       4      0.33    72    10    0.14          55  0.567
## 311     9       2      0.22    41     8    0.20          26  0.573
## 312     7       2      0.29    32     6    0.19          21  0.334
## 313    21      10      0.48    96    25    0.26          61  0.681
## 314    17       5      0.29    98    14    0.14          73  0.697
## 315    15       9      0.60    74    28    0.38          41  0.633
## 316    16       8      0.50    89    27    0.30          55  0.741
## 317    10       4      0.40    43    10    0.23          28  0.433
## 318    25      14      0.56   124    40    0.32          74  0.610
## 319     7       4      0.57    28    10    0.36          16  0.431
## 320    14       9      0.64    72    37    0.51          31  0.553
## 321     9       5      0.56    45    20    0.44          22  0.574
## 322    21      11      0.52    98    30    0.31          59  0.615
## 323    12       5      0.42    64     9    0.14          49  0.501
## 324    10       5      0.50    52    13    0.25          35  0.405
## 325     3       1      0.33    17     2    0.12          14  0.332
## 326    12       9      0.75    56    30    0.54          24  0.598
## 327     4       3      0.75    13     3    0.23          10  0.343
## 328    20      10      0.50   112    35    0.31          68  0.516
## 329    24       9      0.38   134    21    0.16          99  0.724
## 330    11       3      0.27    86     9    0.10          70  0.678
## 331     8       6      0.75    44    21    0.48          22  0.320
## 332    15       8      0.53    78    24    0.31          48  0.608
## 333     4       1      0.25    28     3    0.11          23  0.517
## 334    12       3      0.25    60     9    0.15          43  0.535
## 335     7       4      0.57    27     9    0.33          16  0.339
## 336    23      10      0.43   111    23    0.21          76  0.611
## 337     7       4      0.57    42    11    0.26          29  0.341
## 338    11       8      0.73    49    19    0.39          28  0.468
## 339    19      10      0.53    80    19    0.24          53  0.546
## 340    14       6      0.43    59    15    0.25          37  0.521
## 341     3       0      0.00    19     0    0.00          17  0.305
## 342     3       2      0.67    13     4    0.31           9  0.213
## 343     5       3      0.60    27    12    0.44          14  0.064
## 344     6       4      0.67    28    13    0.46          14  0.334
## 345     7       4      0.57    32    11    0.34          18  0.627
## 346     7       3      0.43    36    10    0.28          23  0.495
## 347    23      15      0.65   118    50    0.42          61  0.565
## 348     6       4      0.67    22     6    0.27          15  0.507
## 349     8       4      0.50    43     9    0.21          31  0.595
## 350     2       0      0.00    11     0    0.00          10  0.042
## 351     7       4      0.57    39    10    0.26          27  0.494
## 352     7       3      0.43    39     9    0.23          27  0.443
## 353     9       4      0.44    50    16    0.32          30  0.488
## 354    17       6      0.35   106    18    0.17          78  0.687
## 355    15       7      0.47    82    15    0.18          60  0.591
## 356     2       0      0.00    12     0    0.00          10  0.214
## 357     5       1      0.20    25     3    0.12          21  0.667
## 358    23       7      0.30   125    18    0.14          92  0.688
## 359     7       3      0.43    32     6    0.19          23  0.530
## 360     4       2      0.50    19     7    0.37          11  0.316
## 361    30      18      0.60   138    41    0.30          86  0.632
## 362    16       8      0.50    79    19    0.24          53  0.574
## 363    18       5      0.28   104    16    0.15          76  0.696
## 364     9       2      0.22    38     3    0.08          29  0.570
## 365     3       1      0.33    14     2    0.14          10  0.075
## 366     5       0      0.00    37     0    0.00          33  0.490
## 367    13       6      0.46    71    13    0.18          52  0.521
## 368    15       6      0.40    90    12    0.13          70  0.649
## 369    14       4      0.29    58    14    0.24          35  0.499
## 370    27       8      0.30   140    25    0.18          97  0.703
## 371     7       2      0.29    34     2    0.06          28  0.550
## 372    22       9      0.41   132    22    0.17          98  0.609
## 373    26       9      0.35   132    23    0.17          93  0.610
## 374    27       8      0.30   135    20    0.15          97  0.677
## 375    13       3      0.23    61     5    0.08          47  0.557
## 376     4       1      0.25    19     1    0.05          16  0.404
## 377     6       3      0.50    21     6    0.29          13  0.299
## 378    10       5      0.50    42    13    0.31          25  0.474
## 379    10       3      0.30    51     7    0.14          38  0.550
## 380    16       8      0.50    75    18    0.24          50  0.457
## 381     5       3      0.60    21     7    0.33          13  0.378
## 382    18       5      0.28    91    12    0.13          67  0.544
## 383    13       6      0.46    68    19    0.28          43  0.582
## 384     9       5      0.56    50    21    0.42          26  0.478
## 385    18      11      0.61    91    40    0.44          45  0.612
## 386    11       4      0.36    61    12    0.20          43  0.558
## 387    12       5      0.42    74    17    0.23          51  0.602
## 388     9       4      0.44    49    14    0.29          31  0.357
## 389     4       1      0.25    26     2    0.08          22  0.402
## 390    12       7      0.58    67    23    0.34          39  0.625
## 391     5       4      0.80    19     9    0.47          10  0.383
## 392     2       0      0.00    17     0    0.00          16  0.210
## 393    28      13      0.46   126    30    0.24          82  0.569
## 394    22      13      0.59    95    36    0.38          51  0.506
## 395    18      11      0.61    79    29    0.37          44  0.319
## 396    16       7      0.44    75    16    0.21          51  0.625
## 397    13       8      0.62    65    25    0.38          35  0.531
## 398    16       9      0.56    79    26    0.33          47  0.548
## 399    24      13      0.54   102    33    0.32          58  0.599
## 400    11       6      0.55    57    17    0.30          36  0.607
## 401     2       0      0.00     8     0    0.00           7  0.181
## 402    14       9      0.64    63    22    0.35          37  0.466
## 403     4       1      0.25    26     4    0.15          20  0.260
## 404    24      10      0.42   114    22    0.19          79  0.566
## 405     9       3      0.33    46    10    0.22          31  0.608
## 406    22      12      0.55   119    35    0.29          75  0.611
## 407    15       9      0.60   102    23    0.23          74  0.609
## 408     9       6      0.67    48    13    0.27          33  0.621
## 409    21       9      0.43   112    19    0.17          81  0.637
## 410    16       7      0.44    94    18    0.19          68  0.661
## 411    13       8      0.62    56    24    0.43          28  0.550
## 412    10       3      0.30    59     9    0.15          44  0.615
## 413    17       6      0.35    92    15    0.16          67  0.665
## 414    18       9      0.50   105    25    0.24          72  0.484
## 415     2       2      1.00    13    10    0.77           3  0.113
## 416    20      10      0.50   114    30    0.26          75  0.503
## 417    15       3      0.20    94     5    0.05          78  0.555
## 418    22       7      0.32   116    21    0.18          81  0.424
## 419     7       3      0.43    36     6    0.17          27  0.399
## 420    14       7      0.50    72    20    0.28          46  0.479
## 421    11       4      0.36    53     8    0.15          39  0.537
## 422    13       7      0.54    62    18    0.29          39  0.458
## 423    11       5      0.45    46     8    0.17          33  0.453
## 424    10       6      0.60    60    22    0.37          35  0.459
## 425     4       2      0.50    22     9    0.41          12  0.232
## 426     8       4      0.50    30    10    0.33          17  0.358
## 427     5       3      0.60    27     8    0.30          18  0.243
## 428     3       0      0.00    11     0    0.00           9  0.170
## 429    22      10      0.45   117    30    0.26          76  0.569
## 430     4       1      0.25    14     1    0.07          11  0.138
## 431     8       4      0.50    36     9    0.25          24  0.277
## 432     5       4      0.80    17    10    0.59           7  0.140
## 433    24      17      0.71   120    51    0.42          63  0.479
## 434    10       4      0.40    49    11    0.22          33  0.450
## 435    25      16      0.64   136    60    0.44          68  0.528
## 436     7       4      0.57    36    13    0.36          21  0.527
## 437    11       3      0.27    50     9    0.18          34  0.502
## 438     9       5      0.56    40    17    0.42          20  0.382
## 439    12       5      0.42    58     9    0.16          43  0.520
## 440    22      13      0.59   122    42    0.34          72  0.456
## 441    20       5      0.25   111    17    0.15          80  0.556
## 442     9       4      0.44    39    10    0.26          25  0.233
## 443     3       0      0.00    12     0    0.00          10  0.270
## 444    21       7      0.33   120    21    0.18          86  0.538
## 445     9       2      0.22    49     4    0.08          39  0.397
## 446     3       1      0.33    14     4    0.29           9  0.106
## 447     6       6      1.00    25    17    0.68           9  0.142
## 448    26      17      0.65   122    44    0.36          70  0.390
## 449    15       6      0.40    80    21    0.26          51  0.467
## 450     3       0      0.00    13     0    0.00          11  0.283
## 451    14       4      0.29    99    12    0.12          78  0.496
## 452     5       1      0.20    36     1    0.03          32  0.472
## 453    14       5      0.36    69    14    0.20          47  0.577
## 454    24      12      0.50   112    36    0.32          65  0.496
## 455     6       3      0.50    29     7    0.24          20  0.416
## 456     3       2      0.67    16     5    0.31          11  0.117
## 457    12       7      0.58    58    16    0.28          38  0.257
## 458     6       3      0.50    37    13    0.35          22  0.170
## 459     6       2      0.33    32     6    0.19          23  0.457
## 460    17       5      0.29    92    11    0.12          70  0.598
## 461     6       4      0.67    37    14    0.38          22  0.342
## 462    11       3      0.27    47     7    0.15          32  0.428
## 463    20      14      0.70    97    40    0.41          52  0.401
## 464    11       8      0.73    51    26    0.51          22  0.156
## 465     8       1      0.12    30     2    0.07          22  0.382
## 466     6       1      0.17    36     6    0.17          26  0.427
## 467    17       9      0.53    92    33    0.36          52  0.573
# Sum every column of the per-sample statistics `s` printed above.
# NOTE(review): `effective` and `benefit` look like per-row ratios
# (matches/words and saved/chars), and `avg.ms` is presumably a per-row
# average, so their column sums are not overall rates -- confirm whether
# totals-based ratios or column means were intended for those columns.
colSums(s)
##       words     matches   effective       chars       saved     benefit 
##   10429.000    4713.000     199.960   57132.000   13464.000     111.100 
## predictions      avg.ms 
##   38419.000     243.967

Appendix A

# Print the R version, platform, locale, and attached/loaded packages
# for this run so the results can be reproduced.
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 14393)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] wordcloud_2.5      RColorBrewer_1.1-2 tm_0.6-2          
## [4] NLP_0.1-9         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.6     digest_0.6.9    slam_0.1-38     formatR_1.4    
##  [5] magrittr_1.5    evaluate_0.9    stringi_1.1.1   rmarkdown_1.0  
##  [9] tools_3.3.1     stringr_1.0.0   yaml_2.1.13     parallel_3.3.1 
## [13] htmltools_0.3.5 knitr_1.13