This script exercises the SmartType model and library and calculates performance statistics.
Set execution environment.
# Run from the project's prod folder; the relative paths below are resolved from here.
setwd("~/Academic/DataScience/Capstone/prod")
# Fixed seed so the sampled test set is repeatable (chosen to differ from the training run).
set.seed(9035768)
# Load the prediction library routines.
source("SmartTypeLibrary.R")
# Folder holding the raw corpus, and the data files to sample from.
raw.data.folder <- "../data/en_US"
file.list <- c(
  "en_US.blogs.txt",
  "en_US.news.txt",
  "en_US.twitter.txt"
)
# Fraction of the raw lines to include in the test sample.
test.data.fraction <- 1e-4
# Training fraction that identifies which trained model is evaluated.
train.data.fraction <- 0.1
Load the trained model into the SmartType library.
# Model path has the form models/<training percentage>/ngram/model.rds.
model.file <- file.path(
  "models",
  toString(train.data.fraction * 100),
  "ngram",
  "model.rds"
)
# Hand the model file to the SmartType library.
SmartType.initialize(model.file)
Perform spot checks to verify the most recent changes.
# Spot checks: call SmartType.predict() with progressively shorter contexts
# (three words, two words, one word, empty string). The second argument drops
# from 4 to 1 in step with the context length — presumably the n-gram order to
# use; confirm against SmartTypeLibrary.R. The "##" lines are the output that
# was recorded when this report was generated.
# Three-word context.
SmartType.predict("thanks for the",4,showwc=FALSE)
## [1] "follow" "rt" "mention" "ff"
## [5] "shout" "retweet" "rts" "support"
## [9] "tweet" "heads" "kind" "love"
## [13] "shoutout" "tip" "great" "info"
## [17] "link" "recommendation" "invite" "add"
## [21] "follows" "so" "help" "reminder"
## [25] "suggestion" "update" "feedback" "mentions"
## [29] "response" "retweets" "awesome" "birthday"
## [33] "memories" "post" "tweets" "advice"
## [37] "article" "good" "props" "share"
# Two-word context.
SmartType.predict("for the",3,showwc=FALSE)
## [1] "first" "follow" "rt" "next" "rest" "last" "new"
## [8] "past" "day" "most" "mention" "ff" "second" "best"
## [15] "shout" "weekend" "retweet" "future" "support" "city" "season"
## [22] "sake" "year" "night" "love" "same" "people" "state"
## [29] "rts" "us" "record" "th" "show" "week" "game"
## [36] "summer" "team" "big" "job" "tweet" "whole" "kids"
## [43] "third" "great" "right" "entire" "kind" "good" "time"
## [50] "better"
# One-word context.
SmartType.predict("the",2,showwc=FALSE)
## [1] "first" "same" "best" "new" "most" "world"
## [7] "way" "last" "other" "time" "next" "only"
## [13] "end" "day" "state" "city" "us" "right"
## [19] "past" "game" "second" "top" "one" "rest"
## [25] "house" "th" "year" "two" "whole" "people"
## [31] "fact" "big" "company" "team" "season" "book"
## [37] "country" "s" "story" "show" "middle" "united"
## [43] "case" "follow" "final" "national" "word" "future"
## [49] "night" "more"
# Empty context.
SmartType.predict("",1,showwc=FALSE)
## [1] "the" "and" "for" "that" "you" "with" "was" "this"
## [9] "have" "are" "but" "not" "from" "all" "they" "will"
## [17] "its" "said" "just" "your" "his" "out" "about" "one"
## [25] "what" "like" "when" "has" "who" "can" "more" "had"
## [33] "get" "were" "time" "there" "her" "their" "would" "some"
## [41] "new" "she" "our" "been" "good" "now" "how" "day"
## [49] "know" "them"
# Quiz 2: predictions for two three-word contexts, with the recorded output
# shown on the "##" lines.
# NOTE(review): showwc=TRUE is used here where the earlier spot checks pass
# FALSE — presumably it also draws a word cloud of the candidates (wordcloud is
# attached in the recorded session); confirm against SmartTypeLibrary.R.
SmartType.predict("with his little",4,showwc=TRUE)
## [1] "bit" "more" "girl" "too" "to"
## [6] "time" "things" "boy" "while" "and"
## [11] "of" "ones" "guy" "over" "better"
## [16] "thing" "one" "as" "or" "brother"
## [21] "girls" "different" "less" "sister" "about"
## [26] "in" "something" "late" "help" "longer"
## [31] "i" "kid" "like" "extra" "league"
## [36] "man" "people" "red" "kids" "piece"
## [41] "did" "baby" "italy" "town" "boys"
## [46] "break" "but" "later" "nervous" "miss"
SmartType.predict("you must be",4,showwc=TRUE)
## [1] "a" "the" "in" "done" "able"
## [6] "at" "on" "so" "willing" "kept"
## [11] "one" "an" "approved" "nice" "taken"
## [16] "very" "following" "made" "paid" "prepared"
## [21] "something" "to" "doing" "good" "like"
## [26] "more"
# Quiz 3: predictions for three further three-word contexts, with the recorded
# output shown on the "##" lines.
SmartType.predict("artic monkeys this",4,showwc=TRUE)
## [1] "is" "year" "week" "morning" "was"
## [6] "weekend" "time" "one" "season" "month"
## [11] "point" "summer" "will" "book" "blog"
## [16] "and" "post" "day" "way" "game"
## [21] "years" "i" "to" "world" "new"
## [26] "has" "case" "afternoon" "past" "in"
## [31] "guy" "would" "for" "a" "story"
## [36] "movie" "the" "country" "as" "but"
## [41] "place" "weeks" "little" "spring" "project"
## [46] "series" "evening" "song" "saturday" "means"
SmartType.predict("to take a",4,showwc=TRUE)
## [1] "look" "picture" "few" "break" "lead" "nap" "shower"
## [8] "photo" "moment" "stand" "closer" "trip" "chance" "step"
## [15] "test"
# NOTE(review): the output recorded for this context is word-for-word the same
# as the output recorded earlier for the empty context, which suggests the
# model had nothing to match on "sandlers" and fell back to its most frequent
# single words — confirm against SmartTypeLibrary.R.
SmartType.predict("of adam sandlers",4,showwc=TRUE)
## [1] "the" "and" "for" "that" "you" "with" "was" "this"
## [9] "have" "are" "but" "not" "from" "all" "they" "will"
## [17] "its" "said" "just" "your" "his" "out" "about" "one"
## [25] "what" "like" "when" "has" "who" "can" "more" "had"
## [33] "get" "were" "time" "there" "her" "their" "would" "some"
## [41] "new" "she" "our" "been" "good" "now" "how" "day"
## [49] "know" "them"
# Spot checks of SmartType.evaluate(): each call prints one row of statistics
# per input string (columns: words, matches, effective, chars, saved, benefit,
# predictions, avg.ms), as shown on the recorded "##" lines.
# Sentence with a capital letter and a trailing period.
SmartType.evaluate("Could be added to the gallery wall.")
## words matches effective chars saved benefit predictions avg.ms
## 1 7 4 0.57 35 10 0.29 22 0.291
# Same sentence with an upper-case word and a stand-alone digit inserted.
# NOTE(review): the recorded word count stays at 7, which suggests the digit is
# not counted as a word — confirm against SmartTypeLibrary.R.
SmartType.evaluate("could be added to THE 9 gallery wall")
## words matches effective chars saved benefit predictions avg.ms
## 1 7 4 0.57 36 10 0.28 22 0.275
# A character vector of two documents yields two result rows.
SmartType.evaluate(c("could be added to the gallery wall","Could be added to the gallery wall."))
## words matches effective chars saved benefit predictions avg.ms
## 1 7 4 0.57 34 10 0.29 22 0.280
## 2 7 4 0.57 35 10 0.29 22 0.276
Perform more rigorous validation. Use a sample of the original data as a test set. Sample documents from blogs, news, and tweets.
# Read one raw data file and return a random subset of its lines.
#
# pathname:   path of the file to read.
# fraction:   each line is kept when its own uniform draw is <= fraction.
# chunk.size: number of lines read per readLines() call.
#
# Exactly one uniform number is drawn per line, in file order, so for a given
# seed the selected lines do not depend on chunk.size (chunk.size = 1
# reproduces a line-by-line read with one runif(1) per line).
sample.file.lines <- function(pathname, fraction, chunk.size = 10000L) {
  # Binary mode, as before; the connection is closed even if a read fails.
  con <- file(pathname, "rb")
  on.exit(close(con), add = TRUE)
  kept <- list()
  repeat {
    chunk <- readLines(con, n = chunk.size, warn = FALSE, skipNul = TRUE)
    if (length(chunk) == 0) {
      break # End of file.
    }
    keep <- runif(length(chunk)) <= fraction # Randomly include/discard.
    if (any(keep)) {
      # Appends happen once per chunk with a hit, not once per kept line.
      kept[[length(kept) + 1L]] <- chunk[keep]
    }
  }
  if (length(kept) == 0) {
    character(0)
  } else {
    unlist(kept, use.names = FALSE)
  }
}

# Sample every data file in file.list order and pool the kept lines into docs.
docs <- unlist(
  lapply(
    file.path(raw.data.folder, file.list),
    sample.file.lines,
    fraction = test.data.fraction
  ),
  use.names = FALSE
)
if (is.null(docs)) {
  docs <- character(0) # No files listed: keep docs a character vector.
}
Pre-process the validation samples in the same way that the training data was pre-processed.
# Clean each sampled document: drop digits, drop punctuation, lower-case, and
# collapse runs of whitespace, in that order.
# vapply() with a character(1) template guarantees a character vector result
# (sapply() would return list() for an empty sample) and fails loudly if a
# cleaning step ever returns something other than one string per document.
docs <- vapply(docs, removeNumbers, character(1), USE.NAMES = FALSE)
docs <- vapply(docs, removePunctuation, character(1), USE.NAMES = FALSE)
docs <- tolower(docs) # Base tolower() is already vectorised.
docs <- vapply(docs, stripWhitespace, character(1), USE.NAMES = FALSE)
Run the SmartType.evaluate() method on the test data set.
# Score every sampled document and keep the result in s; the wrapping
# parentheses make the assignment print the per-document statistics.
(s <- SmartType.evaluate(docs))
## words matches effective chars saved benefit predictions avg.ms
## 1 35 19 0.54 195 48 0.25 132 0.565
## 2 6 1 0.17 46 1 0.02 41 0.392
## 3 29 15 0.52 171 43 0.25 115 0.537
## 4 3 0 0.00 22 0 0.00 22 0.521
## 5 12 3 0.25 64 9 0.14 49 0.558
## 6 5 3 0.60 19 6 0.32 12 0.456
## 7 12 6 0.50 59 12 0.20 42 0.515
## 8 11 4 0.36 59 9 0.15 44 0.524
## 9 3 0 0.00 23 0 0.00 21 0.284
## 10 5 0 0.00 35 0 0.00 31 0.496
## 11 7 6 0.86 31 19 0.61 12 0.517
## 12 3 1 0.33 12 3 0.25 10 0.541
## 13 5 3 0.60 23 11 0.48 11 0.203
## 14 74 28 0.38 371 68 0.18 258 0.639
## 15 4 0 0.00 37 0 0.00 34 0.555
## 16 33 17 0.52 159 41 0.26 103 0.601
## 17 6 1 0.17 39 3 0.08 32 0.617
## 18 6 3 0.50 33 8 0.24 23 0.234
## 19 37 19 0.51 197 57 0.29 123 0.566
## 20 40 28 0.70 171 76 0.44 84 0.582
## 21 8 2 0.25 35 5 0.14 25 0.656
## 22 11 5 0.45 48 12 0.25 31 0.494
## 23 14 6 0.43 77 14 0.18 56 0.537
## 24 58 19 0.33 401 52 0.13 311 0.711
## 25 71 34 0.48 405 104 0.26 265 0.646
## 26 7 5 0.71 35 15 0.43 19 0.304
## 27 50 19 0.38 298 63 0.21 205 0.706
## 28 161 81 0.50 875 237 0.27 559 0.688
## 29 44 19 0.43 244 46 0.19 174 0.687
## 30 15 9 0.60 71 22 0.31 44 0.516
## 31 89 35 0.39 500 92 0.18 355 0.674
## 32 45 20 0.44 252 53 0.21 175 0.635
## 33 35 15 0.43 223 45 0.20 159 0.656
## 34 108 57 0.53 548 156 0.28 342 0.627
## 35 2 0 0.00 5 0 0.00 6 0.272
## 36 77 31 0.40 439 103 0.23 293 0.717
## 37 25 18 0.72 123 48 0.39 69 0.614
## 38 2 1 0.50 11 5 0.45 6 0.070
## 39 7 2 0.29 31 3 0.10 24 0.453
## 40 46 18 0.39 287 50 0.17 210 0.654
## 41 11 7 0.64 40 15 0.38 24 0.477
## 42 33 22 0.67 158 63 0.40 85 0.498
## 43 34 20 0.59 168 46 0.27 108 0.570
## 44 89 35 0.39 556 101 0.18 402 0.689
## 45 1 0 0.00 3 0 0.00 3 0.000
## 46 11 1 0.09 58 1 0.02 48 0.583
## 47 58 24 0.41 362 63 0.17 266 0.640
## 48 59 18 0.31 353 54 0.15 259 0.701
## 49 13 2 0.15 81 4 0.05 66 0.650
## 50 38 15 0.39 211 39 0.18 150 0.638
## 51 9 3 0.33 49 8 0.16 36 0.556
## 52 41 22 0.54 234 59 0.25 157 0.592
## 53 28 18 0.64 134 46 0.34 79 0.481
## 54 46 21 0.46 265 54 0.20 187 0.635
## 55 4 1 0.25 16 1 0.06 13 0.370
## 56 3 2 0.67 17 12 0.71 5 0.140
## 57 9 4 0.44 44 13 0.30 27 0.570
## 58 43 20 0.47 199 63 0.32 114 0.652
## 59 62 22 0.35 308 52 0.17 217 0.659
## 60 4 0 0.00 27 0 0.00 24 0.533
## 61 26 13 0.50 144 41 0.28 91 0.581
## 62 1 0 0.00 7 0 0.00 6 0.000
## 63 23 10 0.43 147 29 0.20 106 0.664
## 64 203 87 0.43 1174 220 0.19 839 0.682
## 65 19 7 0.37 118 15 0.13 92 0.558
## 66 43 16 0.37 263 40 0.15 197 0.671
## 67 4 2 0.50 19 5 0.26 13 0.112
## 68 25 13 0.52 134 43 0.32 82 0.689
## 69 36 17 0.47 211 53 0.25 140 0.662
## 70 5 2 0.40 17 5 0.29 12 0.526
## 71 2 0 0.00 14 0 0.00 13 0.095
## 72 61 31 0.51 346 80 0.23 237 0.634
## 73 6 2 0.33 37 5 0.14 29 0.383
## 74 18 11 0.61 81 28 0.35 47 0.446
## 75 13 5 0.38 62 11 0.18 44 0.570
## 76 4 0 0.00 26 0 0.00 22 0.282
## 77 28 12 0.43 156 32 0.21 109 0.641
## 78 5 0 0.00 32 0 0.00 28 0.481
## 79 139 59 0.42 730 134 0.18 517 0.664
## 80 20 6 0.30 120 22 0.18 87 0.653
## 81 113 66 0.58 605 188 0.31 371 0.634
## 82 7 5 0.71 30 15 0.50 14 0.321
## 83 18 10 0.56 78 28 0.36 43 0.545
## 84 15 9 0.60 78 32 0.41 41 0.380
## 85 24 8 0.33 128 18 0.14 95 0.539
## 86 143 78 0.55 716 209 0.29 443 0.611
## 87 45 16 0.36 281 47 0.17 206 0.719
## 88 39 20 0.51 197 62 0.31 117 0.531
## 89 3 0 0.00 8 0 0.00 8 0.486
## 90 76 26 0.34 452 67 0.15 336 0.658
## 91 113 35 0.31 696 99 0.14 520 0.736
## 92 51 17 0.33 366 56 0.15 277 0.688
## 93 26 9 0.35 169 21 0.12 132 0.707
## 94 14 3 0.21 83 14 0.17 59 0.682
## 95 31 17 0.55 153 48 0.31 92 0.482
## 96 53 20 0.38 338 69 0.20 237 0.650
## 97 111 56 0.50 622 159 0.26 409 0.716
## 98 13 9 0.69 69 27 0.39 39 0.379
## 99 37 18 0.49 202 47 0.23 137 0.663
## 100 54 28 0.52 274 76 0.28 173 0.627
## 101 10 4 0.40 48 6 0.12 37 0.443
## 102 39 19 0.49 201 53 0.26 129 0.584
## 103 44 25 0.57 250 79 0.32 153 0.688
## 104 49 26 0.53 242 70 0.29 150 0.610
## 105 21 10 0.48 118 28 0.24 80 0.608
## 106 70 35 0.50 391 104 0.27 253 0.638
## 107 22 11 0.50 120 31 0.26 79 0.634
## 108 32 13 0.41 209 38 0.18 152 0.665
## 109 3 0 0.00 23 0 0.00 20 0.386
## 110 12 5 0.42 80 19 0.24 55 0.672
## 111 2 1 0.50 8 1 0.12 7 0.057
## 112 5 0 0.00 27 0 0.00 25 0.580
## 113 11 3 0.27 79 17 0.22 55 0.619
## 114 6 1 0.17 27 1 0.04 22 0.572
## 115 36 15 0.42 193 56 0.29 117 0.632
## 116 40 17 0.42 246 42 0.17 182 0.678
## 117 1 0 0.00 15 0 0.00 15 0.000
## 118 34 14 0.41 203 44 0.22 140 0.633
## 119 27 18 0.67 154 58 0.38 88 0.544
## 120 26 11 0.42 153 32 0.21 107 0.659
## 121 28 7 0.25 198 20 0.10 158 0.696
## 122 45 19 0.42 258 56 0.22 177 0.709
## 123 20 4 0.20 131 11 0.08 104 0.671
## 124 21 9 0.43 121 25 0.21 85 0.577
## 125 29 12 0.41 174 33 0.19 125 0.604
## 126 22 8 0.36 129 28 0.22 88 0.629
## 127 3 0 0.00 12 0 0.00 10 0.353
## 128 33 16 0.48 198 50 0.25 132 0.611
## 129 54 27 0.50 296 70 0.24 200 0.616
## 130 12 3 0.25 69 7 0.10 54 0.542
## 131 49 21 0.43 303 79 0.26 197 0.627
## 132 47 22 0.47 277 72 0.26 181 0.698
## 133 38 9 0.24 231 31 0.13 172 0.704
## 134 38 13 0.34 207 33 0.16 152 0.690
## 135 23 13 0.57 125 35 0.28 81 0.497
## 136 13 5 0.38 79 20 0.25 54 0.705
## 137 39 16 0.41 232 38 0.16 172 0.650
## 138 59 32 0.54 342 99 0.29 217 0.704
## 139 66 25 0.38 402 71 0.18 291 0.720
## 140 42 22 0.52 199 54 0.27 126 0.612
## 141 44 21 0.48 244 52 0.21 170 0.592
## 142 14 8 0.57 77 29 0.38 43 0.564
## 143 6 1 0.17 38 3 0.08 31 0.518
## 144 2 0 0.00 8 0 0.00 7 0.059
## 145 33 13 0.39 208 46 0.22 143 0.740
## 146 72 32 0.44 414 86 0.21 289 0.707
## 147 32 12 0.38 193 38 0.20 136 0.684
## 148 12 3 0.25 88 19 0.22 61 0.592
## 149 61 26 0.43 339 73 0.22 232 0.652
## 150 28 17 0.61 160 52 0.32 98 0.644
## 151 9 5 0.56 59 28 0.47 28 0.651
## 152 31 12 0.39 174 26 0.15 130 0.617
## 153 35 17 0.49 180 42 0.23 121 0.636
## 154 41 19 0.46 247 46 0.19 180 0.657
## 155 60 25 0.42 367 69 0.19 264 0.643
## 156 30 18 0.60 174 69 0.40 94 0.554
## 157 27 11 0.41 160 35 0.22 110 0.581
## 158 89 40 0.45 493 115 0.23 330 0.674
## 159 22 15 0.68 106 44 0.42 56 0.499
## 160 38 28 0.74 168 82 0.49 77 0.506
## 161 22 9 0.41 116 26 0.22 80 0.679
## 162 26 7 0.27 156 19 0.12 119 0.689
## 163 37 13 0.35 217 44 0.20 150 0.681
## 164 11 9 0.82 50 21 0.42 28 0.443
## 165 34 14 0.41 171 33 0.19 119 0.581
## 166 52 25 0.48 296 76 0.26 194 0.615
## 167 38 17 0.45 215 51 0.24 144 0.630
## 168 42 16 0.38 246 46 0.19 175 0.670
## 169 4 0 0.00 33 0 0.00 30 0.473
## 170 30 13 0.43 169 32 0.19 121 0.652
## 171 20 8 0.40 115 21 0.18 83 0.560
## 172 117 39 0.33 747 137 0.18 533 0.757
## 173 33 19 0.58 159 56 0.35 90 0.539
## 174 75 24 0.32 410 72 0.18 288 0.737
## 175 53 19 0.36 321 60 0.19 228 0.729
## 176 49 36 0.73 232 109 0.47 111 0.461
## 177 27 6 0.22 153 12 0.08 120 0.724
## 178 39 21 0.54 224 68 0.30 139 0.683
## 179 50 29 0.58 295 112 0.38 163 0.663
## 180 10 4 0.40 67 14 0.21 48 0.641
## 181 38 19 0.50 201 46 0.23 137 0.687
## 182 44 27 0.61 223 79 0.35 127 0.533
## 183 99 62 0.63 499 166 0.33 297 0.603
## 184 24 11 0.46 151 45 0.30 94 0.672
## 185 76 29 0.38 414 81 0.20 287 0.621
## 186 109 51 0.47 624 163 0.26 404 0.679
## 187 43 13 0.30 248 45 0.18 174 0.689
## 188 41 23 0.56 240 63 0.26 159 0.652
## 189 24 7 0.29 157 21 0.13 120 0.719
## 190 23 11 0.48 140 35 0.25 94 0.562
## 191 34 18 0.53 204 73 0.36 116 0.595
## 192 22 5 0.23 120 14 0.12 90 0.700
## 193 63 20 0.32 357 50 0.14 265 0.716
## 194 77 25 0.32 497 66 0.13 380 0.692
## 195 59 27 0.46 308 67 0.22 210 0.639
## 196 32 12 0.38 182 43 0.24 120 0.594
## 197 99 51 0.52 568 151 0.27 370 0.662
## 198 83 44 0.53 453 129 0.28 286 0.625
## 199 44 19 0.43 275 58 0.21 193 0.723
## 200 45 20 0.44 253 60 0.24 169 0.685
## 201 18 10 0.56 112 35 0.31 70 0.597
## 202 5 0 0.00 26 0 0.00 22 0.603
## 203 12 2 0.17 96 3 0.03 84 0.728
## 204 8 4 0.50 51 17 0.33 31 0.429
## 205 18 9 0.50 89 31 0.35 50 0.606
## 206 11 5 0.45 53 11 0.21 36 0.589
## 207 13 8 0.62 57 23 0.40 30 0.609
## 208 8 5 0.62 36 12 0.33 22 0.363
## 209 26 12 0.46 126 24 0.19 89 0.660
## 210 10 2 0.20 63 7 0.11 49 0.683
## 211 10 5 0.50 52 23 0.44 25 0.626
## 212 10 4 0.40 55 11 0.20 39 0.572
## 213 4 2 0.50 17 7 0.41 9 0.392
## 214 6 4 0.67 27 10 0.37 16 0.191
## 215 11 4 0.36 55 8 0.15 40 0.562
## 216 13 9 0.69 75 35 0.47 36 0.447
## 217 20 11 0.55 85 29 0.34 48 0.551
## 218 8 4 0.50 45 12 0.27 30 0.430
## 219 13 8 0.62 66 33 0.50 28 0.494
## 220 5 2 0.40 22 4 0.18 16 0.285
## 221 11 4 0.36 55 11 0.20 38 0.463
## 222 10 3 0.30 52 4 0.08 42 0.596
## 223 4 3 0.75 16 6 0.38 10 0.189
## 224 23 8 0.35 130 25 0.19 91 0.610
## 225 5 2 0.40 32 4 0.12 26 0.272
## 226 13 4 0.31 65 15 0.23 42 0.685
## 227 9 4 0.44 46 8 0.17 34 0.576
## 228 20 6 0.30 111 17 0.15 81 0.669
## 229 6 1 0.17 31 1 0.03 26 0.472
## 230 15 10 0.67 74 31 0.42 39 0.588
## 231 6 5 0.83 26 13 0.50 13 0.235
## 232 13 6 0.46 56 14 0.25 36 0.434
## 233 15 5 0.33 58 12 0.21 39 0.615
## 234 21 5 0.24 115 14 0.12 86 0.635
## 235 5 2 0.40 31 9 0.29 20 0.316
## 236 23 12 0.52 111 29 0.26 72 0.505
## 237 14 8 0.57 69 27 0.39 37 0.561
## 238 11 7 0.64 53 23 0.43 27 0.474
## 239 15 6 0.40 75 19 0.25 48 0.508
## 240 8 4 0.50 33 8 0.24 22 0.404
## 241 9 3 0.33 46 6 0.13 35 0.535
## 242 5 3 0.60 30 7 0.23 22 0.415
## 243 21 13 0.62 110 48 0.44 55 0.696
## 244 14 3 0.21 66 9 0.14 47 0.609
## 245 10 3 0.30 40 5 0.12 29 0.473
## 246 10 6 0.60 49 17 0.35 29 0.516
## 247 2 0 0.00 10 0 0.00 9 0.047
## 248 19 8 0.42 94 18 0.19 66 0.608
## 249 20 13 0.65 89 36 0.40 47 0.527
## 250 10 5 0.50 53 14 0.26 34 0.611
## 251 13 6 0.46 61 15 0.25 40 0.554
## 252 7 1 0.14 40 4 0.10 31 0.455
## 253 8 5 0.62 35 10 0.29 23 0.299
## 254 19 15 0.79 83 42 0.51 37 0.445
## 255 2 0 0.00 8 0 0.00 7 0.060
## 256 6 1 0.17 39 3 0.08 32 0.740
## 257 12 5 0.42 65 17 0.26 42 0.477
## 258 17 7 0.41 87 16 0.18 62 0.670
## 259 7 4 0.57 35 13 0.37 19 0.313
## 260 8 1 0.12 42 4 0.10 32 0.581
## 261 10 8 0.80 35 20 0.57 14 0.259
## 262 4 3 0.75 23 12 0.52 11 0.073
## 263 6 2 0.33 32 3 0.09 26 0.343
## 264 5 1 0.20 30 2 0.07 25 0.634
## 265 7 4 0.57 36 8 0.22 26 0.350
## 266 17 10 0.59 84 34 0.40 44 0.606
## 267 19 9 0.47 114 26 0.23 79 0.582
## 268 13 4 0.31 62 15 0.24 39 0.617
## 269 12 7 0.58 53 17 0.32 32 0.495
## 270 16 4 0.25 93 9 0.10 73 0.546
## 271 5 1 0.20 47 3 0.06 41 0.347
## 272 16 13 0.81 67 34 0.51 31 0.414
## 273 28 19 0.68 137 61 0.45 68 0.585
## 274 23 11 0.48 112 28 0.25 73 0.665
## 275 16 8 0.50 73 18 0.25 48 0.468
## 276 12 5 0.42 62 12 0.19 44 0.577
## 277 16 10 0.62 81 34 0.42 42 0.628
## 278 12 7 0.58 69 26 0.38 39 0.673
## 279 8 1 0.12 43 2 0.05 34 0.797
## 280 9 4 0.44 44 10 0.23 30 0.617
## 281 6 2 0.33 37 5 0.14 29 0.576
## 282 27 13 0.48 132 38 0.29 81 0.847
## 283 17 7 0.41 101 22 0.22 70 0.687
## 284 7 4 0.57 43 13 0.30 28 0.565
## 285 2 1 0.50 13 6 0.46 7 0.059
## 286 23 13 0.57 118 38 0.32 71 0.597
## 287 19 7 0.37 100 19 0.19 70 0.678
## 288 11 6 0.55 60 24 0.40 32 0.563
## 289 19 8 0.42 99 20 0.20 69 0.649
## 290 23 8 0.35 130 22 0.17 94 0.623
## 291 5 3 0.60 20 7 0.35 12 0.345
## 292 10 4 0.40 46 13 0.28 28 0.625
## 293 18 7 0.39 99 12 0.12 77 0.627
## 294 16 8 0.50 76 27 0.36 42 0.351
## 295 17 9 0.53 87 25 0.29 55 0.562
## 296 10 4 0.40 63 15 0.24 43 0.403
## 297 16 10 0.62 70 29 0.41 36 0.413
## 298 9 5 0.56 46 22 0.48 21 0.630
## 299 12 7 0.58 51 17 0.33 30 0.512
## 300 2 1 0.50 13 7 0.54 6 0.068
## 301 18 5 0.28 103 18 0.17 72 0.659
## 302 18 10 0.56 87 26 0.30 54 0.617
## 303 7 2 0.29 36 9 0.25 23 0.369
## 304 12 6 0.50 50 12 0.24 33 0.518
## 305 7 3 0.43 32 9 0.28 20 0.552
## 306 7 1 0.14 48 4 0.08 39 0.556
## 307 17 5 0.29 83 11 0.13 60 0.660
## 308 11 5 0.45 55 15 0.27 35 0.501
## 309 14 7 0.50 60 19 0.32 35 0.607
## 310 12 4 0.33 72 10 0.14 55 0.567
## 311 9 2 0.22 41 8 0.20 26 0.573
## 312 7 2 0.29 32 6 0.19 21 0.334
## 313 21 10 0.48 96 25 0.26 61 0.681
## 314 17 5 0.29 98 14 0.14 73 0.697
## 315 15 9 0.60 74 28 0.38 41 0.633
## 316 16 8 0.50 89 27 0.30 55 0.741
## 317 10 4 0.40 43 10 0.23 28 0.433
## 318 25 14 0.56 124 40 0.32 74 0.610
## 319 7 4 0.57 28 10 0.36 16 0.431
## 320 14 9 0.64 72 37 0.51 31 0.553
## 321 9 5 0.56 45 20 0.44 22 0.574
## 322 21 11 0.52 98 30 0.31 59 0.615
## 323 12 5 0.42 64 9 0.14 49 0.501
## 324 10 5 0.50 52 13 0.25 35 0.405
## 325 3 1 0.33 17 2 0.12 14 0.332
## 326 12 9 0.75 56 30 0.54 24 0.598
## 327 4 3 0.75 13 3 0.23 10 0.343
## 328 20 10 0.50 112 35 0.31 68 0.516
## 329 24 9 0.38 134 21 0.16 99 0.724
## 330 11 3 0.27 86 9 0.10 70 0.678
## 331 8 6 0.75 44 21 0.48 22 0.320
## 332 15 8 0.53 78 24 0.31 48 0.608
## 333 4 1 0.25 28 3 0.11 23 0.517
## 334 12 3 0.25 60 9 0.15 43 0.535
## 335 7 4 0.57 27 9 0.33 16 0.339
## 336 23 10 0.43 111 23 0.21 76 0.611
## 337 7 4 0.57 42 11 0.26 29 0.341
## 338 11 8 0.73 49 19 0.39 28 0.468
## 339 19 10 0.53 80 19 0.24 53 0.546
## 340 14 6 0.43 59 15 0.25 37 0.521
## 341 3 0 0.00 19 0 0.00 17 0.305
## 342 3 2 0.67 13 4 0.31 9 0.213
## 343 5 3 0.60 27 12 0.44 14 0.064
## 344 6 4 0.67 28 13 0.46 14 0.334
## 345 7 4 0.57 32 11 0.34 18 0.627
## 346 7 3 0.43 36 10 0.28 23 0.495
## 347 23 15 0.65 118 50 0.42 61 0.565
## 348 6 4 0.67 22 6 0.27 15 0.507
## 349 8 4 0.50 43 9 0.21 31 0.595
## 350 2 0 0.00 11 0 0.00 10 0.042
## 351 7 4 0.57 39 10 0.26 27 0.494
## 352 7 3 0.43 39 9 0.23 27 0.443
## 353 9 4 0.44 50 16 0.32 30 0.488
## 354 17 6 0.35 106 18 0.17 78 0.687
## 355 15 7 0.47 82 15 0.18 60 0.591
## 356 2 0 0.00 12 0 0.00 10 0.214
## 357 5 1 0.20 25 3 0.12 21 0.667
## 358 23 7 0.30 125 18 0.14 92 0.688
## 359 7 3 0.43 32 6 0.19 23 0.530
## 360 4 2 0.50 19 7 0.37 11 0.316
## 361 30 18 0.60 138 41 0.30 86 0.632
## 362 16 8 0.50 79 19 0.24 53 0.574
## 363 18 5 0.28 104 16 0.15 76 0.696
## 364 9 2 0.22 38 3 0.08 29 0.570
## 365 3 1 0.33 14 2 0.14 10 0.075
## 366 5 0 0.00 37 0 0.00 33 0.490
## 367 13 6 0.46 71 13 0.18 52 0.521
## 368 15 6 0.40 90 12 0.13 70 0.649
## 369 14 4 0.29 58 14 0.24 35 0.499
## 370 27 8 0.30 140 25 0.18 97 0.703
## 371 7 2 0.29 34 2 0.06 28 0.550
## 372 22 9 0.41 132 22 0.17 98 0.609
## 373 26 9 0.35 132 23 0.17 93 0.610
## 374 27 8 0.30 135 20 0.15 97 0.677
## 375 13 3 0.23 61 5 0.08 47 0.557
## 376 4 1 0.25 19 1 0.05 16 0.404
## 377 6 3 0.50 21 6 0.29 13 0.299
## 378 10 5 0.50 42 13 0.31 25 0.474
## 379 10 3 0.30 51 7 0.14 38 0.550
## 380 16 8 0.50 75 18 0.24 50 0.457
## 381 5 3 0.60 21 7 0.33 13 0.378
## 382 18 5 0.28 91 12 0.13 67 0.544
## 383 13 6 0.46 68 19 0.28 43 0.582
## 384 9 5 0.56 50 21 0.42 26 0.478
## 385 18 11 0.61 91 40 0.44 45 0.612
## 386 11 4 0.36 61 12 0.20 43 0.558
## 387 12 5 0.42 74 17 0.23 51 0.602
## 388 9 4 0.44 49 14 0.29 31 0.357
## 389 4 1 0.25 26 2 0.08 22 0.402
## 390 12 7 0.58 67 23 0.34 39 0.625
## 391 5 4 0.80 19 9 0.47 10 0.383
## 392 2 0 0.00 17 0 0.00 16 0.210
## 393 28 13 0.46 126 30 0.24 82 0.569
## 394 22 13 0.59 95 36 0.38 51 0.506
## 395 18 11 0.61 79 29 0.37 44 0.319
## 396 16 7 0.44 75 16 0.21 51 0.625
## 397 13 8 0.62 65 25 0.38 35 0.531
## 398 16 9 0.56 79 26 0.33 47 0.548
## 399 24 13 0.54 102 33 0.32 58 0.599
## 400 11 6 0.55 57 17 0.30 36 0.607
## 401 2 0 0.00 8 0 0.00 7 0.181
## 402 14 9 0.64 63 22 0.35 37 0.466
## 403 4 1 0.25 26 4 0.15 20 0.260
## 404 24 10 0.42 114 22 0.19 79 0.566
## 405 9 3 0.33 46 10 0.22 31 0.608
## 406 22 12 0.55 119 35 0.29 75 0.611
## 407 15 9 0.60 102 23 0.23 74 0.609
## 408 9 6 0.67 48 13 0.27 33 0.621
## 409 21 9 0.43 112 19 0.17 81 0.637
## 410 16 7 0.44 94 18 0.19 68 0.661
## 411 13 8 0.62 56 24 0.43 28 0.550
## 412 10 3 0.30 59 9 0.15 44 0.615
## 413 17 6 0.35 92 15 0.16 67 0.665
## 414 18 9 0.50 105 25 0.24 72 0.484
## 415 2 2 1.00 13 10 0.77 3 0.113
## 416 20 10 0.50 114 30 0.26 75 0.503
## 417 15 3 0.20 94 5 0.05 78 0.555
## 418 22 7 0.32 116 21 0.18 81 0.424
## 419 7 3 0.43 36 6 0.17 27 0.399
## 420 14 7 0.50 72 20 0.28 46 0.479
## 421 11 4 0.36 53 8 0.15 39 0.537
## 422 13 7 0.54 62 18 0.29 39 0.458
## 423 11 5 0.45 46 8 0.17 33 0.453
## 424 10 6 0.60 60 22 0.37 35 0.459
## 425 4 2 0.50 22 9 0.41 12 0.232
## 426 8 4 0.50 30 10 0.33 17 0.358
## 427 5 3 0.60 27 8 0.30 18 0.243
## 428 3 0 0.00 11 0 0.00 9 0.170
## 429 22 10 0.45 117 30 0.26 76 0.569
## 430 4 1 0.25 14 1 0.07 11 0.138
## 431 8 4 0.50 36 9 0.25 24 0.277
## 432 5 4 0.80 17 10 0.59 7 0.140
## 433 24 17 0.71 120 51 0.42 63 0.479
## 434 10 4 0.40 49 11 0.22 33 0.450
## 435 25 16 0.64 136 60 0.44 68 0.528
## 436 7 4 0.57 36 13 0.36 21 0.527
## 437 11 3 0.27 50 9 0.18 34 0.502
## 438 9 5 0.56 40 17 0.42 20 0.382
## 439 12 5 0.42 58 9 0.16 43 0.520
## 440 22 13 0.59 122 42 0.34 72 0.456
## 441 20 5 0.25 111 17 0.15 80 0.556
## 442 9 4 0.44 39 10 0.26 25 0.233
## 443 3 0 0.00 12 0 0.00 10 0.270
## 444 21 7 0.33 120 21 0.18 86 0.538
## 445 9 2 0.22 49 4 0.08 39 0.397
## 446 3 1 0.33 14 4 0.29 9 0.106
## 447 6 6 1.00 25 17 0.68 9 0.142
## 448 26 17 0.65 122 44 0.36 70 0.390
## 449 15 6 0.40 80 21 0.26 51 0.467
## 450 3 0 0.00 13 0 0.00 11 0.283
## 451 14 4 0.29 99 12 0.12 78 0.496
## 452 5 1 0.20 36 1 0.03 32 0.472
## 453 14 5 0.36 69 14 0.20 47 0.577
## 454 24 12 0.50 112 36 0.32 65 0.496
## 455 6 3 0.50 29 7 0.24 20 0.416
## 456 3 2 0.67 16 5 0.31 11 0.117
## 457 12 7 0.58 58 16 0.28 38 0.257
## 458 6 3 0.50 37 13 0.35 22 0.170
## 459 6 2 0.33 32 6 0.19 23 0.457
## 460 17 5 0.29 92 11 0.12 70 0.598
## 461 6 4 0.67 37 14 0.38 22 0.342
## 462 11 3 0.27 47 7 0.15 32 0.428
## 463 20 14 0.70 97 40 0.41 52 0.401
## 464 11 8 0.73 51 26 0.51 22 0.156
## 465 8 1 0.12 30 2 0.07 22 0.382
## 466 6 1 0.17 36 6 0.17 26 0.427
## 467 17 9 0.53 92 33 0.36 52 0.573
# Column totals over all evaluated documents. Only the count columns (words,
# matches, chars, saved, predictions) are meaningful as sums: in the recorded
# rows effective equals matches/words and benefit equals saved/chars (e.g.
# 19/35 = 0.54 and 48/195 = 0.25 in row 1), so the summed effective, benefit
# and avg.ms values below are sums of per-document figures, not overall rates.
colSums(s)
## words matches effective chars saved benefit
## 10429.000 4713.000 199.960 57132.000 13464.000 111.100
## predictions avg.ms
## 38419.000 243.967
# Overall rates pooled across documents, computed from the count totals. With
# the totals recorded above this gives about 0.45 (effective) and 0.24
# (benefit).
totals <- colSums(s)
round(
  c(
    effective = totals[["matches"]] / totals[["words"]],
    benefit = totals[["saved"]] / totals[["chars"]]
  ),
  2
)
# Print the R version, platform, locale and package versions used for this run,
# so the results above can be reproduced.
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 14393)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] wordcloud_2.5 RColorBrewer_1.1-2 tm_0.6-2
## [4] NLP_0.1-9
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.6 digest_0.6.9 slam_0.1-38 formatR_1.4
## [5] magrittr_1.5 evaluate_0.9 stringi_1.1.1 rmarkdown_1.0
## [9] tools_3.3.1 stringr_1.0.0 yaml_2.1.13 parallel_3.3.1
## [13] htmltools_0.3.5 knitr_1.13