# Install any required package that is missing from the local library.
packages <- c(
  "dplyr", "ggplot2", "caTools", "tm", "SnowballC",
  "ROCR", "rpart", "rpart.plot", "randomForest"
)
# Names of every package currently installed (the "Package" column).
existing <- unname(installed.packages()[, "Package"])
# Install the missing packages one at a time.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}
# Remove every object from the current workspace, including hidden ones
# whose names start with a dot. `all.names` is spelled out in full instead of
# relying on partial matching of the abbreviated `all`.
# NOTE(review): this also discards anything defined before this point
# (e.g. `packages`, `existing`); kept to preserve the original behaviour.
rm(list = ls(all.names = TRUE))
# Switch every locale category to "C".
# NOTE(review): presumably done to avoid encoding problems in the text-mining
# steps below -- confirm.
Sys.setlocale("LC_ALL", "C")
[1] "C"
# Print numbers with 5 significant digits and bias printing towards fixed
# rather than scientific notation (scipen = 10).
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Exploration

# Load the email data set, keeping character columns as plain strings.
# The columns `text` and `spam` are used by the steps that follow.
# `FALSE` is written out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
D <- read.csv("data/emails.csv", stringsAsFactors = FALSE)
1.1 How many emails are in the dataset?
nrow(D)
[1] 5728
1.2 How many of the emails are spam?
table(D$spam)

   0    1 
4360 1368 
1.3 Which word appears at the beginning of every email in the dataset?
substr(D$text[1:5], 1, 60)
[1] "Subject: naturally irresistible your corporate identity  lt "
[2] "Subject: the stock trading gunslinger  fanny is merrill but "
[3] "Subject: unbelievable new homes made easy  im wanting to sho"
[4] "Subject: 4 color printing special  request additional inform"
[5] "Subject: do not have money , get software cds from here !  s"
1.4 Words in every document

【P1.4】Could a spam classifier potentially benefit from including the frequency of the word that appears in every email?

  • Yes – the number of times the word appears might help us differentiate spam from ham
1.5 How many characters are in the longest email?
nchar(D$text) %>% max
[1] 43952
1.6 Which row contains the shortest email in the dataset?
nchar(D$text) %>% which.min
[1] 1992


Problem 2 - Preparing the Corpus

2.1 Corpus and DTM
# Text pre-processing pipeline: build a corpus from the email texts, clean it
# step by step, then turn it into a document-term matrix.
# NOTE(review): the "transformation drops documents" lines are warnings
# captured in the rendered output; the matrix printed afterwards still reports
# 5728 documents, the same as nrow(D).

# One document per element of D$text.
corp = Corpus(VectorSource(D$text))
# Lower-case the text; tolower is wrapped in content_transformer() before
# being handed to tm_map.
corp = tm_map(corp,  content_transformer(tolower))
transformation drops documents
# Strip punctuation.
corp = tm_map(corp, removePunctuation)
transformation drops documents
# Remove the words in tm's "english" stop-word list.
corp = tm_map(corp, removeWords, stopwords("english"))
transformation drops documents
# Reduce words to their stems.
corp = tm_map(corp, stemDocument)
transformation drops documents
# Term counts per document (documents x terms, term-frequency weighting).
dtm = DocumentTermMatrix(corp)

【P2.1】How many terms are in dtm?

dtm
<<DocumentTermMatrix (documents: 5728, terms: 28687)>>
Non-/sparse entries: 481719/163837417
Sparsity           : 100%
Maximal term length: 24
Weighting          : term frequency (tf)
2.2 Remove less frequent words

Limit dtm to contain only terms appearing in at least 5% of the documents

spdtm = removeSparseTerms(dtm, 0.95)

【P2.2】How many terms are in spdtm?

spdtm 
<<DocumentTermMatrix (documents: 5728, terms: 330)>>
Non-/sparse entries: 213551/1676689
Sparsity           : 89%
Maximal term length: 10
Weighting          : term frequency (tf)
2.3 Build data frame

Build a data frame ems from spdtm

ems = as.data.frame(as.matrix(spdtm))

【P2.3】What is the most frequent word in spdtm?

colSums(ems) %>% sort %>% tail
    hou    will    vinc subject     ect   enron 
   5577    8252    8532   10202   11427   13388 
2.4 Most frequent words in HAM emails

Incorporate target variable spam

ems$spam = D$spam

【P2.4】How many word stems appear at least 5000 times in the ham emails in the dataset?

subset(ems, spam==0) %>% colSums %>% sort %>% tail(10)
     com    pleas kaminski     2000      hou     will     vinc  subject 
    4444     4494     4801     4935     5569     6802     8531     8625 
     ect    enron 
   11417    13388 
2.5 Most frequent words in SPAM emails

【P2.5】How many word stems appear at least 1000 times in the spam emails in the dataset?

# Total count of each word stem across the spam emails, keeping the stems that
# occur at least 1000 times (hence `>=`, not `>`).
# The label column `spam` is dropped before summing: it is the target
# variable, not a word stem, and its column sum (the number of spam emails)
# would otherwise be listed among the results.
subset(ems, spam == 1, select = -spam) %>% colSums %>% {.[. >= 1000]}
compani subject    will 
   1065    1577    1450 
2.6 Observation 1

【P2.6】The lists of most common words are significantly different between the spam and ham emails. What does this likely imply?

  • The frequencies of these most common words are likely to help differentiate between spam and ham
2.7 Observation 2

【P2.7】Several of the most common word stems from the ham documents, such as “enron”, “hou” (short for Houston), “vinc” (the word stem of “Vince”) and “kaminski”, are likely specific to Vincent Kaminski’s inbox. What does this mean about the applicability of the text analytics models we will train for the spam filtering problem?

  • The models we build are personalized, and would need to be further tested before being used as a spam filter for another person


Problem 3 - Building machine learning models

Split the data and build GLM, CART and random forest models …

# Make the label categorical and turn every column name into a syntactically
# valid R name (some stems, such as "2000", are not valid as they stand).
ems$spam <- factor(ems$spam)
names(ems) <- make.names(names(ems))

# Reproducible 70/30 split on the label.
set.seed(123)
spl <- sample.split(ems$spam, SplitRatio = 0.7)
train <- ems[spl, ]
test <- ems[!spl, ]

# Class proportions in the test set (0 = ham, 1 = spam).
prop.table(table(test$spam))  # 0.76135

      0       1 
0.76135 0.23865 
# Logistic regression of the label on every other column of the training set.
m.glm <- glm(spam ~ ., data = train, family = "binomial")
glm.fit: algorithm did not converge
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Classification tree on all predictors of the training set.
m.cart <- rpart(spam ~ ., data = train, method = "class")
# Random forest on the same data; the seed is fixed immediately before the
# fit so the result is repeatable.
set.seed(123)
m.rf <- randomForest(spam ~ ., data = train)
3.1 Prediction of Logistic Model

【P3.1a】 How many of the training set predicted probabilities from spamLog are less than 0.00001?

【P3.1b】 How many of the training set predicted probabilities from spamLog are more than 0.99999?

【P3.1c】 How many of the training set predicted probabilities from spamLog are between 0.00001 and 0.99999?

3.2 Significant predictors in the GLM model

【P3.2】How many variables are labeled as significant (at the p=0.05 level) in the logistic regression summary output?

3.3 Words in the Decision Tree

【P3.3】How many of the word stems “enron”, “hou”, “vinc”, and “kaminski” appear in the CART tree?

Recall that we suspect these word stems are specific to Vincent Kaminski and might affect the generalizability of a spam filter built with his ham data.

3.4 What is the training accuracy of the GLM model?
3.5 What is the training AUC of the GLM model?
3.6 What is the training accuracy of the CART model?
3.7 What is the training AUC of the CART model?
3.8 What is the training accuracy of the RF model?
3.9 What is the training AUC of the RF model?
3.10 Which model had the best training set performance, in terms of accuracy & AUC?


Problem 4 - Evaluating on the Test Set

Obtain predicted probabilities for the testing set for each of the models,

4.1 ~ 4.6 ACC/AUC of the GLM/CART/RF models

see the table above

4.7 Which model had the best testing set performance, in terms of accuracy and AUC?
  • Random Forest
4.8 Which model demonstrated the greatest degree of overfitting?
  • Logistic Regression






LS0tDQp0aXRsZTogIkFTMTAtM++8mlNQQU0gb3IgSEFNIg0KYXV0aG9yOiAi5p2O5Yqt56uRIE0wNjQwMjAwMjMiDQpkYXRlOiAiYHIgU3lzLnRpbWUoKWAiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQo8YnI+PGhyPg0KDQpgYGB7cn0NCnBhY2thZ2VzID0gYygNCiAgImRwbHlyIiwiZ2dwbG90MiIsImNhVG9vbHMiLCJ0bSIsIlNub3diYWxsQyIsIlJPQ1IiLCJycGFydCIsDQogICJycGFydC5wbG90IiwicmFuZG9tRm9yZXN0IikNCmV4aXN0aW5nID0gYXMuY2hhcmFjdGVyKGluc3RhbGxlZC5wYWNrYWdlcygpWywxXSkNCmZvcihwa2cgaW4gcGFja2FnZXNbIShwYWNrYWdlcyAlaW4lIGV4aXN0aW5nKV0pIGluc3RhbGwucGFja2FnZXMocGtnKQ0KYGBgDQoNCmBgYHtyIHdhcm5pbmc9RiwgbWVzc2FnZT1GLCBjYWNoZT1GLCBlcnJvcj1GfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpTeXMuc2V0bG9jYWxlKCJMQ19BTEwiLCJDIikNCm9wdGlvbnMoZGlnaXRzPTUsIHNjaXBlbj0xMCkNCg0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkodG0pDQpsaWJyYXJ5KFNub3diYWxsQykNCmxpYnJhcnkoUk9DUikNCmxpYnJhcnkoY2FUb29scykNCmxpYnJhcnkocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCmBgYA0KPGJyPg0KDQojIyMgUHJvYmxlbSAxIC0gRXhwbG9yYXRpb24NCmBgYHtyfQ0KRCA9IHJlYWQuY3N2KCJkYXRhL2VtYWlscy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRikNCmBgYA0KDQojIyMjIyAxLjEgSG93IG1hbnkgZW1haWxzIGFyZSBpbiB0aGUgZGF0YXNldD8NCmBgYHtyfQ0KbnJvdyhEKQ0KYGBgDQoNCiMjIyMjIDEuMiBIb3cgbWFueSBvZiB0aGUgZW1haWxzIGFyZSBzcGFtPw0KYGBge3J9DQp0YWJsZShEJHNwYW0pDQpgYGANCg0KIyMjIyMgMS4zIFdoaWNoIHdvcmQgYXBwZWFycyBhdCB0aGUgYmVnaW5uaW5nIG9mIGV2ZXJ5IGVtYWlsIGluIHRoZSBkYXRhc2V0Pw0KYGBge3J9DQpzdWJzdHIoRCR0ZXh0WzE6NV0sIDEsIDYwKQ0KYGBgDQoNCiMjIyMjIDEuNCBXb3JkcyBpbiBldmVyeSBkb2N1bWVudA0K44CQUDEuNOOAkV9Db3VsZCBhIHNwYW0gY2xhc3NpZmllciBwb3RlbnRpYWxseSBiZW5lZml0IGZyb20gaW5jbHVkaW5nIHRoZSBmcmVxdWVuY3kgb2YgdGhlIHdvcmQgdGhhdCBhcHBlYXJzIGluIGV2ZXJ5IGVtYWlsP18NCg0KKyBZZXMgLS0gdGhlIG51bWJlciBvZiB0aW1lcyB0aGUgd29yZCBhcHBlYXJzIG1pZ2h0IGhlbHAgdXMgZGlmZmVyZW50aWF0ZSBzcGFtIGZyb20gaGFtDQorDQoNCiMjIyMjIDEuNSBIb3cgbWFueSBjaGFyYWN0ZXJzIGFyZSBpbiB0aGUgbG9uZ2VzdCBlbWFpbD8NCmBgYHtyfQ0KbmNoYXIoRCR0ZXh0KSAlPiUgbWF4DQpgYGANCg0KIyMjIyMgMS42IFdoaWNoIHJvdyBjb250YWlucyB0aGUgc2hvcnRlc3QgZW1haWwgaW4gdGhlIGRhdGFzZXQ/DQpgYGB7cn0NCm5jaGFyKEQkdGV4dCkgJT4lIHdoaWNoLm1pbg0KYGBgDQo8YnI+PGhy
Pg0KDQojIyMgUHJvYmxlbSAyIC0gUHJlcGFyaW5nIHRoZSBDb3JwdXMNCg0KIyMjIyMgMi4xIENvcnB1cyBhbmQgRFRNDQpgYGB7cn0NCmNvcnAgPSBDb3JwdXMoVmVjdG9yU291cmNlKEQkdGV4dCkpDQpjb3JwID0gdG1fbWFwKGNvcnAsICBjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKQ0KY29ycCA9IHRtX21hcChjb3JwLCByZW1vdmVQdW5jdHVhdGlvbikNCmNvcnAgPSB0bV9tYXAoY29ycCwgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygiZW5nbGlzaCIpKQ0KY29ycCA9IHRtX21hcChjb3JwLCBzdGVtRG9jdW1lbnQpDQpkdG0gPSBEb2N1bWVudFRlcm1NYXRyaXgoY29ycCkNCmBgYA0K44CQUDIuMeOAkV9Ib3cgbWFueSB0ZXJtcyBhcmUgaW4gZHRtP18gDQpgYGB7cn0NCmR0bQ0KYGBgDQoNCiMjIyMjIDIuMiBSZW1vdmUgbGVzcyBmcmVxdWVudCB3b3Jkcw0KTGltaXQgYGR0bWAgdG8gY29udGFpbiB0ZXJtcyBhcHBlYXJpbmcgaW4gYXQgbGVhc3QgNSUgDQpgYGB7cn0NCnNwZHRtID0gcmVtb3ZlU3BhcnNlVGVybXMoZHRtLCAwLjk1KQ0KYGBgDQoNCuOAkFAyLjLjgJFfSG93IG1hbnkgdGVybXMgYXJlIGluIGBzcGR0bWA/XyANCmBgYHtyfQ0Kc3BkdG0gDQpgYGANCg0KIyMjIyMgMi4zIEJ1aWxkIGRhdGEgZnJhbWUNCkJ1aWxkIGEgZGF0YSBmcmFtZSBgZW1zYCBmcm9tIGBzcGR0bWANCmBgYHtyfQ0KZW1zID0gYXMuZGF0YS5mcmFtZShhcy5tYXRyaXgoc3BkdG0pKQ0KYGBgDQoNCuOAkFAyLjPjgJFfV2hhdCBpcyB0aGUgbW9zdCBmcmVxdWVudCB3b3JkIGluIGBzcGR0bWA/XyANCmBgYHtyfQ0KY29sU3VtcyhlbXMpICU+JSBzb3J0ICU+JSB0YWlsDQpgYGANCg0KIyMjIyMgMi40IE1vc3QgZnJlcXVlbnQgd29yZHMgaW4gSEFNIGVtYWxpcw0KSW5jb3Jwb3JhdGUgdGFyZ2V0IHZhcmlhYmxlIGBzcGFtYA0KYGBge3J9DQplbXMkc3BhbSA9IEQkc3BhbQ0KYGBgDQoNCuOAkFAyLjTjgJFfSG93IG1hbnkgd29yZCBzdGVtcyBhcHBlYXIgYXQgbGVhc3QgNTAwMCB0aW1lcyBpbiB0aGUgaGFtIGVtYWlscyBpbiB0aGUgZGF0YXNldD9fDQpgYGB7cn0NCnN1YnNldChlbXMsIHNwYW09PTApICU+JSBjb2xTdW1zICU+JSBzb3J0ICU+JSB0YWlsKDEwKQ0KYGBgDQoNCiMjIyMjIDIuNSBNb3N0IGZyZXF1ZW50IHdvcmRzIGluIFNQQU0gZW1hbGlzDQrjgJBQMi4144CRX0hvdyBtYW55IHdvcmQgc3RlbXMgYXBwZWFyIGF0IGxlYXN0IDEwMDAgdGltZXMgaW4gdGhlIHNwYW0gZW1haWxzIGluIHRoZSBkYXRhc2V0P18NCmBgYHtyfQ0Kc3Vic2V0KGVtcywgc3BhbT09MSkgJT4lIGNvbFN1bXMgJT4lIHsuWy4gPiAxMDAwXX0NCmBgYA0KDQojIyMjIyAyLjYgT2JzZXJ2YXRpb24gMQ0K44CQUDIuNuOAkV9UaGUgbGlzdHMgb2YgbW9zdCBjb21tb24gd29yZHMgYXJlIHNpZ25pZmljYW50bHkgZGlmZmVyZW50IGJldHdlZW4gdGhlIHNwYW0gYW5kIGhhbSBlbWFpbHMuIFdoYXQgZG9lcyB0aGlzIGxpa2VseSBpbXBseT9fDQoNCisgVGhlIGZyZXF1ZW5j
aWVzIG9mIHRoZXNlIG1vc3QgY29tbW9uIHdvcmRzIGFyZSBsaWtlbHkgdG8gaGVscCBkaWZmZXJlbnRpYXRlIGJldHdlZW4gc3BhbSBhbmQgaGFtDQorDQoNCiMjIyMjIDIuNyBPYnNlcnZhdGlvbiAyDQrjgJBQMi4344CRX1NldmVyYWwgb2YgdGhlIG1vc3QgY29tbW9uIHdvcmQgc3RlbXMgZnJvbSB0aGUgaGFtIGRvY3VtZW50cywgc3VjaCBhcyAiZW5yb24iLCAiaG91IiAoc2hvcnQgZm9yIEhvdXN0b24pLCAidmluYyIgKHRoZSB3b3JkIHN0ZW0gb2YgIlZpbmNlIikgYW5kICJrYW1pbnNraSIsIGFyZSBsaWtlbHkgc3BlY2lmaWMgdG8gVmluY2VudCBLYW1pbnNraSdzIGluYm94LiBXaGF0IGRvZXMgdGhpcyBtZWFuIGFib3V0IHRoZSBhcHBsaWNhYmlsaXR5IG9mIHRoZSB0ZXh0IGFuYWx5dGljcyBtb2RlbHMgd2Ugd2lsbCB0cmFpbiBmb3IgdGhlIHNwYW0gZmlsdGVyaW5nIHByb2JsZW0/Xw0KDQorIFRoZSBtb2RlbHMgd2UgYnVpbGQgYXJlIHBlcnNvbmFsaXplZCwgYW5kIHdvdWxkIG5lZWQgdG8gYmUgZnVydGhlciB0ZXN0ZWQgYmVmb3JlIGJlaW5nIHVzZWQgYXMgYSBzcGFtIGZpbHRlciBmb3IgYW5vdGhlciBwZXJzb24NCisNCg0KPGJyPjxocj4NCg0KIyMjIFByb2JsZW0gMyAtIEJ1aWxkaW5nIG1hY2hpbmUgbGVhcm5pbmcgbW9kZWxzDQoNClNwbGl0IHRoZSBkYXRhIGFuZCBidWlsZCBHTE0sIENBUlQgYW5kIHJhbmRvbSBmb3Jlc3QgbW9kZWxzIC4uLg0KYGBge3J9DQplbXMkc3BhbSA9IGZhY3RvcihlbXMkc3BhbSkNCm5hbWVzKGVtcykgPSBtYWtlLm5hbWVzKG5hbWVzKGVtcykpDQoNCnNldC5zZWVkKDEyMyk7IHNwbCA9IHNhbXBsZS5zcGxpdChlbXMkc3BhbSwgMC43KQ0KdHJhaW4gPSBzdWJzZXQoZW1zLCBzcGwgPT0gVFJVRSkNCnRlc3QgPSBzdWJzZXQoZW1zLCBzcGwgPT0gRkFMU0UpDQp0YWJsZSh0ZXN0JHNwYW0pICU+JSBwcm9wLnRhYmxlICAjIDAuNzYxMzUNCg0KbS5nbG0gPSBnbG0oc3BhbSB+IC4sIHRyYWluLCBmYW1pbHkgPSAnYmlub21pYWwnKSANCm0uY2FydCA9IHJwYXJ0KHNwYW0gfiAuLCB0cmFpbiwgbWV0aG9kPSJjbGFzcyIpDQpzZXQuc2VlZCgxMjMpOyBtLnJmID0gcmFuZG9tRm9yZXN0KHNwYW0gfiAuLCB0cmFpbikNCmBgYA0KDQojIyMjIyAzLjEgUHJlZGljdGlvbiBvZiBMb2dpc3RpYyBNb2RlbA0KYGBge3J9DQpwLmdsbSA9IHByZWRpY3QobS5nbG0sdHlwZT0ncmVzcG9uc2UnKSANCmBgYA0KDQrjgJBQMy4xYeOAkSBfSG93IG1hbnkgb2YgdGhlIHRyYWluaW5nIHNldCBwcmVkaWN0ZWQgcHJvYmFiaWxpdGllcyBmcm9tIHNwYW1Mb2cgYXJlIGxlc3MgdGhhbiAwLjAwMDAxP18NCmBgYHtyfQ0Kc3VtKHAuZ2xtIDwgMC4wMDAwMSkNCmBgYA0KDQrjgJBQMy4xYuOAkSBfSG93IG1hbnkgb2YgdGhlIHRyYWluaW5nIHNldCBwcmVkaWN0ZWQgcHJvYmFiaWxpdGllcyBmcm9tIHNwYW1Mb2cgYXJlIG1vcmUgdGhhbiAwLjk5OTk5P18NCmBgYHtyfQ0Kc3VtKHAuZ2xtID4gMC45OTk5OSkNCmBgYA0KDQrj
gJBQMy4xY+OAkSBfSG93IG1hbnkgb2YgdGhlIHRyYWluaW5nIHNldCBwcmVkaWN0ZWQgcHJvYmFiaWxpdGllcyBmcm9tIHNwYW1Mb2cgYXJlIGJldHdlZW4gMC4wMDAwMSBhbmQgMC45OTk5OT9fDQpgYGB7cn0NCnN1bShwLmdsbSA+PSAwLjAwMDAxICYgcC5nbG0gPD0gMC45OTk5OSkNCmBgYA0KDQojIyMjIyAzLjIgU2lnbmlmaWNhbnQgcHJlZGljdG9ycyBpbiB0aGUgR0xNIG1vZGVsDQrjgJBQMy4y44CRX0hvdyBtYW55IHZhcmlhYmxlcyBhcmUgbGFiZWxlZCBhcyBzaWduaWZpY2FudCAoYXQgdGhlIHA9MC4wNSBsZXZlbCkgaW4gdGhlIGxvZ2lzdGljIHJlZ3Jlc3Npb24gc3VtbWFyeSBvdXRwdXQ/Xw0KYGBge3J9DQpzdW1tYXJ5KG0uZ2xtKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtKCBzdW1tYXJ5KG0uZ2xtKSRjb2VmWyw0XSAgPCAwLjA1ICkNCmBgYA0KDQojIyMjIyAzLjMgV29yZHMgaW4gdGhlIERlY2lzaW9uIFRyZWUgDQrjgJBQMy4z44CRX0hvdyBtYW55IG9mIHRoZSB3b3JkIHN0ZW1zICJlbnJvbiIsICJob3UiLCAidmluYyIsIGFuZCAia2FtaW5za2kiIGFwcGVhciBpbiB0aGUgQ0FSVCB0cmVlP18NCmBgYHtyfQ0KcHJwKG0uY2FydCkNCmBgYA0KUmVjYWxsIHRoYXQgd2Ugc3VzcGVjdCB0aGVzZSB3b3JkIHN0ZW1zIGFyZSBzcGVjaWZpYyB0byBWaW5jZW50IEthbWluc2tpIGFuZCBtaWdodCBhZmZlY3QgdGhlIGdlbmVyYWxpemFiaWxpdHkgb2YgYSBzcGFtIGZpbHRlciBidWlsdCB3aXRoIGhpcyBoYW0gZGF0YS4NCg0KDQojIyMjIyAzLjQgV2hhdCBpcyB0aGUgdHJhaW5pbmcgYWNjdXJhY3kgb2YgdGhlIEdMTSBtb2RlbD8NCmBgYHtyfQ0KdGFibGUodHJhaW4kc3BhbSwgcC5nbG0gPiAwLjUpICU+JSB7c3VtKGRpYWcoLikpIC8gc3VtKC4pfQ0KYGBgDQoNCiMjIyMjIDMuNSBXaGF0IGlzIHRoZSB0cmFpbmluZyBBVUMgb2YgdGhlIEdMTSBtb2RlbD8NCmBgYHtyfQ0KY29sQVVDKHAuZ2xtLCB0cmFpbiRzcGFtKQ0KYGBgDQoNCiMjIyMjIDMuNiBXaGF0IGlzIHRoZSB0cmFpbmluZyBhY2N1cmFjeSBvZiB0aGUgQ0FSVCBtb2RlbD8NCmBgYHtyfQ0KcC5jYXJ0ID0gcHJlZGljdChtLmNhcnQpWywyXQ0KdGFibGUodHJhaW4kc3BhbSwgcC5jYXJ0ID4gMC41KSAlPiUge3N1bShkaWFnKC4pKSAvIHN1bSguKX0NCmBgYA0KDQojIyMjIyAzLjcgV2hhdCBpcyB0aGUgdHJhaW5pbmcgYWNjdXJhY3kgb2YgdGhlIENBUlQgbW9kZWw/DQpgYGB7cn0NCmNvbEFVQyhwLmNhcnQsIHRyYWluJHNwYW0pDQpgYGANCg0KIyMjIyMgMy44IFdoYXQgaXMgdGhlIHRyYWluaW5nIGFjY3VyYWN5IG9mIHRoZSBSRiBtb2RlbD8NCmBgYHtyfQ0KcC5yZiA9IHByZWRpY3QobS5yZix0eXBlPSdwcm9iJylbLDJdDQp0YWJsZSh0cmFpbiRzcGFtLCBwLnJmID4gMC41KSAlPiUge3N1bShkaWFnKC4pKSAvIHN1bSguKX0NCmBgYA0KDQojIyMjIyAzLjkgV2hhdCBpcyB0aGUgdHJhaW5pbmcgYWNjdXJhY3kgb2YgdGhlIFJGIG1vZGVsPw0KYGBge3J9DQpjb2xBVUMo
cC5yZiwgdHJhaW4kc3BhbSkNCmBgYA0KDQojIyMjIyAzLjEwIFdoaWNoIG1vZGVsIGhhZCB0aGUgYmVzdCB0cmFpbmluZyBzZXQgcGVyZm9ybWFuY2UsIGluIHRlcm1zIG9mIGFjY3VyYWN5ICYgQVVDPw0KYGBge3J9DQpwcmVkID0gZGF0YS5mcmFtZShnbG09cC5nbG0sIGNhcnQ9cC5jYXJ0LCByZj1wLnJmKQ0KcmJpbmQoDQogIEFDQz0gYXBwbHkocHJlZCwgMiwgZnVuY3Rpb24oeCkgew0KICAgIHRhYmxlKHRyYWluJHNwYW0sIHggPiAwLjUpICU+JSB7c3VtKGRpYWcoLikpIC8gc3VtKC4pfSB9ICksDQogIGNvbEFVQyhwcmVkLCB0cmFpbiRzcGFtKQ0KICApICU+JSB0IA0KYGBgDQo8YnI+PGhyPg0KDQojIyMgUHJvYmxlbSA0IC0gRXZhbHVhdGluZyBvbiB0aGUgVGVzdCBTZXQNCg0KT2J0YWluIHByZWRpY3RlZCBwcm9iYWJpbGl0aWVzIGZvciB0aGUgdGVzdGluZyBzZXQgZm9yIGVhY2ggb2YgdGhlIG1vZGVscywgDQpgYGB7cn0NCnByZWQyID0gZGF0YS5mcmFtZSgNCiAgZ2xtID0gcHJlZGljdChtLmdsbSwgdGVzdCwgdHlwZT0ncmVzcG9uc2UnKSwNCiAgY2FydCA9IHByZWRpY3QobS5jYXJ0LCB0ZXN0KVssMl0sDQogIHJmID0gcHJlZGljdChtLnJmLCB0ZXN0LCB0eXBlPSdwcm9iJylbLDJdICkNCnJiaW5kKA0KICBBQ0MgPSBhcHBseShwcmVkMiwgMiwgZnVuY3Rpb24oeCkgew0KICAgIHRhYmxlKHRlc3Qkc3BhbSwgeCA+IDAuNSkgJT4lIHtzdW0oZGlhZyguKSkgLyBzdW0oLil9IH0gKSwNCiAgQVVDID0gY29sQVVDKHByZWQyLCB0ZXN0JHNwYW0pICApICU+JSB0DQpgYGANCg0KIyMjIyMgNC4xIH4gNC42IEFDQy9BVUMgb2YgdGhlIEdMTS9DQVJUL1JGIG1vZGVscw0Kc2VlIHRoZSB0YWJsZSBhYm92ZQ0KDQojIyMjIyA0LjcgV2hpY2ggbW9kZWwgaGFkIHRoZSBiZXN0IHRlc3Rpbmcgc2V0IHBlcmZvcm1hbmNlLCBpbiB0ZXJtcyBvZiBhY2N1cmFjeSBhbmQgQVVDPw0KDQorIFJhbmRvbSBGb3Jlc3QNCisNCg0KIyMjIyMgNC43IFdoaWNoIG1vZGVsIGRlbW9uc3RyYXRlZCB0aGUgZ3JlYXRlc3QgZGVncmVlIG9mIG92ZXJmaXR0aW5nPz8NCg0KKyBMb2dpc3RpYyBSZWdyZXNzaW9uDQorIA0KDQo8YnI+PGhyPjxicj48YnI+PGJyPjxicj4NCg0KDQoNCg0KDQo=