# Install any required packages that are not already present
packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
  "rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))          # clear the workspace
Sys.setlocale("LC_ALL","C")    # C locale for consistent string handling
## [1] "C"
options(digits=5, scipen=10)   # 5 significant digits; discourage scientific notation

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Data Exploration

D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)  # keep text columns as character
1.1 The longest abstract

【P1.1】How many characters are there in the longest abstract?

nchar(D$abstract) %>% max
## [1] 3708
1.2 No abstract

【P1.2】How many search results provided no abstract?

sum(nchar(D$abstract) == 0)
## [1] 112
1.3 Shortest title

【P1.3】What is the shortest title?

D$title[ which.min(nchar(D$title)) ]
## [1] "A decade of letrozole: FACE."


Problem 2 - Preparing the Corpus

Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.

2.1 Corpus & DTM
library(tm)
library(SnowballC)

# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT,  content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
corpT = tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
corpT = tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
corpT = tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
dtmT = DocumentTermMatrix(corpT); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT 
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity           : 81%
## Maximal term length: 15
## Weighting          : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))

【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?

  • 31

# Corpus & DTM for Abstract
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA,  content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA = tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA = tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA = tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
dtmA = DocumentTermMatrix(corpA); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity           : 99%
## Maximal term length: 67
## Weighting          : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA 
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity           : 85%
## Maximal term length: 15
## Weighting          : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))

【P2.1b】How many terms remain in dtmA after removing sparse terms?

  • 335
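
Both term counts can be read directly from the data frames as a sanity check:

ncol(dtmT)
## [1] 31
ncol(dtmA)
## [1] 335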
2.2 Abstracts are longer than titles

【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?

  • Abstracts tend to have many more words than titles, so far more distinct stems clear the 5% sparsity threshold.
2.3 The most frequent word in abstracts

【P2.3】What is the most frequent word stem across all the abstracts?

which.max(colSums(dtmA))  # returns the column position of the most frequent stem
## patient 
##      17
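
which.max returns the column position (17), not the count. To see the stem's actual frequency, sort the column sums instead (output not part of the original run):

sort(colSums(dtmA), decreasing = TRUE)[1]  # frequency of "patient" across all abstracts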


Problem 3 - Building a model

3.1 Prefix the column names
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))

【P3.1】What was the effect of these functions?

  • Adding the letter T in front of all the title variable names and the letter A in front of all the abstract variable names, so the two sets of names cannot collide when the data frames are combined (see the check below).
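
A quick check that the prefixes did their job, i.e. that no column names collide before the two data frames are combined (the expected result is 0; output not part of the original run):

sum(duplicated(c(colnames(dtmT), colnames(dtmA))))  # should be 0 after prefixing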
3.2 Combine DTMs & Add target variable
dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial

【P3.2】How many columns are in this combined data frame?

ncol(dtm)
## [1] 367
3.3 Splitting data
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)  # 70/30 split, stratified on the outcome
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)

【P3.3】What is the accuracy of the baseline model on the training set?

table(train$trial) %>% prop.table
## 
##       0       1 
## 0.56068 0.43932
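
The baseline predicts the most common outcome (trial = 0) for every paper, so its training accuracy is the larger of the two proportions:

max(prop.table(table(train$trial)))
## [1] 0.56068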
3.4 CART Model
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")

【P3.4】What is the name of the first variable the model split on?

prp(cart)
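
The tree plot is not reproduced here, but the root split can also be read off the fitted object directly (the first row of cart$frame describes the root node):

as.character(cart$frame$var[1])  # name of the first split variable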

3.5 The predicted probability

【P3.5】What is the maximum predicted probability for any result?

pred = predict(cart)[,2]  # training-set probabilities for class 1 (no newdata supplied)
max(pred)
## [1] 0.87189
3.6 Similarity between testing & training data

【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?

  • The maximum predicted probability will likely be exactly the same in the testing set, because a CART model assigns one shared probability to every observation in a leaf, and with far fewer leaves than observations the test set almost certainly populates the same leaves (verified below).
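
This expectation is easy to verify; the value should match the 0.87189 found on the training set:

max(predict(cart, newdata = test)[,2])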
3.7 Accuracy Metrics

Use a probability threshold of 0.5.

table(train$trial, pred > 0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441

【P3.7a】What is the training set accuracy of the CART model?

(631+441)/(631+441+131+99)
## [1] 0.82335

【P3.7b】What is the training set sensitivity of the CART model?

441/(131+441)
## [1] 0.77098

【P3.7c】What is the training set specificity of the CART model?

631/(631+99)
## [1] 0.86438
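
The three quantities can also be computed together from the confusion matrix; a small convenience sketch (not part of the original assignment):

cm = table(train$trial, pred > 0.5)
c(accuracy    = sum(diag(cm)) / sum(cm),
  sensitivity = cm["1","TRUE"]  / sum(cm["1",]),   # TP / (TP + FN)
  specificity = cm["0","FALSE"] / sum(cm["0",]))   # TN / (TN + FP)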


Problem 4 - Evaluating the model on the testing set

4.1 Test Accuracy

【P4.1】What is the test accuracy?

pred = predict(cart, test)[,2]  # probabilities on the held-out test set
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.75806
4.2 Test AUC

【P4.2】What is the test AUC?

colAUC(pred, test$trial)
##            [,1]
## 0 vs. 1 0.83711
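
Since ROCR is already loaded, the AUC can be cross-checked with it; the result should agree with colAUC:

rocr.pred = prediction(pred, test$trial)
performance(rocr.pred, "auc")@y.values[[1]]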


Problem 5 - Decision Tradeoffs

The research procedure is …

5.1 Cost of False Negative

【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?

  • A paper that should have been included in Set A will be missed, affecting the quality of the results of Step 3.
5.2 Cost of False Positive

【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?

  • A paper will be mistakenly added to Set A, yielding additional work in Step 2 of the process but not affecting the quality of the results of Step 3.
5.3 The Threshold

【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?

  • A false negative is more costly than a false positive; the decision maker should use a probability threshold less than 0.5 for the machine learning model (illustrated below).
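
Acting on this means lowering the cutoff when classifying. As an illustration only (0.3 is an arbitrary choice, not a value from the assignment):

table(test$trial, pred > 0.3)  # more papers flagged as trials: fewer false negatives, more false positives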