# Install any required packages that are not already present
packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
  "rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")
## [1] "C/C/C/C/C/zh_TW.UTF-8"
options(digits=5, scipen=10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Data Exploration

D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)
1.1 The longest abstract

【P1.1】How many characters are there in the longest abstract?

nchar(D$abstract) %>% max
## [1] 3708
1.2 No abstract

【P1.2】How many search results provided no abstract?

sum(nchar(D$abstract) == 0)
## [1] 112
1.3 Shortest title

【P1.3】What is the shortest title?

D$title[ which.min(nchar(D$title)) ]
## [1] "A decade of letrozole: FACE."


Problem 2 - Preparing the Corpus

Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.

2.1 Corpora & DTMs
library(tm)
library(SnowballC)

# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT,  content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
corpT = tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
corpT = tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
corpT = tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
dtmT = DocumentTermMatrix(corpT); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2831)>>
## Non-/sparse entries: 23415/5242245
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT 
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10684/46976
## Sparsity           : 81%
## Maximal term length: 15
## Weighting          : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))

【P2.1a】How many terms remain in dtmT after removing sparse terms (i.e., how many columns does it have)?

  • 31
# Corpus & DTM for Abstract
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA,  content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA = tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA = tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA = tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
dtmA = DocumentTermMatrix(corpA); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12209)>>
## Non-/sparse entries: 153164/22555576
## Sparsity           : 99%
## Maximal term length: 67
## Weighting          : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA 
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92016/531084
## Sparsity           : 85%
## Maximal term length: 15
## Weighting          : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))

【P2.1b】How many terms remain in dtmA after removing sparse terms?

  • 335
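
The title and abstract pipelines are identical, so the steps could be factored into a small helper. A minimal sketch (buildDTM is a hypothetical name; it uses the same tm calls as above):

# Hypothetical helper collecting the corpus-to-DTM steps used above
buildDTM = function(txt, sparsity = 0.95) {
  corp = Corpus(VectorSource(txt))
  corp = tm_map(corp, content_transformer(tolower))
  corp = tm_map(corp, removePunctuation)
  corp = tm_map(corp, removeWords, stopwords("english"))
  corp = tm_map(corp, stemDocument)
  dtm = removeSparseTerms(DocumentTermMatrix(corp), sparsity)
  as.data.frame(as.matrix(dtm))
}
# e.g. dtmT = buildDTM(D$title); dtmA = buildDTM(D$abstract)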
2.2 Abstracts are longer than titles

【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?

  • Abstracts tend to have many more words than titles
2.3 The most frequent word in abstracts

【P2.3】What is the most frequent word stem across all the abstracts?

which.max(colSums(dtmA))
## patient 
##      17
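
Sorting the column sums shows the runner-up stems as well; a quick sketch (output not shown):

head(sort(colSums(dtmA), decreasing = TRUE))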


Problem 3 - Building a model

3.1 Make column names unique
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))

【P3.1】What was the effect of these functions?

  • Adding the letter T in front of all the title variable names and adding the letter A in front of all the abstract variable names.
3.2 Combine DTMs & add the target variable
dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial

【P3.2】How many columns are in this combined data frame?

ncol(dtm)
## [1] 367
That is 31 title terms, 335 abstract terms, and the trial outcome column: 31 + 335 + 1 = 367.
3.3 Splitting data
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)
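
sample.split stratifies on the outcome, so both halves keep roughly the same proportion of trial results as the full dataset; a quick check (output not shown):

prop.table(table(test$trial))  # should be close to the training proportions below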

【P3.3】What is the accuracy of the baseline model on the training set?

table(train$trial) %>% prop.table
## 
##       0       1 
## 0.56068 0.43932
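
The baseline model always predicts the most common class, here 0, so its training accuracy is the larger of the two proportions:

max(table(train$trial) %>% prop.table)
## [1] 0.56068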
3.4 CART Model
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")

【P3.4】What is the name of the first variable the model split on?

prp(cart)
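
Besides reading the plot, the root split can be pulled directly off the fitted object; as a sketch, the first row of cart$frame is the root node:

as.character(cart$frame$var[1])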

3.5 The predicted probability

【P3.5】What is the maximum predicted probability for any result?

pred = predict(cart)[,2]
max(pred)
## [1] 0.87189
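
Because CART assigns one probability per leaf, pred takes only as many distinct values as the tree has leaves; a quick check (output not shown):

length(unique(pred))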
3.6 Similarity between testing & training data

【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?

  • The maximum predicted probability will likely be exactly the same in the testing set, because a CART model assigns one predicted probability per leaf; as long as some testing observations fall into the highest-probability leaf, the maximum is unchanged.
3.7 Accuracy Metrics

Use a threshold probability of 0.5.

table(train$trial, pred > 0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441

【P3.7a】What is the training set accuracy of the CART model?

(631+441)/(631+441+131+99)
## [1] 0.82335

【P3.7b】What is the training set sensitivity of the CART model?

441/(131+441)
## [1] 0.77098

【P3.7c】What is the training set specificity of the CART model?

631/(631+99)
## [1] 0.86438
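
The same three numbers can be computed programmatically from the confusion matrix above; a minimal sketch:

cm = table(train$trial, pred > 0.5)
c(accuracy    = sum(diag(cm)) / sum(cm),
  sensitivity = cm["1","TRUE"]  / sum(cm["1",]),   # TP / (TP + FN)
  specificity = cm["0","FALSE"] / sum(cm["0",]))   # TN / (TN + FP)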


Problem 4 - Evaluating the model on the testing set

4.1 Test Accuracy

【P4.1】What is the test accuracy?

pred = predict(cart,test)[,2]
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.75806
4.2 Test AUC

【P4.2】What is the test AUC?

colAUC(pred, test$trial)
##            [,1]
## 0 vs. 1 0.83711
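
The same AUC can also be computed with ROCR, which is already loaded; a sketch (the value should match colAUC above):

rocrPred = prediction(pred, test$trial)
performance(rocrPred, "auc")@y.values[[1]]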


Problem 5 - Decision Tradeoffs

The research procedure is …

5.1 Cost of False Negative

【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?

  • A paper that should have been included in Set A will be missed, affecting the quality of the results of Step 3.
5.2 Cost of False Positive

【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?

  • A paper will be mistakenly added to Set A, yielding additional work in Step 2 of the process but not affecting the quality of the results of Step 3.
5.3 The Threshold

【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?

  • A false negative is more costly than a false positive; the decision maker should use a probability threshold less than 0.5 for the machine learning model.
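
Acting on this means tabulating predictions at a lower cutoff; for illustration only (0.3 is an arbitrary choice, output not shown):

table(test$trial, pred > 0.3)  # fewer false negatives, at the cost of more false positives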