# Packages this report depends on; any that are not installed yet are
# installed before the analysis starts.
packages <- c(
  "dplyr", "ggplot2", "caTools", "tm", "SnowballC", "ROCR", "rpart",
  "rpart.plot", "randomForest"
)
# system.file() returns "" for a package that is not installed. The
# installed.packages() help page recommends this kind of lookup instead of
# scanning the whole library just to test for a few names.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- packages[!is_installed]
# Only call install.packages() when there is something to install.
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# NOTE(review): this wipes every object in the calling workspace, including
# the helper variables above. Kept to preserve the report's behaviour; consider
# removing it if this file is ever source()d from an interactive session.
rm(list = ls(all.names = TRUE))
# Use the "C" locale for all categories; the echoed line below is the value
# returned by Sys.setlocale().
Sys.setlocale("LC_ALL", "C")
## [1] "C"
# Print numbers with 5 significant digits and discourage scientific notation.
options(digits = 5, scipen = 10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Data Exploration

# Load the search results; text columns stay character vectors rather than
# factors. FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
D <- read.csv("data/clinical_trial.csv", stringsAsFactors = FALSE)
1.1 The longest abstract

【P1.1】How many characters are there in the longest abstract?

# Character count of every abstract, reduced to the largest one.
max(nchar(D$abstract))
## [1] 3708
1.2 No abstract

【P1.2】How many search results provided no abstract?

# Number of rows whose abstract has zero characters.
with(D, sum(nchar(abstract) == 0))
## [1] 112
1.3 Shortest title

【P1.3】What is the shortest title?

# which.min() gives the position of the first minimum title length; that
# position is then used to pull out the title itself.
local({
  title_lengths <- nchar(D$title)
  D$title[which.min(title_lengths)]
})
## [1] "A decade of letrozole: FACE."


Problem 2 - Preparing the Corpus

Because we have both title and abstract information for trials, we need to build two corpora(語料庫) instead of one. Name them corpT and corpA.

2.1 Corpus & DTM
library(tm)
library(SnowballC)

# Title corpus -> document-term matrix -> data frame.
# The cleaning steps are kept as separate statements so that the warnings
# echoed below still quote the call that produced them.
corpT <- Corpus(VectorSource(D$title))
# Step 1: lower-case every title.
corpT <- tm_map(corpT, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
# Step 2: strip punctuation.
corpT <- tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
# Step 3: remove English stop words.
corpT <- tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
# Step 4: reduce the remaining words to their stems.
corpT <- tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
# Build the full document-term matrix and print its summary.
dtmT <- DocumentTermMatrix(corpT)
dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
# Keep only terms whose sparsity does not exceed 0.95, then print again.
dtmT <- removeSparseTerms(dtmT, sparse = 0.95)
dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity           : 81%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Convert to a plain data frame (one column per remaining term).
dtmT <- dtmT %>%
  as.matrix() %>%
  as.data.frame()

【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?

  • 31
# Abstract corpus -> document-term matrix -> data frame.
# The cleaning steps are kept as separate statements so that the warnings
# echoed below still quote the call that produced them.
corpA <- Corpus(VectorSource(D$abstract))
# Step 1: lower-case every abstract.
corpA <- tm_map(corpA, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
# Step 2: strip punctuation.
corpA <- tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
# Step 3: remove English stop words.
corpA <- tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
# Step 4: reduce the remaining words to their stems.
corpA <- tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
# Build the full document-term matrix and print its summary.
dtmA <- DocumentTermMatrix(corpA)
dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity           : 99%
## Maximal term length: 67
## Weighting          : term frequency (tf)
# Keep only terms whose sparsity does not exceed 0.95, then print again.
dtmA <- removeSparseTerms(dtmA, sparse = 0.95)
dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity           : 85%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Convert to a plain data frame (one column per remaining term).
dtmA <- dtmA %>%
  as.matrix() %>%
  as.data.frame()

【P2.1b】How many terms remain in dtmA after removing sparse terms?

  • 335
2.2 Abstract is longer than title

【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?

  • Abstracts tend to have many more words than titles
2.3 The most frequent word in abstracts

【P2.3】What is the most frequent word stem across all the abstracts?

# Total each term's column, then find the largest total. which.max() returns
# the column position, labelled with the term name, so the number printed
# below is an index, not a count.
dtmA %>%
  colSums() %>%
  which.max()
## patient 
##      17


Problem 3 - Building a model

3.1 Make column names
# Tag every term with its origin: "T" for title terms, "A" for abstract terms.
names(dtmT) <- paste0("T", names(dtmT))
names(dtmA) <- paste0("A", names(dtmA))

【P3.1】What was the effect of these functions?

  • Adding the letter T in front of all the title variable names and adding the letter A in front of all the abstract variable names.
  • 如此一來,我們才可以快速辨識哪些詞彙是從摘要或標題而來
3.2 Combine DTMs & Add target variable
# Put the title and abstract term columns side by side, then append the
# outcome column taken from the original data.
dtm <- cbind(dtmT, dtmA)
dtm[["trial"]] <- D$trial

【P3.2】How many columns are in this combined data frame?

# Second element of dim() is the number of columns.
dim(dtm)[2]
## [1] 367
3.3 Splitting data
library(caTools)
# Fix the random seed so the split is reproducible.
set.seed(144)
# Logical vector: TRUE marks rows assigned to the training set (ratio 0.7).
spl <- sample.split(dtm$trial, SplitRatio = 0.7)
train <- subset(dtm, spl)
test <- subset(dtm, !spl)

【P3.3】What is the accuracy of the baseline model on the training set?

# Share of each outcome class in the training set; the larger share is the
# accuracy of always predicting the majority class.
prop.table(table(train$trial))
## 
##       0       1 
## 0.56068 0.43932
3.4 CART Model
library(rpart)
library(rpart.plot)
# Classification tree predicting `trial` from every other column of `train`.
cart <- rpart(trial ~ ., data = train, method = "class")

【P3.4】What is the name of the first variable the model split on?

# Draw the fitted tree.
rpart.plot::prp(cart)

3.5 The predicted probability

【P3.5】What is the maximum predicted probability for any result?

# Training-set class probabilities; the second column is kept as `pred`.
# "prob" is what predict() already returns by default for a classification
# tree, so naming it here only makes the choice explicit.
pred <- predict(cart, type = "prob")[, 2]
max(pred)
## [1] 0.87189
3.6 Similarity between testing & training data

【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?

  • The maximum predicted probability will likely be exactly the same in the testing set.
  • 因為分完的結果,TR跟TS裡面trial的機率分布差不多;而且CART的預測機率是訓練時各葉節點內的比例,測試資料只要落入同一個葉節點,就會得到相同的最大預測機率
3.7 Accuracy Matrices

Use a threshold probability of 0.5

# Training confusion matrix: rows are the actual `trial` values, columns say
# whether the predicted probability exceeds the 0.5 threshold.
table(train$trial, pred > 0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441

【P3.7a】What is the training set accuracy of the CART model?

# Training accuracy = correctly classified / all observations. Computed from
# the confusion matrix itself rather than from hand-copied cell counts, so it
# stays correct if the data, seed, or model change.
table(train$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.82335

【P3.7b】What is the training set sensitivity of the CART model?

# Training sensitivity = true positives / all actual positives (row "1").
# Computed from the confusion matrix rather than from hand-copied cell counts.
table(train$trial, pred > 0.5) %>% {.["1", "TRUE"] / sum(.["1", ])}
## [1] 0.77098

【P3.7c】What is the training set specificity of the CART model?

# Training specificity = true negatives / all actual negatives (row "0").
# Computed from the confusion matrix rather than from hand-copied cell counts.
table(train$trial, pred > 0.5) %>% {.["0", "FALSE"] / sum(.["0", ])}
## [1] 0.86438


Problem 4 - Evaluating the model on the testing set

4.1 Test Accuracy

【P4.1】What is the test accuracy?

# Test-set class probabilities; the second column is kept. This overwrites the
# training-set `pred` from earlier in the report.
pred <- predict(cart, newdata = test, type = "prob")[, 2]
# Accuracy = diagonal of the confusion matrix divided by its total.
table(test$trial, pred > 0.5) %>%
  (function(cm) sum(diag(cm)) / sum(cm))
## [1] 0.75806
4.2 Test AUC

【P4.2】What is the test AUC?

# Area under the ROC curve for the test-set probabilities against the actual
# `trial` labels.
colAUC(X = pred, y = test$trial)
##            [,1]
## 0 vs. 1 0.83711


Problem 5 - Decision Tradeoffs

The research procedure is …

5.1 Cost of False Negative

【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?

  • A paper that should have been included in Set A will be missed, affecting the quality of the results of Step 3
  • 把符合標準的論文漏掉(攔住),且事後無從察覺,因而降低最後分析結果的品質
5.2 Cost of False Positive

【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?

  • A paper will be mistakenly added to Set A, yielding additional work in Step 2 of the process but not affecting the quality of the results of Step 3.
  • 讓不符合標準的論文通過,加重Step 2無謂的二次檢查時間,時間成本UP,但不影響最後分析結果的品質
5.3 The Threshold

【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?

  • A false negative is more costly than a false positive; the decision maker should use a probability threshold less than 0.5 for the machine learning model.
  • 若為求快速產出(時間不允許的情況下),可以放寬FN;若是求高品質且精確的結果,則不管花多少時間(FP),也要嚴格防治FN的發生