# Packages this analysis depends on; any that are missing get installed.
packages <- c(
  "dplyr", "ggplot2", "caTools", "tm", "SnowballC", "ROCR", "rpart",
  "rpart.plot", "randomForest"
)
# system.file() returns "" for a package that is not installed. It is the
# lookup recommended by ?installed.packages, which notes that scanning every
# library with installed.packages() is slow and not meant for this check.
is_missing <- vapply(
  packages,
  function(pkg) !nzchar(system.file(package = pkg)),
  logical(1)
)
if (any(is_missing)) {
  install.packages(packages[is_missing])
}
# Remove only the helpers created above. The previous
# rm(list = ls(all = TRUE)) also wiped every other object in the workspace.
rm(packages, is_missing)
# Switch every locale category to "C".
# NOTE(review): presumably done so the text processing below behaves the same
# on every machine -- confirm.
Sys.setlocale("LC_ALL", "C")
## [1] "C"
# Print 5 significant digits and prefer fixed over scientific notation.
options(digits = 5, scipen = 10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Data Exploration

# Load the search results, keeping the text columns as character vectors.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
D <- read.csv("data/clinical_trial.csv", stringsAsFactors = FALSE)
# Per-column summary, then the structure of the data frame.
summary(D)
##     title             abstract             trial      
##  Length:1860        Length:1860        Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :0.000  
##                                        Mean   :0.439  
##                                        3rd Qu.:1.000  
##                                        Max.   :1.000
str(D)
## 'data.frame':    1860 obs. of  3 variables:
##  $ title   : chr  "Treatment of Hodgkin's disease and other cancers with 1,3-bis(2-chloroethyl)-1-nitrosourea (BCNU; NSC-409962)." "Cell mediated immune status in malignancy--pretherapy and post-therapy assessment." "Neoadjuvant vinorelbine-capecitabine versus docetaxel-doxorubicin-cyclophosphamide in early nonresponsive breas"| __truncated__ "Randomized phase 3 trial of fluorouracil, epirubicin, and cyclophosphamide alone or followed by Paclitaxel for "| __truncated__ ...
##  $ abstract: chr  "" "Twenty-eight cases of malignancies of different kinds were studied to assess T-cell activity and population bef"| __truncated__ "BACKGROUND: Among breast cancer patients, nonresponse to initial neoadjuvant chemotherapy is associated with un"| __truncated__ "BACKGROUND: Taxanes are among the most active drugs for the treatment of metastatic breast cancer, and, as a co"| __truncated__ ...
##  $ trial   : int  1 0 1 1 1 0 1 0 0 0 ...
1.1 The longest abstract

【P1.1】How many characters are there in the longest abstract?

# Pipe form: character count of every abstract, then the largest count.
D$abstract %>%
  nchar() %>%
  max()
## [1] 3708
#---------------------------------
# The same computation written as a nested call.
max(nchar(D[["abstract"]]))
## [1] 3708
1.2 No abstract

【P1.2】How many search results provided no abstract?

# Count the records whose abstract is the empty string.
sum(D$abstract == "")
## [1] 112
1.3 Shortest title

【P1.3】What is the shortest title?

# Title with the fewest characters; which.min() keeps the first one if several
# titles tie. (The original note gives its row as 1258.)
with(D, title[which.min(nchar(title))])
## [1] "A decade of letrozole: FACE."


Problem 2 - Preparing the Corpus

Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.

2.1 Corpus & DTM
library(tm)
library(SnowballC)

# Build the title corpus and its document-term matrix.
corpT <- Corpus(VectorSource(D$title))
# Lower-case every title.
corpT <- tm_map(corpT, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
# Strip punctuation.
corpT <- tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
# Remove English stop words.
corpT <- tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
# Reduce each word to its stem.
corpT <- tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
# Convert the corpus to a document-term matrix and print its summary.
dtmT <- DocumentTermMatrix(corpT)
dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity           : 100%
## Maximal term length: 49
## Weighting          : term frequency (tf)
# Drop the sparse terms, i.e. those absent from more than 95% of the titles.
dtmT <- removeSparseTerms(dtmT, 0.95)
dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity           : 81%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Plain data frame: one row per document, one column per remaining stem.
dtmT <- as.data.frame(as.matrix(dtmT))

【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?

  • 31
# Build the abstract corpus with the same cleaning steps used for titles:
# lower-case, strip punctuation, remove English stop words, stem.
corpA <- Corpus(VectorSource(D$abstract))
corpA <- tm_map(corpA, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA <- tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA <- tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA <- tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
# Document-term matrix of the cleaned abstracts.
dtmA <- DocumentTermMatrix(corpA)
dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity           : 99%
## Maximal term length: 67
## Weighting          : term frequency (tf)
# Drop the sparse terms, i.e. those absent from more than 95% of the abstracts.
dtmA <- removeSparseTerms(dtmA, 0.95)
dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity           : 85%
## Maximal term length: 15
## Weighting          : term frequency (tf)
# Plain data frame: one row per document, one column per remaining stem.
dtmA <- as.data.frame(as.matrix(dtmA))

【P2.1b】How many terms remain in dtmA after removing sparse terms?

  • 335
2.2 Abstract is longer than title

【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?

  • Abstracts tend to have many more words than titles
  • 摘要字數比標題多,因此會影響到刪除出現次數低的字詞、標點符號等,而影響到處理結果。
2.3 The most frequent word in abstracts

【P2.3】What is the most frequent word stem across all the abstracts?

# Total each stem's count over all abstracts and report the largest; the
# printed name is the stem and the number is its column position.
dtmA %>%
  colSums() %>%
  which.max()
## patient 
##      17


Problem 3 - Building a model

3.1 Make column names
# Prefix title stems with "T" and abstract stems with "A" so the same stem
# from both sources stays distinguishable once the frames are combined.
names(dtmT) <- paste0("T", names(dtmT))
names(dtmA) <- paste0("A", names(dtmA))

【P3.1】What was the effect of these functions?

  • Adding the letter T in front of all the title variable names and adding the letter A in front of all the abstract variable names.
  • 在標題名稱前增加T,摘要名稱前增加A
3.2 Combine DTMs & Add target variable
# Put title and abstract stems side by side, then append the outcome column.
dtm <- cbind(dtmT, dtmA)
dtm[["trial"]] <- D[["trial"]]

【P3.2】How many columns are in this combined data frame?

# Number of columns in the combined data frame.
dim(dtm)[2]
## [1] 367
3.3 Splitting data
library(caTools)
# Fix the seed, then draw a 70/30 split on the outcome column.
set.seed(144)
spl <- sample.split(dtm$trial, SplitRatio = 0.7)
# Rows flagged TRUE form the training set, the remaining rows the test set.
train <- dtm[spl, ]
test <- dtm[!spl, ]

【P3.3】What is the accuracy of the baseline model on the training set?

# Class shares in the training set; always predicting the majority class (0)
# gives the baseline accuracy of 0.56068.
prop.table(table(train$trial))
## 
##       0       1 
## 0.56068 0.43932
3.4 CART Model
library(rpart)
library(rpart.plot)
# Classification tree predicting trial from every other column of train.
cart <- rpart(trial ~ ., data = train, method = "class")

【P3.4】What is the name of the first variable the model split on?

# Plot the fitted tree; the original note records the first split as Tphase.
prp(x = cart)

3.5 The predicted probability

【P3.5】What is the maximum predicted probability for any result?

# Class probabilities on the training data; the second column belongs to
# trial == 1. "prob" is what predict() already returns for a class tree.
pred <- predict(cart, type = "prob")[, 2]
# Largest predicted probability.
max(pred)
## [1] 0.87189
3.6 Similarity between testing & training data

【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?

  • The maximum predicted probability will likely be exactly the same in the testing set.
  • CART 對落在同一葉節點的觀測值給出相同的預測機率(由訓練資料算出),因此只要測試集有資料落入機率最高的葉節點,最大預測機率就會與訓練集完全相同。
3.7 Accuracy Metrics

Use a threshold probability of 0.5

# Confusion matrix on the training set: rows are the actual class, columns
# say whether the predicted probability exceeds the 0.5 threshold.
table(train[["trial"]], pred > 0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441

【P3.7a】What is the training set accuracy of the CART model?

# Accuracy: share of training rows whose thresholded prediction matches the
# label. Computed from pred and train rather than from counts copied out of
# the printed table, so it cannot go stale if the data or the model changes.
mean((pred > 0.5) == (train$trial == 1))
## [1] 0.82335

【P3.7b】What is the training set sensitivity of the CART model?

# Sensitivity: share of actual trials (trial == 1) predicted above the 0.5
# threshold. Computed from pred and train instead of hand-copied counts.
mean(pred[train$trial == 1] > 0.5)
## [1] 0.77098

【P3.7c】What is the training set specificity of the CART model?

# Specificity: share of non-trials (trial == 0) predicted at or below the 0.5
# threshold. Computed from pred and train instead of hand-copied counts.
mean(pred[train$trial == 0] <= 0.5)
## [1] 0.86438


Problem 4 - Evaluating the model on the testing set

4.1 Test Accuracy

【P4.1】What is the test accuracy?

# Predicted probability of trial == 1 for the held-out rows.
pred <- predict(cart, newdata = test)[, 2]
# Accuracy as the share of rows whose thresholded prediction matches the
# label. Summing the diagonal of table() only works for a square table, which
# it is not when every prediction falls on one side of 0.5.
mean((pred > 0.5) == (test$trial == 1))
## [1] 0.75806
4.2 Test AUC

【P4.2】What is the test AUC?

# Area under the ROC curve of the test-set probabilities against the labels.
colAUC(X = pred, y = test$trial)
##            [,1]
## 0 vs. 1 0.83711


Problem 5 - Decision Tradeoffs

The research procedure is …

5.1 Cost of False Negative

【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?

  • A paper that should have been included in Set A will be missed, affecting the quality of the results of Step 3
5.2 Cost of False Positive

【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?

  • A paper will be mistakenly added to Set A, yielding additional work in Step 2 of the process but not affecting the quality of the results of Step 3.
5.3 The Threshold

【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?

  • A false negative is more costly than a false positive; the decision maker should use a probability threshold less than 0.5 for the machine learning model.