packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
"rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")
## [1] "C"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)
【P1.1】How many characters are there in the longest abstract?
nchar(D$abstract) %>% max
## [1] 3708
【P1.2】How many search results provided no abstract?
sum(nchar(D$abstract) == 0)
## [1] 112
【P1.3】What is the shortest title?
D$title[ which.min(nchar(D$title)) ]
## [1] "A decade of letrozole: FACE."
Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT
and corpA
.
library(tm)
library(SnowballC)
# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
corpT = tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
corpT = tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
corpT = tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
dtmT = DocumentTermMatrix(corpT); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity : 81%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))
【P2.1a】How many terms remain in dtmT
after removing sparse terms (aka how many columns does it have)?
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA = tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA = tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA = tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
dtmA = DocumentTermMatrix(corpA); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity : 99%
## Maximal term length: 67
## Weighting : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity : 85%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))
【P2.1b】How many terms remain in dtmA
after removing sparse terms?
【P2.2】What is the most likely reason why dtmA
has so many more terms than dtmT
?
【P2.3】What is the most frequent word stem across all the abstracts?
which.max(colSums(dtmA))
## patient
## 17
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))
【P3.1】What was the effect of these functions?
dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial
【P3.2】How many columns are in this combined data frame?
ncol(dtm)
## [1] 367
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)
【P3.3】What is the accuracy of the baseline model on the training set?
table(train$trial) %>% prop.table
##
## 0 1
## 0.56068 0.43932
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")
【P3.4】What is the name of the first variable the model split on?
prp(cart)
【P3.5】What is the maximum predicted probability for any result?
pred = predict(cart)[,2]
max(pred)
## [1] 0.87189
【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?
Use a threshold probability of 0.5
table(train$trial, pred > 0.5)
##
## FALSE TRUE
## 0 631 99
## 1 131 441
【P3.7a】What is the training set accuracy of the CART model?
(631+441)/(631+441+131+99)
## [1] 0.82335
【P3.7b】What is the training set sensitivity of the CART model?
441/(131+441)
## [1] 0.77098
【P3.7c】What is the training set specificity of the CART model?
631/(631+99)
## [1] 0.86438
【P4.1】What is the test accuracy?
pred = predict(cart,test)[,2]
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.75806
【P4.2】What is the test AUC?
colAUC(pred, test$trial)
## [,1]
## 0 vs. 1 0.83711
The research procedure is …
【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?
【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?
【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?