packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
"rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")## [1] "C"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)【P1.1】How many characters are there in the longest abstract?
nchar(D$abstract) %>% max## [1] 3708
【P1.2】How many search results provided no abstract?
sum(nchar(D$abstract) == 0)## [1] 112
【P1.3】What is the shortest title?
D$title[ which.min(nchar(D$title)) ]## [1] "A decade of letrozole: FACE."
Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.
library(tm)
library(SnowballC)
# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT, content_transformer(tolower))
corpT = tm_map(corpT, removePunctuation)
corpT = tm_map(corpT, removeWords, stopwords("english"))
corpT = tm_map(corpT, stemDocument)
dtmT = DocumentTermMatrix(corpT); dtmT## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT ## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity : 81%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA, content_transformer(tolower))
corpA = tm_map(corpA, removePunctuation)
corpA = tm_map(corpA, removeWords, stopwords("english"))
corpA = tm_map(corpA, stemDocument)
dtmA = DocumentTermMatrix(corpA); dtmA## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity : 99%
## Maximal term length: 67
## Weighting : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA ## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity : 85%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))【P2.1b】How many terms remain in dtmA after removing sparse terms?
【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?
【P2.3】What is the most frequent word stem across all the abstracts?
which.max(colSums(dtmA))## patient
## 17
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))【P3.1】What was the effect of these functions?
dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial【P3.2】How many columns are in this combined data frame?
ncol(dtm)## [1] 367
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)【P3.3】What is the accuracy of the baseline model on the training set?
table(train$trial) %>% prop.table##
## 0 1
## 0.56068 0.43932
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")【P3.4】What is the name of the first variable the model split on?
prp(cart)【P3.5】What is the maximum predicted probability for any result?
pred = predict(cart)[,2]
max(pred)## [1] 0.87189
【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?
Use a threshold probability of 0.5
table(train$trial, pred > 0.5)##
## FALSE TRUE
## 0 631 99
## 1 131 441
【P3.7a】What is the training set accuracy of the CART model?
(631+441)/(631+441+131+99)## [1] 0.82335
【P3.7b】What is the training set sensitivity of the CART model?
441/(131+441)## [1] 0.77098
【P3.7c】What is the training set specificity of the CART model?
631/(631+99)## [1] 0.86438
【P4.1】What is the test accuracy?
pred = predict(cart,test)[,2]
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}## [1] 0.75806
【P4.2】What is the test AUC?
colAUC(pred, test$trial)## [,1]
## 0 vs. 1 0.83711
The research procedure is …
【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?
【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?
【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?