packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
"rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")
## [1] "C/C/C/C/C/zh_TW.UTF-8"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)

D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)
nchar(D$abstract) %>% max
## [1] 3708
sum(nchar(D$abstract) == 0)
## [1] 112
D$title[ which.min(nchar(D$title)) ]
## [1] "A decade of letrozole: FACE."
Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.
library(tm)
library(SnowballC)
# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
corpT = tm_map(corpT, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
corpT = tm_map(corpT, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
corpT = tm_map(corpT, stemDocument)
## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
dtmT = DocumentTermMatrix(corpT); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 2831)>>
## Non-/sparse entries: 23415/5242245
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT
## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10684/46976
## Sparsity : 81%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))

How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)? From the DTM summary above, 31.
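A quick check in code, assuming dtmT has already been converted to a data frame as above:

ncol(dtmT)  # expected: 31, matching the DTM summary above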
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA = tm_map(corpA, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA = tm_map(corpA, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA = tm_map(corpA, stemDocument)
## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
dtmA = DocumentTermMatrix(corpA); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 12209)>>
## Non-/sparse entries: 153164/22555576
## Sparsity : 99%
## Maximal term length: 67
## Weighting : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA
## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92016/531084
## Sparsity : 85%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))

How many terms remain in dtmA after removing sparse terms? From the DTM summary above, 335.
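Again, this can be confirmed directly on the data frame:

ncol(dtmA)  # expected: 335, matching the DTM summary above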
Why does dtmA have so many more terms than dtmT? Abstracts are far longer than titles, so they contain many more distinct words, and many more of those words survive the 5% sparsity cutoff.
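A rough way to see this is to compare document lengths; a minimal sketch:

# Titles are a sentence or so; abstracts run to thousands of characters
summary(nchar(D$title))
summary(nchar(D$abstract))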
which.max(colSums(dtmA))
## patient 
##      17
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))

dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial
ncol(dtm)
## [1] 367
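A quick sanity check on that width, assuming the term counts reported above (31 title terms, 335 abstract terms, plus the trial outcome):

ncol(dtmT) + ncol(dtmA) + 1  # 31 + 335 + 1 = 367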
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)

table(train$trial) %>% prop.table
## 
##       0       1 
## 0.56068 0.43932
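Class 0 is the majority class in the training set, so a baseline model that always predicts 0 would be right about 56% of the time; a minimal sketch:

# Baseline (majority-class) accuracy on the training set
max(prop.table(table(train$trial)))  # expected: 0.56068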
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")
prp(cart)
pred = predict(cart)[,2]
max(pred)
## [1] 0.87189
Using a threshold probability of 0.5 on the training-set predictions:
table(train$trial, pred > 0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441
# training accuracy
(631+441)/(631+441+131+99)
## [1] 0.82335
# sensitivity (true positive rate)
441/(131+441)
## [1] 0.77098
# specificity (true negative rate)
631/(631+99)
## [1] 0.86438
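The same metrics can also be read off the confusion matrix programmatically rather than typed by hand; a minimal sketch, assuming the layout of the table above (rows = actual class, columns = predicted > 0.5):

cm = table(train$trial, pred > 0.5)
sum(diag(cm)) / sum(cm)            # accuracy
cm["1", "TRUE"]  / sum(cm["1", ])  # sensitivity
cm["0", "FALSE"] / sum(cm["0", ])  # specificity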
pred = predict(cart, test)[,2]
# test-set accuracy at the 0.5 threshold
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}
## [1] 0.75806
colAUC(pred, test$trial)
##            [,1]
## 0 vs. 1 0.83711
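Since ROCR is already loaded, the test-set AUC can be cross-checked there as well; a minimal sketch, which should agree with the colAUC value above:

rocrPred = prediction(pred, test$trial)
performance(rocrPred, "auc")@y.values[[1]]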
The research procedure is …