packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
"rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# 建立一個模型幫我篩選東西,那我如果臨界機率猜比較低一點,我會篩比較多東西,我要看很多東西,但我比較不會漏掉;若臨界機率高,我就可以少看一點東西,但相對的會有風險,很多沒檢查到。rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")## [1] "C"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)D = read.csv("data/clinical_trial.csv", stringsAsFactors = F)【P1.1】How many characters are there in the longest abstract?
nchar(D$abstract) %>% max## [1] 3708
【P1.2】How many search results provided no abstract?
sum(nchar(D$abstract) == 0)## [1] 112
【P1.3】What is the shortest title?
D$title[ which.min(nchar(D$title)) ]## [1] "A decade of letrozole: FACE."
Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.
library(tm)
library(SnowballC)
# Corpus & DTM for Title
corpT = Corpus(VectorSource(D$title))
corpT = tm_map(corpT, content_transformer(tolower))## Warning in tm_map.SimpleCorpus(corpT, content_transformer(tolower)):
## transformation drops documents
corpT = tm_map(corpT, removePunctuation)## Warning in tm_map.SimpleCorpus(corpT, removePunctuation): transformation
## drops documents
corpT = tm_map(corpT, removeWords, stopwords("english"))## Warning in tm_map.SimpleCorpus(corpT, removeWords, stopwords("english")):
## transformation drops documents
corpT = tm_map(corpT, stemDocument)## Warning in tm_map.SimpleCorpus(corpT, stemDocument): transformation drops
## documents
dtmT = DocumentTermMatrix(corpT); dtmT## <<DocumentTermMatrix (documents: 1860, terms: 2833)>>
## Non-/sparse entries: 23416/5245964
## Sparsity : 100%
## Maximal term length: 49
## Weighting : term frequency (tf)
dtmT = removeSparseTerms(dtmT, 0.95); dtmT ## <<DocumentTermMatrix (documents: 1860, terms: 31)>>
## Non-/sparse entries: 10683/46977
## Sparsity : 81%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmT = as.data.frame(as.matrix(dtmT))【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?
corpA = Corpus(VectorSource(D$abstract))
corpA = tm_map(corpA, content_transformer(tolower))## Warning in tm_map.SimpleCorpus(corpA, content_transformer(tolower)):
## transformation drops documents
corpA = tm_map(corpA, removePunctuation)## Warning in tm_map.SimpleCorpus(corpA, removePunctuation): transformation
## drops documents
corpA = tm_map(corpA, removeWords, stopwords("english"))## Warning in tm_map.SimpleCorpus(corpA, removeWords, stopwords("english")):
## transformation drops documents
corpA = tm_map(corpA, stemDocument)## Warning in tm_map.SimpleCorpus(corpA, stemDocument): transformation drops
## documents
dtmA = DocumentTermMatrix(corpA); dtmA## <<DocumentTermMatrix (documents: 1860, terms: 12335)>>
## Non-/sparse entries: 153239/22789861
## Sparsity : 99%
## Maximal term length: 67
## Weighting : term frequency (tf)
dtmA = removeSparseTerms(dtmA, 0.95); dtmA ## <<DocumentTermMatrix (documents: 1860, terms: 335)>>
## Non-/sparse entries: 92007/531093
## Sparsity : 85%
## Maximal term length: 15
## Weighting : term frequency (tf)
dtmA = as.data.frame(as.matrix(dtmA))
#移除贅字【P2.1b】How many terms remain in dtmA after removing sparse terms?
【P2.2】What is the most likely reason why dtmA has so many more terms than dtmT?
【P2.3】What is the most frequent word stem across all the abstracts?
which.max(colSums(dtmA))## patient
## 17
colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))【P3.1】What was the effect of these functions?
dtm = cbind(dtmT, dtmA)
dtm$trial = D$trial【P3.2】How many columns are in this combined data frame?
ncol(dtm)## [1] 367
library(caTools)
set.seed(144)
spl = sample.split(dtm$trial, 0.7)
train = subset(dtm, spl == TRUE)
test = subset(dtm, spl == FALSE)【P3.3】What is the accuracy of the baseline model on the training set?
table(train$trial) %>% prop.table##
## 0 1
## 0.56068 0.43932
library(rpart)
library(rpart.plot)
cart = rpart(trial~., train, method="class")【P3.4】What is the name of the first variable the model split on?
prp(cart)【P3.5】What is the maximum predicted probability for any result?
pred = predict(cart)[,2]
max(pred)## [1] 0.87189
【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?
Use a threshold probability of 0.5
table(train$trial, pred > 0.5) #設置pred>0.5的用意是什麼?##
## FALSE TRUE
## 0 631 99
## 1 131 441
【P3.7a】What is the training set accuracy of the CART model?
(631+441)/(631+441+131+99)## [1] 0.82335
【P3.7b】What is the training set sensitivity of the CART model?
441/(131+441)## [1] 0.77098
【P3.7c】What is the training set specificity of the CART model?
631/(631+99)## [1] 0.86438
【P4.1】What is the test accuracy?
pred = predict(cart,test)[,2]
table(test$trial, pred > 0.5) %>% {sum(diag(.)) / sum(.)}## [1] 0.75806
【P4.2】What is the test AUC?
colAUC(pred, test$trial)## [,1]
## 0 vs. 1 0.83711
The research procedure is …
【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?
【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?
【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?