packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart",
  "rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")
## [1] "C"
options(digits=5, scipen=10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)


Problem 1 - Data Exploration

Load clinical_trial.csv into a data frame called trials (remembering to add the argument stringsAsFactors=FALSE), and investigate the data frame with summary() and str().

trials= read.csv("data/clinical_trial.csv", stringsAsFactors = F )
summary(trials)
##     title             abstract             trial      
##  Length:1860        Length:1860        Min.   :0.000  
##  Class :character   Class :character   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Median :0.000  
##                                        Mean   :0.439  
##                                        3rd Qu.:1.000  
##                                        Max.   :1.000
str(trials)
## 'data.frame':    1860 obs. of  3 variables:
##  $ title   : chr  "Treatment of Hodgkin's disease and other cancers with 1,3-bis(2-chloroethyl)-1-nitrosourea (BCNU; NSC-409962)." "Cell mediated immune status in malignancy--pretherapy and post-therapy assessment." "Neoadjuvant vinorelbine-capecitabine versus docetaxel-doxorubicin-cyclophosphamide in early nonresponsive breas"| __truncated__ "Randomized phase 3 trial of fluorouracil, epirubicin, and cyclophosphamide alone or followed by Paclitaxel for "| __truncated__ ...
##  $ abstract: chr  "" "Twenty-eight cases of malignancies of different kinds were studied to assess T-cell activity and population bef"| __truncated__ "BACKGROUND: Among breast cancer patients, nonresponse to initial neoadjuvant chemotherapy is associated with un"| __truncated__ "BACKGROUND: Taxanes are among the most active drugs for the treatment of metastatic breast cancer, and, as a co"| __truncated__ ...
##  $ trial   : int  1 0 1 1 1 0 1 0 0 0 ...

IMPORTANT NOTE: Some students have been getting errors like “invalid multibyte string” when performing certain parts of this homework question. If this is happening to you, use the argument fileEncoding=“latin1” when reading in the file with read.csv. This should cause those errors to go away.

We can use R’s string functions to learn more about the titles and abstracts of the located papers. The nchar() function counts the number of characters in a piece of text. Using the nchar() function on the variables in the data frame, answer the following questions:

How many characters are there in the longest abstract? (Longest here is defined as the abstract with the largest number of characters.)

nchar(trials$abstract) %>% max
## [1] 3708
1.2 No abstract

How many search results provided no abstract? (HINT: A search result provided no abstract if the number of characters in the abstract field is zero.)

table(nchar(trials$abstract)==0)
## 
## FALSE  TRUE 
##  1748   112
  • 112
1.3 Shortest title

Find the observation with the minimum number of characters in the title (the variable “title”) out of all of the observations in this dataset. What is the text of the title of this article? Include capitalization and punctuation in your response, but don’t include the quotes.

#nchar(trials$title) %>% min #這個只能求出字數
trials$title[ which.min(nchar(trials$title)) ] #才能求出內容
## [1] "A decade of letrozole: FACE."

Problem 2 - Preparing the Corpus

Because we have both title and abstract information for trials, we need to build two corpora instead of one. Name them corpT and corpA.

2.1 Courps & DTM

Following the commands from lecture, perform the following tasks (you might need to load the “tm” package first if it isn’t already loaded). Make sure to perform them in this order.

    1. Convert the title variable to corpusTitle and the abstract variable to corpusAbstract.
    1. Convert corpusTitle and corpusAbstract to lowercase.
    1. Remove the punctuation in corpusTitle and corpusAbstract.
    1. Remove the English language stop words from corpusTitle and corpusAbstract.
    1. Stem the words in corpusTitle and corpusAbstract (each stemming might take a few minutes).
    1. Build a document term matrix called dtmTitle from corpusTitle and dtmAbstract from corpusAbstract.
    1. Limit dtmTitle and dtmAbstract to terms with sparseness of at most 95% (aka terms that appear in at least 5% of documents).
    1. Convert dtmTitle and dtmAbstract to data frames (keep the names dtmTitle and dtmAbstract).
#建立title variable的corpus
corpusT= Corpus(VectorSource(trials$title))
corpusT= tm_map(corpusT, content_transformer(tolower))#都轉成小寫
## Warning in tm_map.SimpleCorpus(corpusT, content_transformer(tolower)):
## transformation drops documents
corpusT= tm_map(corpusT, removePunctuation) #移除標點符號
## Warning in tm_map.SimpleCorpus(corpusT, removePunctuation): transformation
## drops documents
corpusT= tm_map(corpusT, removeWords, stopwords("english")) #移除stopwords
## Warning in tm_map.SimpleCorpus(corpusT, removeWords, stopwords("english")):
## transformation drops documents
corpusT= tm_map(corpusT, stemDocument)#做stem
## Warning in tm_map.SimpleCorpus(corpusT, stemDocument): transformation drops
## documents
dtmT= DocumentTermMatrix(corpusT) #; dtmT #建立文字矩陣
dtmT= removeSparseTerms(dtmT, 0.95)
dtmT= as.data.frame(as.matrix(dtmT)) #; dtmT
  
#建立abstract variable的corpus
corpusA= Corpus(VectorSource(trials$abstract))
corpusA= tm_map(corpusA, content_transformer(tolower))#都轉成小寫
## Warning in tm_map.SimpleCorpus(corpusA, content_transformer(tolower)):
## transformation drops documents
corpusA= tm_map(corpusA, removePunctuation) #移除標點符號
## Warning in tm_map.SimpleCorpus(corpusA, removePunctuation): transformation
## drops documents
corpusA= tm_map(corpusA, removeWords, stopwords("english")) #移除stopwords
## Warning in tm_map.SimpleCorpus(corpusA, removeWords, stopwords("english")):
## transformation drops documents
corpusA= tm_map(corpusA, stemDocument)#做stem
## Warning in tm_map.SimpleCorpus(corpusA, stemDocument): transformation drops
## documents
dtmA= DocumentTermMatrix(corpusA) #; dtmA #建立文字矩陣
dtmA= removeSparseTerms(dtmA, 0.95)
dtmA= as.data.frame(as.matrix(dtmA)) #; dtmA

If the code length(stopwords(“english”)) does not return 174 for you, then please run the line of code in this file, which will store the standard stop words in a variable called sw. When removing stop words, use tm_map(corpusTitle, removeWords, sw) and tm_map(corpusAbstract, removeWords, sw) instead of tm_map(corpusTitle, removeWords, stopwords(“english”)) and tm_map(corpusAbstract, removeWords, stopwords(“english”)).

length(stopwords("english")) 
## [1] 174

【P2.1a】How many terms remain in dtmT after removing sparse terms (aka how many columns does it have)?

  • 31

【P2.1b】How many terms remain in dtmA after removing sparse terms?

  • 335
2.2 Abstract is longer than title

What is the most likely reason why dtmAbstract has so many more terms than dtmTitle?

  • Abstracts tend to have many more words than titles
  • 用了較多字是主因,較多樣的詞彙僅是其次
2.3 The most frequent word in abstracts

【P2.3】What is the most frequent word stem across all the abstracts?

which.max(colSums(dtmA))
## patient 
##      17

在這裡要擅用wihch.max跟which.min

Problem 3 - Building a model

3.1 Make column names

We want to combine dtmTitle and dtmAbstract into a single data frame to make predictions. However, some of the variables in these data frames have the same names. To fix this issue, run the following commands:

colnames(dtmT) = paste0("T", colnames(dtmT))
colnames(dtmA) = paste0("A", colnames(dtmA))

What was the effect of these functions?

  • Adding the letter T in front of all the title variable names and adding the letter A in front of all the abstract variable names.
3.2 Combine DTMs & Add target varible

Using cbind(), combine dtmTitle and dtmAbstract into a single data frame called dtm:

dtm = cbind(dtmT, dtmA) #將兩者合併,便於建立預測模型及後續分析用

As we did in class, add the dependent variable “trial” to dtm, copying it from the original data frame called trials.

dtm$trial = trials$trial #把原本的trial(y)加回去,成為新的dtm

How many columns are in this combined data frame?

  • 367
3.3 Splitting data

Now that we have prepared our data frame, it’s time to split it into a training and testing set and to build regression models. Set the random seed to 144 and use the sample.split function from the caTools package to split dtm into data frames named “train” and “test”, putting 70% of the data in the training set. 到此整理完畢(即已將非結構資料轉換成結構),開始建立模型

set.seed(144)
spl = sample.split(dtm$trial, SplitRatio = 0.7)
tr = subset(dtm, spl)
ts = subset(dtm, !spl)

What is the accuracy of the baseline model on the training set? (Remember that the baseline model predicts the most frequent outcome in the training set for all observations.)

table(tr$trial) %>% prop.table
## 
##       0       1 
## 0.56068 0.43932
  • 0.56068
3.4 CART Model

Build a CART model called trialCART, using all the independent variables in the training set to train the model, and then plot the CART model. Just use the default parameters to build the model (don’t add a minbucket or cp value). Remember to add the method=“class” argument, since this is a classification problem.

trialCART = rpart(trial~., tr, method="class" )
prp(trialCART)

What is the name of the first variable the model split on?

  • Tphase
3.5 The predicted probability

Obtain the training set predictions for the model (do not yet predict on the test set). Extract the predicted probability of a result being a trial (recall that this involves not setting a type argument, and keeping only the second column of the predict output). What is the maximum predicted probability for any result?

pred = predict(trialCART)[,2] 
max(pred)
## [1] 0.87189
  • 0.87189
3.6 Similarity between testing & training data

【P3.6】Without running the analysis, how do you expect the maximum predicted probability to differ in the testing set?

  • The maximum predicted probability will likely be exactly the same in the testing set.
  • The CART tree assigns the same predicted probability to each leaf node and there are a small number of leaf nodes compared to data points, we expect exactly the same maximum predicted probability.
3.7 Accuracy Matrices

For these questions, use a threshold probability of 0.5 to predict that an observation is a clinical trial.

table(tr$trial, pred>=0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441

What is the training set accuracy of the CART model?

table(tr$trial, pred>=0.5) %>% {sum(diag(.))/sum(.)}
## [1] 0.82335

【P3.7b】What is the training set sensitivity of the CART model?

table(tr$trial, pred>=0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441
441/(441+131)
## [1] 0.77098

【P3.7c】What is the training set specificity of the CART model?

table(tr$trial, pred>=0.5)
##    
##     FALSE TRUE
##   0   631   99
##   1   131  441
631/(631+99)
## [1] 0.86438

Problem 4 - Evaluating the model on the testing set

4.1 Test Accuracy

Evaluate the CART model on the testing set using the predict function and creating a vector of predicted probabilities predTest.

What is the testing set accuracy, assuming a probability threshold of 0.5 for predicting that a result is a clinical trial? 【P4.1】What is the test accuracy?

pred1= predict(trialCART, ts)[,2]
table(ts$trial, pred1>0.5) %>% {sum(diag(.))/sum(.)}
## [1] 0.75806
4.2 Test AUC

Using the ROCR package, what is the testing set AUC of the prediction model?

colAUC(pred1, ts$trial)
##            [,1]
## 0 vs. 1 0.83711

Problem 5 - Decision Tradeoffs

The research procedure is …

5.1 Cost of False Negative

【P5.1】What is the cost associated with the model in Step 1 making a false negative prediction?

  • A paper that should have been included in Set A will be missed, affecting the quality of the results of Step 3.
  • FN:錯誤的反負向預測,即本該是1的卻被預測成0,如此將使原本該被納入A的研究被missed掉,進而影響第3步的資料萃取結果。
5.2 Cost of False Posituve

【P5.2】What is the cost associated with the model in Step 1 making a false positive prediction?

  • A paper will be mistakenly added to Set A, yielding additional work in Step 2 of the process but not affecting the quality of the results of Step 3.
  • 即使本不該被納入A的卻被納入,但因第2步當中研究暂將親自閱讀並判斷,使第二步的ACC為100%,因此錯誤資料在此即會被移除,並不會影響第三步結果。
5.3 The Threshold

【P5.3】Given the costs associated with false positives and false negatives, which of the following is most accurate?

  • A false negative is more costly than a false positive; the decision maker should use a probability threshold less than 0.5 for the machine learning model.
  • FN因為會影響到真正第三步的結果,因此應盡可能使其越低越好,即使如此會使用研究者需親閱的文獻增加,然卻較不容易遺漏掉不該被省略的。