Creat a word vector for description.
Build a corpus using the word vector.
Pre-processing tasks such as removing number, whitespaces, stopwords and conversion to lower case.
Build a document term matrix (dtm).
Remove sparse words from the above dtm.
The above step leads to a count frequency matrix showing the frequency of each word in its coressponding column.
Tranform count frequency matrix to a binary instance matrix, which shows occurences of a word in a document as either 0 or 1, 1 for being present and 0 for absent.
Append the label column from the original notes dataset with the transformed dtm. The label column has 6 labels.
Randomly sample the dtm and split it into a traning set and testing set.
Build a base model of random forest with repeated 10-fold cross validation.
Check for accuracy of the model on the training set and testing set.
library(readr)
notes <- read_csv("C:/Users/welcome/Downloads/train_notes (1).csv")
notes$title <- as.factor(notes$title) # convert title to factor
notes <- notes[,-1] # remove firt column
notes <- notes[complete.cases(notes),] # remove missing values if any
dim(notes)[1]*(0.7) # check for 70% of data
## [1] 9081.1
table(notes$title) # class instances of the complete data set
##
## Body Fluid Analysis Cytology Test Diagnostic Imaging
## 3926 1515 5594
## Doctors Advice Organ Function Test Patient Related
## 1109 670 159
require(tm) # load text mining package
sd <- VectorSource(notes$description) # words vector
corpus <- Corpus(sd) # build corpus
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removePunctuation) # remove puntucations
corpus <- tm_map(corpus, stripWhitespace) # remove white spaces
corpus <- tm_map(corpus, removeWords, c(stopwords('english'), "and", "are", "the",
"both", "appears", "within", "appear",
"others", "clear", "right", "seen",
"well")) # remove stopwords
corpus <- tm_map(corpus, content_transformer(tolower)) # change to lower case
tdm <- DocumentTermMatrix(corpus) # build document term matrix
tdm_sparse <- removeSparseTerms(tdm, 0.90) # remove sparse terms
tdm_dm <- as.data.frame(as.matrix(tdm_sparse)) # count matrix
tdm_df <- as.matrix((tdm_dm > 0) + 0) # binary instance matrix
tdm_df <- as.data.frame(tdm_df)
tdm_df <- cbind(tdm_df, notes$title) # append label column from original dataset
tdm # before removing sparse terms
## <<DocumentTermMatrix (documents: 12973, terms: 7396)>>
## Non-/sparse entries: 313200/95635108
## Sparsity : 100%
## Maximal term length: 43
## Weighting : term frequency (tf)
tdm_sparse # after removing sparse terms
## <<DocumentTermMatrix (documents: 12973, terms: 56)>>
## Non-/sparse entries: 133350/593138
## Sparsity : 82%
## Maximal term length: 16
## Weighting : term frequency (tf)
tdm_dm[1:10,] # Count frequency matrix
## and biliary bladder both calculi cbd clinical corticomedullary
## 1 11 1 2 1 1 1 1 1
## 2 3 0 3 1 1 1 0 0
## 3 1 1 3 1 0 0 0 1
## 4 0 1 3 0 0 0 0 0
## 5 0 1 3 1 0 1 0 0
## 6 10 1 3 0 0 1 0 0
## 7 10 1 3 1 1 1 0 1
## 8 0 1 3 1 0 1 0 1
## 9 0 1 2 1 1 1 0 1
## 10 0 1 3 0 0 1 1 0
## differentiation distended echotexture fluid focal free gall impression
## 1 1 2 2 1 2 1 1 1
## 2 0 1 2 0 1 0 2 1
## 3 0 0 1 0 1 0 2 1
## 4 0 0 0 0 1 0 2 1
## 5 0 0 1 0 1 0 2 1
## 6 0 0 0 0 1 0 2 0
## 7 1 1 2 0 1 0 2 1
## 8 0 0 1 0 1 1 2 1
## 9 0 0 0 0 1 0 1 1
## 10 0 0 0 0 1 0 2 1
## kidneys left lesion liver maintained measures normal pancreas radicals
## 1 4 2 2 2 1 5 9 1 1
## 2 2 2 1 1 0 2 11 1 1
## 3 2 1 1 2 0 0 10 1 1
## 4 1 2 1 1 0 0 10 1 1
## 5 1 2 1 1 0 4 10 1 1
## 6 1 1 1 1 0 1 11 1 1
## 7 2 1 2 1 0 2 11 1 1
## 8 1 2 1 1 0 4 12 1 1
## 9 1 1 1 3 0 2 8 1 1
## 10 1 2 1 1 0 0 10 1 1
## right size spleen urinary well dilatation echopattern evidence kidney
## 1 2 4 2 1 1 0 0 0 0
## 2 2 6 1 1 1 2 2 4 2
## 3 1 3 1 1 0 0 3 4 2
## 4 2 5 1 1 0 2 3 4 6
## 5 2 5 1 1 0 1 1 3 2
## 6 1 5 1 1 2 2 4 4 2
## 7 1 6 1 1 2 2 2 4 0
## 8 2 6 1 1 0 2 5 5 3
## 9 1 5 1 1 0 2 5 5 3
## 10 1 5 1 1 0 2 2 3 3
## shape thickness wall abnormality detected intrahepatic prostate regular
## 1 0 0 0 0 0 0 0 0
## 2 5 3 2 0 0 0 0 0
## 3 5 2 2 1 1 1 1 0
## 4 6 1 2 1 0 1 1 0
## 5 6 4 2 1 1 1 0 0
## 6 7 4 2 0 0 1 0 0
## 7 6 5 2 0 0 1 0 0
## 8 8 4 2 1 1 1 0 0
## 9 6 2 1 0 0 1 1 0
## 10 7 2 2 1 0 1 1 0
## cells pus study lesions mass epithelial angles lung contour central
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0
## pathology hpf echoes observation
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
tdm_df[1:10,] # Binary instance matrix
## and biliary bladder both calculi cbd clinical corticomedullary
## 1 1 1 1 1 1 1 1 1
## 2 1 0 1 1 1 1 0 0
## 3 1 1 1 1 0 0 0 1
## 4 0 1 1 0 0 0 0 0
## 5 0 1 1 1 0 1 0 0
## 6 1 1 1 0 0 1 0 0
## 7 1 1 1 1 1 1 0 1
## 8 0 1 1 1 0 1 0 1
## 9 0 1 1 1 1 1 0 1
## 10 0 1 1 0 0 1 1 0
## differentiation distended echotexture fluid focal free gall impression
## 1 1 1 1 1 1 1 1 1
## 2 0 1 1 0 1 0 1 1
## 3 0 0 1 0 1 0 1 1
## 4 0 0 0 0 1 0 1 1
## 5 0 0 1 0 1 0 1 1
## 6 0 0 0 0 1 0 1 0
## 7 1 1 1 0 1 0 1 1
## 8 0 0 1 0 1 1 1 1
## 9 0 0 0 0 1 0 1 1
## 10 0 0 0 0 1 0 1 1
## kidneys left lesion liver maintained measures normal pancreas radicals
## 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 0 1 1 1 1
## 3 1 1 1 1 0 0 1 1 1
## 4 1 1 1 1 0 0 1 1 1
## 5 1 1 1 1 0 1 1 1 1
## 6 1 1 1 1 0 1 1 1 1
## 7 1 1 1 1 0 1 1 1 1
## 8 1 1 1 1 0 1 1 1 1
## 9 1 1 1 1 0 1 1 1 1
## 10 1 1 1 1 0 0 1 1 1
## right size spleen urinary well dilatation echopattern evidence kidney
## 1 1 1 1 1 1 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 0 0 1 1 1
## 4 1 1 1 1 0 1 1 1 1
## 5 1 1 1 1 0 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1
## 7 1 1 1 1 1 1 1 1 0
## 8 1 1 1 1 0 1 1 1 1
## 9 1 1 1 1 0 1 1 1 1
## 10 1 1 1 1 0 1 1 1 1
## shape thickness wall abnormality detected intrahepatic prostate regular
## 1 0 0 0 0 0 0 0 0
## 2 1 1 1 0 0 0 0 0
## 3 1 1 1 1 1 1 1 0
## 4 1 1 1 1 0 1 1 0
## 5 1 1 1 1 1 1 0 0
## 6 1 1 1 0 0 1 0 0
## 7 1 1 1 0 0 1 0 0
## 8 1 1 1 1 1 1 0 0
## 9 1 1 1 0 0 1 1 0
## 10 1 1 1 1 0 1 1 0
## cells pus study lesions mass epithelial angles lung contour central
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0
## pathology hpf echoes observation notes$title
## 1 0 0 0 0 Diagnostic Imaging
## 2 0 0 0 0 Diagnostic Imaging
## 3 0 0 0 0 Diagnostic Imaging
## 4 0 0 0 0 Diagnostic Imaging
## 5 0 0 0 0 Diagnostic Imaging
## 6 0 0 0 0 Diagnostic Imaging
## 7 0 0 0 0 Diagnostic Imaging
## 8 0 0 0 0 Diagnostic Imaging
## 9 0 0 0 0 Diagnostic Imaging
## 10 0 0 0 0 Diagnostic Imaging
s <- sample(1:nrow(tdm_df), nrow(tdm_df)*(0.70), replace = FALSE) # random sampling
train <- tdm_df[s,] # training set
test <- tdm_df[-s,] # testing set
table(train$`notes$title`) # class instances in train data
##
## Body Fluid Analysis Cytology Test Diagnostic Imaging
## 2748 1058 3925
## Doctors Advice Organ Function Test Patient Related
## 761 472 117
table(test$`notes$title`) # class indtances in test data
##
## Body Fluid Analysis Cytology Test Diagnostic Imaging
## 1178 457 1669
## Doctors Advice Organ Function Test Patient Related
## 348 198 42
require(caret) # Load caret package
ctrl <- trainControl(method = "cv", number = 7) # 10-fold CV
set.seed(100)
rf.tfidf <- train(train[,c(1:56)], train[,57],
method = "rpart", trControl = ctrl) # train random forest
rf.tfidf
## CART
##
## 9081 samples
## 56 predictor
## 6 classes: 'Body Fluid Analysis', 'Cytology Test', 'Diagnostic Imaging', 'Doctors Advice', 'Organ Function Test', 'Patient Related'
##
## No pre-processing
## Resampling: Cross-Validated (7 fold)
## Summary of sample sizes: 7782, 7781, 7784, 7784, 7786, 7785, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.03743212 0.7559641 0.6414526
## 0.04945694 0.6774839 0.5005556
## 0.45190070 0.5399708 0.2146879
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03743212.
pred <- predict(rf.tfidf, newdata = test) # Predicted values on test set
summary(predict(rf.tfidf, newdata = test))
## Body Fluid Analysis Cytology Test Diagnostic Imaging
## 2016 164 1712
## Doctors Advice Organ Function Test Patient Related
## 0 0 0