DATA PREPARATION

  1. Creat a word vector for description.

  2. Build a corpus using the word vector.

  3. Pre-processing tasks such as removing number, whitespaces, stopwords and conversion to lower case.

  4. Build a document term matrix (dtm).

  5. Remove sparse words from the above dtm.

  6. The above step leads to a count frequency matrix showing the frequency of each word in its coressponding column.

  7. Tranform count frequency matrix to a binary instance matrix, which shows occurences of a word in a document as either 0 or 1, 1 for being present and 0 for absent.

  8. Append the label column from the original notes dataset with the transformed dtm. The label column has 6 labels.

MODEL BUILDING

  1. Randomly sample the dtm and split it into a traning set and testing set.

  2. Build a base model of random forest with repeated 10-fold cross validation.

  3. Check for accuracy of the model on the training set and testing set.

Data preparation:

library(readr)
notes <- read_csv("C:/Users/welcome/Downloads/train_notes (1).csv")

notes$title <- as.factor(notes$title) # convert title to factor

notes <- notes[,-1] # remove firt column

notes <- notes[complete.cases(notes),]  # remove missing values if any

dim(notes)[1]*(0.7)  #  check for 70% of data
## [1] 9081.1
table(notes$title)  #  class instances of the complete data set
## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                3926                1515                5594 
##      Doctors Advice Organ Function Test     Patient Related 
##                1109                 670                 159

Word vector and Text pre-preprocessing:

require(tm)  # load text mining package

sd <- VectorSource(notes$description) # words vector

corpus <- Corpus(sd)  # build corpus

corpus <- tm_map(corpus, removeNumbers)  # remove numbers

corpus <- tm_map(corpus, removePunctuation) # remove puntucations

corpus <- tm_map(corpus, stripWhitespace) # remove  white spaces

corpus <- tm_map(corpus, removeWords, c(stopwords('english'), "and", "are", "the",
                                        "both", "appears", "within", "appear",
                                        "others", "clear", "right", "seen", 
                                        "well")) # remove stopwords

corpus <- tm_map(corpus, content_transformer(tolower))  # change to lower case

Building Document term matrix:

tdm <- DocumentTermMatrix(corpus)  # build document term matrix

tdm_sparse <- removeSparseTerms(tdm, 0.90) # remove sparse terms

tdm_dm <- as.data.frame(as.matrix(tdm_sparse)) # count matrix

tdm_df <- as.matrix((tdm_dm > 0) + 0) # binary instance matrix

tdm_df <- as.data.frame(tdm_df)

tdm_df <- cbind(tdm_df, notes$title) # append label column from original dataset

Document term matrix before removing sparse terms and after removing sparse terms.

tdm  # before removing sparse terms
## <<DocumentTermMatrix (documents: 12973, terms: 7396)>>
## Non-/sparse entries: 313200/95635108
## Sparsity           : 100%
## Maximal term length: 43
## Weighting          : term frequency (tf)
tdm_sparse  # after removing sparse terms
## <<DocumentTermMatrix (documents: 12973, terms: 56)>>
## Non-/sparse entries: 133350/593138
## Sparsity           : 82%
## Maximal term length: 16
## Weighting          : term frequency (tf)

Count frequency matrix VS Binary instance matrix

tdm_dm[1:10,] # Count frequency matrix
##    and biliary bladder both calculi cbd clinical corticomedullary
## 1   11       1       2    1       1   1        1                1
## 2    3       0       3    1       1   1        0                0
## 3    1       1       3    1       0   0        0                1
## 4    0       1       3    0       0   0        0                0
## 5    0       1       3    1       0   1        0                0
## 6   10       1       3    0       0   1        0                0
## 7   10       1       3    1       1   1        0                1
## 8    0       1       3    1       0   1        0                1
## 9    0       1       2    1       1   1        0                1
## 10   0       1       3    0       0   1        1                0
##    differentiation distended echotexture fluid focal free gall impression
## 1                1         2           2     1     2    1    1          1
## 2                0         1           2     0     1    0    2          1
## 3                0         0           1     0     1    0    2          1
## 4                0         0           0     0     1    0    2          1
## 5                0         0           1     0     1    0    2          1
## 6                0         0           0     0     1    0    2          0
## 7                1         1           2     0     1    0    2          1
## 8                0         0           1     0     1    1    2          1
## 9                0         0           0     0     1    0    1          1
## 10               0         0           0     0     1    0    2          1
##    kidneys left lesion liver maintained measures normal pancreas radicals
## 1        4    2      2     2          1        5      9        1        1
## 2        2    2      1     1          0        2     11        1        1
## 3        2    1      1     2          0        0     10        1        1
## 4        1    2      1     1          0        0     10        1        1
## 5        1    2      1     1          0        4     10        1        1
## 6        1    1      1     1          0        1     11        1        1
## 7        2    1      2     1          0        2     11        1        1
## 8        1    2      1     1          0        4     12        1        1
## 9        1    1      1     3          0        2      8        1        1
## 10       1    2      1     1          0        0     10        1        1
##    right size spleen urinary well dilatation echopattern evidence kidney
## 1      2    4      2       1    1          0           0        0      0
## 2      2    6      1       1    1          2           2        4      2
## 3      1    3      1       1    0          0           3        4      2
## 4      2    5      1       1    0          2           3        4      6
## 5      2    5      1       1    0          1           1        3      2
## 6      1    5      1       1    2          2           4        4      2
## 7      1    6      1       1    2          2           2        4      0
## 8      2    6      1       1    0          2           5        5      3
## 9      1    5      1       1    0          2           5        5      3
## 10     1    5      1       1    0          2           2        3      3
##    shape thickness wall abnormality detected intrahepatic prostate regular
## 1      0         0    0           0        0            0        0       0
## 2      5         3    2           0        0            0        0       0
## 3      5         2    2           1        1            1        1       0
## 4      6         1    2           1        0            1        1       0
## 5      6         4    2           1        1            1        0       0
## 6      7         4    2           0        0            1        0       0
## 7      6         5    2           0        0            1        0       0
## 8      8         4    2           1        1            1        0       0
## 9      6         2    1           0        0            1        1       0
## 10     7         2    2           1        0            1        1       0
##    cells pus study lesions mass epithelial angles lung contour central
## 1      0   0     0       0    0          0      0    0       0       0
## 2      0   0     0       0    0          0      0    0       0       0
## 3      0   0     0       0    0          0      0    0       0       0
## 4      0   0     0       0    0          0      0    0       0       0
## 5      0   0     0       0    0          0      0    0       0       0
## 6      0   0     0       0    0          0      0    0       0       0
## 7      0   0     0       0    0          0      0    0       0       0
## 8      0   0     0       0    0          0      0    0       0       0
## 9      0   0     0       0    0          0      0    0       0       0
## 10     0   0     0       0    0          0      0    0       0       0
##    pathology hpf echoes observation
## 1          0   0      0           0
## 2          0   0      0           0
## 3          0   0      0           0
## 4          0   0      0           0
## 5          0   0      0           0
## 6          0   0      0           0
## 7          0   0      0           0
## 8          0   0      0           0
## 9          0   0      0           0
## 10         0   0      0           0
tdm_df[1:10,] # Binary instance matrix
##    and biliary bladder both calculi cbd clinical corticomedullary
## 1    1       1       1    1       1   1        1                1
## 2    1       0       1    1       1   1        0                0
## 3    1       1       1    1       0   0        0                1
## 4    0       1       1    0       0   0        0                0
## 5    0       1       1    1       0   1        0                0
## 6    1       1       1    0       0   1        0                0
## 7    1       1       1    1       1   1        0                1
## 8    0       1       1    1       0   1        0                1
## 9    0       1       1    1       1   1        0                1
## 10   0       1       1    0       0   1        1                0
##    differentiation distended echotexture fluid focal free gall impression
## 1                1         1           1     1     1    1    1          1
## 2                0         1           1     0     1    0    1          1
## 3                0         0           1     0     1    0    1          1
## 4                0         0           0     0     1    0    1          1
## 5                0         0           1     0     1    0    1          1
## 6                0         0           0     0     1    0    1          0
## 7                1         1           1     0     1    0    1          1
## 8                0         0           1     0     1    1    1          1
## 9                0         0           0     0     1    0    1          1
## 10               0         0           0     0     1    0    1          1
##    kidneys left lesion liver maintained measures normal pancreas radicals
## 1        1    1      1     1          1        1      1        1        1
## 2        1    1      1     1          0        1      1        1        1
## 3        1    1      1     1          0        0      1        1        1
## 4        1    1      1     1          0        0      1        1        1
## 5        1    1      1     1          0        1      1        1        1
## 6        1    1      1     1          0        1      1        1        1
## 7        1    1      1     1          0        1      1        1        1
## 8        1    1      1     1          0        1      1        1        1
## 9        1    1      1     1          0        1      1        1        1
## 10       1    1      1     1          0        0      1        1        1
##    right size spleen urinary well dilatation echopattern evidence kidney
## 1      1    1      1       1    1          0           0        0      0
## 2      1    1      1       1    1          1           1        1      1
## 3      1    1      1       1    0          0           1        1      1
## 4      1    1      1       1    0          1           1        1      1
## 5      1    1      1       1    0          1           1        1      1
## 6      1    1      1       1    1          1           1        1      1
## 7      1    1      1       1    1          1           1        1      0
## 8      1    1      1       1    0          1           1        1      1
## 9      1    1      1       1    0          1           1        1      1
## 10     1    1      1       1    0          1           1        1      1
##    shape thickness wall abnormality detected intrahepatic prostate regular
## 1      0         0    0           0        0            0        0       0
## 2      1         1    1           0        0            0        0       0
## 3      1         1    1           1        1            1        1       0
## 4      1         1    1           1        0            1        1       0
## 5      1         1    1           1        1            1        0       0
## 6      1         1    1           0        0            1        0       0
## 7      1         1    1           0        0            1        0       0
## 8      1         1    1           1        1            1        0       0
## 9      1         1    1           0        0            1        1       0
## 10     1         1    1           1        0            1        1       0
##    cells pus study lesions mass epithelial angles lung contour central
## 1      0   0     0       0    0          0      0    0       0       0
## 2      0   0     0       0    0          0      0    0       0       0
## 3      0   0     0       0    0          0      0    0       0       0
## 4      0   0     0       0    0          0      0    0       0       0
## 5      0   0     0       0    0          0      0    0       0       0
## 6      0   0     0       0    0          0      0    0       0       0
## 7      0   0     0       0    0          0      0    0       0       0
## 8      0   0     0       0    0          0      0    0       0       0
## 9      0   0     0       0    0          0      0    0       0       0
## 10     0   0     0       0    0          0      0    0       0       0
##    pathology hpf echoes observation        notes$title
## 1          0   0      0           0 Diagnostic Imaging
## 2          0   0      0           0 Diagnostic Imaging
## 3          0   0      0           0 Diagnostic Imaging
## 4          0   0      0           0 Diagnostic Imaging
## 5          0   0      0           0 Diagnostic Imaging
## 6          0   0      0           0 Diagnostic Imaging
## 7          0   0      0           0 Diagnostic Imaging
## 8          0   0      0           0 Diagnostic Imaging
## 9          0   0      0           0 Diagnostic Imaging
## 10         0   0      0           0 Diagnostic Imaging

Data splitting into train and test sets

s <- sample(1:nrow(tdm_df), nrow(tdm_df)*(0.70), replace = FALSE) # random sampling

train <- tdm_df[s,] # training set

test <- tdm_df[-s,] # testing set
table(train$`notes$title`) # class instances in train data
## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                2748                1058                3925 
##      Doctors Advice Organ Function Test     Patient Related 
##                 761                 472                 117
table(test$`notes$title`) # class indtances in test data
## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                1178                 457                1669 
##      Doctors Advice Organ Function Test     Patient Related 
##                 348                 198                  42

Build Random forest model using repeated crossvalidation

require(caret)  # Load caret package

ctrl <- trainControl(method = "cv", number = 7) # 10-fold CV


set.seed(100)  

rf.tfidf <- train(train[,c(1:56)], train[,57],
                  method = "rpart", trControl = ctrl) # train random forest

rf.tfidf
## CART 
## 
## 9081 samples
##   56 predictor
##    6 classes: 'Body Fluid Analysis', 'Cytology Test', 'Diagnostic Imaging', 'Doctors Advice', 'Organ Function Test', 'Patient Related' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 7782, 7781, 7784, 7784, 7786, 7785, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03743212  0.7559641  0.6414526
##   0.04945694  0.6774839  0.5005556
##   0.45190070  0.5399708  0.2146879
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.03743212.

Results

pred <- predict(rf.tfidf, newdata = test)  # Predicted values on test set

summary(predict(rf.tfidf, newdata = test))
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                2016                 164                1712 
##      Doctors Advice Organ Function Test     Patient Related 
##                   0                   0                   0