MODEL BUILDING

Randomly sample the dtm and split it into a traning set and testing set.
Build a base model of random forest with repeated 10-fold cross validation.
Check for accuracy of the model on the training set and testing set.

Data preparation:

library(readr)
notes <- read_csv("C:/Users/welcome/Downloads/train_notes (1).csv")

notes$title <- as.factor(notes$title) # convert title to factor

notes <- notes[,-1] # remove firt column

notes <- notes[complete.cases(notes),]  # remove missing values if any

dim(notes)[1]*(0.7)  #  check for 70% of data

## [1] 9081.1

table(notes$title)  #  class instances of the complete data set

## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                3926                1515                5594 
##      Doctors Advice Organ Function Test     Patient Related 
##                1109                 670                 159

Word vector and Text pre-preprocessing:

require(tm)  # load text mining package

sd <- VectorSource(notes$description) # words vector

corpus <- Corpus(sd)  # build corpus

corpus <- tm_map(corpus, removeNumbers)  # remove numbers

corpus <- tm_map(corpus, removePunctuation) # remove puntucations

corpus <- tm_map(corpus, stripWhitespace) # remove  white spaces

corpus <- tm_map(corpus, removeWords, c(stopwords('english'), "and", "are", "the",
                                        "both", "appears", "within", "appear",
                                        "others", "clear", "right", "seen", 
                                        "well")) # remove stopwords

corpus <- tm_map(corpus, content_transformer(tolower))  # change to lower case

Building Document term matrix:

tdm <- DocumentTermMatrix(corpus)  # build document term matrix

tdm_sparse <- removeSparseTerms(tdm, 0.90) # remove sparse terms

tdm_dm <- as.data.frame(as.matrix(tdm_sparse)) # count matrix

tdm_df <- as.matrix((tdm_dm > 0) + 0) # binary instance matrix

tdm_df <- as.data.frame(tdm_df)

tdm_df <- cbind(tdm_df, notes$title) # append label column from original dataset

Document term matrix before removing sparse terms and after removing sparse terms.

tdm  # before removing sparse terms

## <<DocumentTermMatrix (documents: 12973, terms: 7396)>>
## Non-/sparse entries: 313200/95635108
## Sparsity           : 100%
## Maximal term length: 43
## Weighting          : term frequency (tf)

tdm_sparse  # after removing sparse terms

## <<DocumentTermMatrix (documents: 12973, terms: 56)>>
## Non-/sparse entries: 133350/593138
## Sparsity           : 82%
## Maximal term length: 16
## Weighting          : term frequency (tf)

Count frequency matrix VS Binary instance matrix

tdm_dm[1:10,] # Count frequency matrix

##    and biliary bladder both calculi cbd clinical corticomedullary
## 1   11       1       2    1       1   1        1                1
## 2    3       0       3    1       1   1        0                0
## 3    1       1       3    1       0   0        0                1
## 4    0       1       3    0       0   0        0                0
## 5    0       1       3    1       0   1        0                0
## 6   10       1       3    0       0   1        0                0
## 7   10       1       3    1       1   1        0                1
## 8    0       1       3    1       0   1        0                1
## 9    0       1       2    1       1   1        0                1
## 10   0       1       3    0       0   1        1                0
##    differentiation distended echotexture fluid focal free gall impression
## 1                1         2           2     1     2    1    1          1
## 2                0         1           2     0     1    0    2          1
## 3                0         0           1     0     1    0    2          1
## 4                0         0           0     0     1    0    2          1
## 5                0         0           1     0     1    0    2          1
## 6                0         0           0     0     1    0    2          0
## 7                1         1           2     0     1    0    2          1
## 8                0         0           1     0     1    1    2          1
## 9                0         0           0     0     1    0    1          1
## 10               0         0           0     0     1    0    2          1
##    kidneys left lesion liver maintained measures normal pancreas radicals
## 1        4    2      2     2          1        5      9        1        1
## 2        2    2      1     1          0        2     11        1        1
## 3        2    1      1     2          0        0     10        1        1
## 4        1    2      1     1          0        0     10        1        1
## 5        1    2      1     1          0        4     10        1        1
## 6        1    1      1     1          0        1     11        1        1
## 7        2    1      2     1          0        2     11        1        1
## 8        1    2      1     1          0        4     12        1        1
## 9        1    1      1     3          0        2      8        1        1
## 10       1    2      1     1          0        0     10        1        1
##    right size spleen urinary well dilatation echopattern evidence kidney
## 1      2    4      2       1    1          0           0        0      0
## 2      2    6      1       1    1          2           2        4      2
## 3      1    3      1       1    0          0           3        4      2
## 4      2    5      1       1    0          2           3        4      6
## 5      2    5      1       1    0          1           1        3      2
## 6      1    5      1       1    2          2           4        4      2
## 7      1    6      1       1    2          2           2        4      0
## 8      2    6      1       1    0          2           5        5      3
## 9      1    5      1       1    0          2           5        5      3
## 10     1    5      1       1    0          2           2        3      3
##    shape thickness wall abnormality detected intrahepatic prostate regular
## 1      0         0    0           0        0            0        0       0
## 2      5         3    2           0        0            0        0       0
## 3      5         2    2           1        1            1        1       0
## 4      6         1    2           1        0            1        1       0
## 5      6         4    2           1        1            1        0       0
## 6      7         4    2           0        0            1        0       0
## 7      6         5    2           0        0            1        0       0
## 8      8         4    2           1        1            1        0       0
## 9      6         2    1           0        0            1        1       0
## 10     7         2    2           1        0            1        1       0
##    cells pus study lesions mass epithelial angles lung contour central
## 1      0   0     0       0    0          0      0    0       0       0
## 2      0   0     0       0    0          0      0    0       0       0
## 3      0   0     0       0    0          0      0    0       0       0
## 4      0   0     0       0    0          0      0    0       0       0
## 5      0   0     0       0    0          0      0    0       0       0
## 6      0   0     0       0    0          0      0    0       0       0
## 7      0   0     0       0    0          0      0    0       0       0
## 8      0   0     0       0    0          0      0    0       0       0
## 9      0   0     0       0    0          0      0    0       0       0
## 10     0   0     0       0    0          0      0    0       0       0
##    pathology hpf echoes observation
## 1          0   0      0           0
## 2          0   0      0           0
## 3          0   0      0           0
## 4          0   0      0           0
## 5          0   0      0           0
## 6          0   0      0           0
## 7          0   0      0           0
## 8          0   0      0           0
## 9          0   0      0           0
## 10         0   0      0           0

tdm_df[1:10,] # Binary instance matrix

##    and biliary bladder both calculi cbd clinical corticomedullary
## 1    1       1       1    1       1   1        1                1
## 2    1       0       1    1       1   1        0                0
## 3    1       1       1    1       0   0        0                1
## 4    0       1       1    0       0   0        0                0
## 5    0       1       1    1       0   1        0                0
## 6    1       1       1    0       0   1        0                0
## 7    1       1       1    1       1   1        0                1
## 8    0       1       1    1       0   1        0                1
## 9    0       1       1    1       1   1        0                1
## 10   0       1       1    0       0   1        1                0
##    differentiation distended echotexture fluid focal free gall impression
## 1                1         1           1     1     1    1    1          1
## 2                0         1           1     0     1    0    1          1
## 3                0         0           1     0     1    0    1          1
## 4                0         0           0     0     1    0    1          1
## 5                0         0           1     0     1    0    1          1
## 6                0         0           0     0     1    0    1          0
## 7                1         1           1     0     1    0    1          1
## 8                0         0           1     0     1    1    1          1
## 9                0         0           0     0     1    0    1          1
## 10               0         0           0     0     1    0    1          1
##    kidneys left lesion liver maintained measures normal pancreas radicals
## 1        1    1      1     1          1        1      1        1        1
## 2        1    1      1     1          0        1      1        1        1
## 3        1    1      1     1          0        0      1        1        1
## 4        1    1      1     1          0        0      1        1        1
## 5        1    1      1     1          0        1      1        1        1
## 6        1    1      1     1          0        1      1        1        1
## 7        1    1      1     1          0        1      1        1        1
## 8        1    1      1     1          0        1      1        1        1
## 9        1    1      1     1          0        1      1        1        1
## 10       1    1      1     1          0        0      1        1        1
##    right size spleen urinary well dilatation echopattern evidence kidney
## 1      1    1      1       1    1          0           0        0      0
## 2      1    1      1       1    1          1           1        1      1
## 3      1    1      1       1    0          0           1        1      1
## 4      1    1      1       1    0          1           1        1      1
## 5      1    1      1       1    0          1           1        1      1
## 6      1    1      1       1    1          1           1        1      1
## 7      1    1      1       1    1          1           1        1      0
## 8      1    1      1       1    0          1           1        1      1
## 9      1    1      1       1    0          1           1        1      1
## 10     1    1      1       1    0          1           1        1      1
##    shape thickness wall abnormality detected intrahepatic prostate regular
## 1      0         0    0           0        0            0        0       0
## 2      1         1    1           0        0            0        0       0
## 3      1         1    1           1        1            1        1       0
## 4      1         1    1           1        0            1        1       0
## 5      1         1    1           1        1            1        0       0
## 6      1         1    1           0        0            1        0       0
## 7      1         1    1           0        0            1        0       0
## 8      1         1    1           1        1            1        0       0
## 9      1         1    1           0        0            1        1       0
## 10     1         1    1           1        0            1        1       0
##    cells pus study lesions mass epithelial angles lung contour central
## 1      0   0     0       0    0          0      0    0       0       0
## 2      0   0     0       0    0          0      0    0       0       0
## 3      0   0     0       0    0          0      0    0       0       0
## 4      0   0     0       0    0          0      0    0       0       0
## 5      0   0     0       0    0          0      0    0       0       0
## 6      0   0     0       0    0          0      0    0       0       0
## 7      0   0     0       0    0          0      0    0       0       0
## 8      0   0     0       0    0          0      0    0       0       0
## 9      0   0     0       0    0          0      0    0       0       0
## 10     0   0     0       0    0          0      0    0       0       0
##    pathology hpf echoes observation        notes$title
## 1          0   0      0           0 Diagnostic Imaging
## 2          0   0      0           0 Diagnostic Imaging
## 3          0   0      0           0 Diagnostic Imaging
## 4          0   0      0           0 Diagnostic Imaging
## 5          0   0      0           0 Diagnostic Imaging
## 6          0   0      0           0 Diagnostic Imaging
## 7          0   0      0           0 Diagnostic Imaging
## 8          0   0      0           0 Diagnostic Imaging
## 9          0   0      0           0 Diagnostic Imaging
## 10         0   0      0           0 Diagnostic Imaging

Data splitting into train and test sets

s <- sample(1:nrow(tdm_df), nrow(tdm_df)*(0.70), replace = FALSE) # random sampling

train <- tdm_df[s,] # training set

test <- tdm_df[-s,] # testing set

table(train$`notes$title`) # class instances in train data

## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                2748                1058                3925 
##      Doctors Advice Organ Function Test     Patient Related 
##                 761                 472                 117

table(test$`notes$title`) # class indtances in test data

## 
## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                1178                 457                1669 
##      Doctors Advice Organ Function Test     Patient Related 
##                 348                 198                  42

Build Random forest model using repeated crossvalidation

require(caret)  # Load caret package

ctrl <- trainControl(method = "cv", number = 7) # 10-fold CV


set.seed(100)  

rf.tfidf <- train(train[,c(1:56)], train[,57],
                  method = "rpart", trControl = ctrl) # train random forest

rf.tfidf

## CART 
## 
## 9081 samples
##   56 predictor
##    6 classes: 'Body Fluid Analysis', 'Cytology Test', 'Diagnostic Imaging', 'Doctors Advice', 'Organ Function Test', 'Patient Related' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 7782, 7781, 7784, 7784, 7786, 7785, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03743212  0.7559641  0.6414526
##   0.04945694  0.6774839  0.5005556
##   0.45190070  0.5399708  0.2146879
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.03743212.

Results

pred <- predict(rf.tfidf, newdata = test)  # Predicted values on test set

summary(predict(rf.tfidf, newdata = test))

## Body Fluid Analysis       Cytology Test  Diagnostic Imaging 
##                2016                 164                1712 
##      Doctors Advice Organ Function Test     Patient Related 
##                   0                   0                   0

Multiclass classification for text data

karthik

January 3, 2018

DATA PREPARATION

MODEL BUILDING

Data preparation:

Word vector and Text pre-preprocessing:

Building Document term matrix:

Document term matrix before removing sparse terms and after removing sparse terms.

Count frequency matrix VS Binary instance matrix

Data splitting into train and test sets

Build Random forest model using repeated crossvalidation

Results