Objective: Based on description of each labtest, predict whether the outcome is normal/abnormal/other

library(readr)
notes <- read_csv("C:/Users/welcome/Desktop/notes_labled_data.csv")

notes <- notes[complete.cases(notes),] # remove missing values

notes <- notes[,-c(1,2)] # remove redundant columns

notes$Status <- as.factor(notes$Status)  # convert Status column to factor
dim(notes) # shape of the dataset
## [1] 355   2

Class instances

table(notes$Status)
## 
## Abnormal   Normal    Other 
##      101      113      141

Make 3 subsets of the notes dataset, one subset for each class label and build document term matrix for each subset

normal <- notes[grepl("Normal", notes$Status, ignore.case = T),]

abnormal <- notes[grepl("Abnormal", notes$Status, ignore.case = T),]

other <- notes[grepl("Other", notes$Status, ignore.case = T),]

Build document term matrix for normal and get the key terms.

require(tm)  # load text mining package

sd1 <- VectorSource(normal$description) # words vector

corpus1 <- Corpus(sd1)  # build corpus



corpus1 <- tm_map(corpus1, removePunctuation) # remove puntucations

corpus1 <- tm_map(corpus1, stripWhitespace) # remove  white spaces

corpus1 <- tm_map(corpus1, removeWords, c(stopwords('english'), "and", "are", "the",
                                          "both", "appears", "within", "appear",
                                          "others", "clear", "right", "seen", 
                                          "well")) # remove stopwords

corpus1 <- tm_map(corpus1, content_transformer(tolower))  # change to lower case




tdm1 <- DocumentTermMatrix(corpus1)  # build document term matrix

tdm_sparse1 <- removeSparseTerms(tdm1, 0.97) # remove sparse terms

tdm_dm1 <- as.data.frame(as.matrix(tdm_sparse1)) # count matrix

tdm_df1 <- as.matrix((tdm_dm1 > 0) + 0) # binary instance matrix

tdm_df1 <- as.data.frame(tdm_df1)

tdm_df1 <- cbind(tdm_df1, normal$Status) # append label column from original dataset

terms1 <- as.data.frame(sapply(tdm_df1[,-68], sum))


terms1 <- cbind(rownames(terms1), terms1)
terms1
##                                    rownames(terms1)
## abnormality                             abnormality
## aortic                                       aortic
## area                                           area
## clinical                                   clinical
## evidence                                   evidence
## impression                               impression
## left                                           left
## lesion                                       lesion
## normal                                       normal
## noted                                         noted
## right                                         right
## shows                                         shows
## size                                           size
## small                                         small
## the                                             the
## wall                                           wall
## both                                           both
## nil                                             nil
## adequacy                                   adequacy
## and                                             and
## cells                                         cells
## comments                                   comments
## epithelial                               epithelial
## identified                               identified
## present                                     present
## routine                                     routine
## satisfactory                           satisfactory
## show                                           show
## smear                                         smear
## specimen                                   specimen
## are                                             are
## negative                                   negative
## bilateral                                 bilateral
## broncho                                     broncho
## configuration                         configuration
## costophrenic                           costophrenic
## diaphragm                                 diaphragm
## domes                                         domes
## heart                                         heart
## hila                                           hila
## limits                                       limits
## markings                                   markings
## prominent                                 prominent
## pulmonary                                 pulmonary
## vascular                                   vascular
## chest                                         chest
## fields                                       fields
## final                                         final
## lung                                           lung
## recesses                                   recesses
## soft                                           soft
## tissues                                     tissues
## absent                                       absent
## bilirubin                                 bilirubin
## urobilinogen                           urobilinogen
## tcsqctransitionalsquamous tcsqctransitionalsquamous
## wbcpus                                       wbcpus
## hpf                                             hpf
## 12hpf                                         12hpf
## pus                                             pus
## hilar                                         hilar
## pleural                                     pleural
## significant                             significant
## spaces                                       spaces
## rbc                                             rbc
## nitrogen                                   nitrogen
## urea                                           urea
##                           sapply(tdm_df1[, -68], sum)
## abnormality                                        21
## aortic                                             12
## area                                                8
## clinical                                           10
## evidence                                            7
## impression                                         22
## left                                                9
## lesion                                              8
## normal                                             45
## noted                                              10
## right                                               8
## shows                                              10
## size                                               17
## small                                              10
## the                                                26
## wall                                               12
## both                                               20
## nil                                                 9
## adequacy                                            7
## and                                                15
## cells                                              43
## comments                                            7
## epithelial                                         33
## identified                                          8
## present                                             7
## routine                                             8
## satisfactory                                        7
## show                                                8
## smear                                               9
## specimen                                            8
## are                                                 8
## negative                                           15
## bilateral                                           9
## broncho                                             7
## configuration                                       8
## costophrenic                                       10
## diaphragm                                          12
## domes                                              12
## heart                                              13
## hila                                                8
## limits                                             10
## markings                                            7
## prominent                                           8
## pulmonary                                          10
## vascular                                            8
## chest                                              12
## fields                                             10
## final                                               8
## lung                                               14
## recesses                                            8
## soft                                                8
## tissues                                             8
## absent                                             11
## bilirubin                                           7
## urobilinogen                                        7
## tcsqctransitionalsquamous                           8
## wbcpus                                             12
## hpf                                                13
## 12hpf                                               9
## pus                                                 9
## hilar                                               8
## pleural                                            10
## significant                                         8
## spaces                                              7
## rbc                                                 7
## nitrogen                                          107
## urea                                              107

Build document term matrix for abnormal, and get the key terms

sd2 <- VectorSource(abnormal$description) # words vector

corpus2 <- Corpus(sd2)  # build corpus

corpus2 <- tm_map(corpus2, removeNumbers)  # remove numbers

corpus2 <- tm_map(corpus2, removePunctuation) # remove puntucations

corpus2 <- tm_map(corpus2, stripWhitespace) # remove  white spaces

corpus2 <- tm_map(corpus2, removeWords, c(stopwords('english'), "and", "are", "the",
                                          "both", "appears", "within", "appear",
                                          "others", "clear", "right", "seen", 
                                          "well")) # remove stopwords

corpus2 <- tm_map(corpus2, content_transformer(tolower))  # change to lower case




tdm2 <- DocumentTermMatrix(corpus2)  # build document term matrix

tdm_sparse2<- removeSparseTerms(tdm2, 0.97) # remove sparse terms

tdm_dm2 <- as.data.frame(as.matrix(tdm_sparse2)) # count matrix

tdm_df2 <- as.matrix((tdm_dm2 > 0) + 0) # binary instance matrix

tdm_df2 <- as.data.frame(tdm_df2)

tdm_df2 <- cbind(tdm_df2, abnormal$Status) # append label column from original dataset

terms2 <- as.data.frame(sapply(tdm_df2[,-84], sum))


terms2 <- cbind(rownames(terms2), terms2)
terms2
##                  rownames(terms2) sapply(tdm_df2[, -84], sum)
## abnormalities       abnormalities                           4
## adequacy                 adequacy                           4
## and                           and                           7
## area                         area                           4
## cells                       cells                          13
## cellular                 cellular                           4
## cervical                 cervical                           4
## clinical                 clinical                           4
## comments                 comments                           4
## composition           composition                           4
## covering                 covering                           4
## epithelial             epithelial                          10
## few                           few                           4
## history                   history                           4
## identification     identification                           4
## identified             identified                           5
## information           information                           4
## interpretability interpretability                           4
## pap                           pap                           4
## patient                   patient                           4
## present                   present                           7
## preserved               preserved                           4
## provided                 provided                           4
## routine                   routine                           4
## sampled                   sampled                           4
## sampling                 sampling                           4
## satisfactory         satisfactory                           4
## seen                         seen                           4
## show                         show                           6
## slide                       slide                           4
## smear                       smear                           5
## source                     source                           4
## specimen                 specimen                           5
## technical               technical                           4
## than                         than                           4
## transformation     transformation                           4
## with                         with                           4
## zone                         zone                           4
## are                           are                           5
## evaluation             evaluation                           4
## for                           for                           4
## the                           the                          10
## bilateral               bilateral                           7
## both                         both                           8
## bronchitis             bronchitis                           6
## broncho                   broncho                           5
## cardiophrenic       cardiophrenic                           5
## configuration       configuration                           6
## costophrenic         costophrenic                           5
## diaphragm               diaphragm                           5
## domes                       domes                           6
## heart                       heart                           5
## hila                         hila                           5
## impression             impression                          12
## limits                     limits                           4
## markings                 markings                           5
## normal                     normal                          11
## prominent               prominent                           6
## pulmonary               pulmonary                           7
## size                         size                           8
## vascular                 vascular                           6
## abnormality           abnormality                           5
## arch                         arch                           4
## chest                       chest                           7
## final                       final                           5
## lung                         lung                           4
## recesses                 recesses                           4
## shows                       shows                           6
## soft                         soft                           4
## tissues                   tissues                           5
## wall                         wall                           7
## noted                       noted                           4
## hpf                           hpf                           5
## pus                           pus                           4
## left                         left                           4
## lobe                         lobe                           5
## pleural                   pleural                           4
## right                       right                           4
## thickening             thickening                           4
## upper                       upper                           4
## evidence                 evidence                           4
## nitrogen                 nitrogen                          71
## urea                         urea                          71

Build document term matrix for other, and get key terms

sd3 <- VectorSource(other$description) # words vector

corpus3 <- Corpus(sd3)  # build corpus

corpus3 <- tm_map(corpus3, removeNumbers)  # remove numbers

corpus3 <- tm_map(corpus3, removePunctuation) # remove puntucations

corpus3 <- tm_map(corpus3, stripWhitespace) # remove  white spaces

corpus3 <- tm_map(corpus3, removeWords, c(stopwords('english'), "and", "are", "the",
                                          "both", "appears", "within", "appear",
                                          "others", "clear", "right", "seen", 
                                          "well")) # remove stopwords

corpus3 <- tm_map(corpus3, content_transformer(tolower))  # change to lower case




tdm3 <- DocumentTermMatrix(corpus3)  # build document term matrix

tdm_sparse3 <- removeSparseTerms(tdm3, 0.95) # remove sparse terms

tdm_dm3 <- as.data.frame(as.matrix(tdm_sparse3)) # count matrix

tdm_df3 <- as.matrix((tdm_dm3 > 0) + 0) # binary instance matrix

tdm_df3 <- as.data.frame(tdm_df3)

tdm_df3 <- cbind(tdm_df3, other$Status) # append label column from original dataset

terms3 <- as.data.frame(sapply(tdm_df3[,-17], sum))


terms3 <- cbind(rownames(terms3), terms3)
terms3
##          rownames(terms3) sapply(tdm_df3[, -17], sum)
## diet                 diet                          37
## exercise         exercise                          29
## reduce             reduce                          10
## daily               daily                          16
## intake             intake                           8
## regular           regular                          15
## water               water                          10
## weight             weight                          17
## fat                   fat                          12
## low                   low                          19
## opinion           opinion                           8
## cold                 cold                          12
## cough               cough                          12
## pain                 pain                          14
## ear                   ear                          11
## headache         headache                           8

Get the key terms from the above three dtm’s

names <- c("diet","exercise", "reduce", "daily", "intake",
           "regular", "water", "weight", "fat", "low", "opinion", "cold",
           "cough", "pain", "ear", "headache", "cells", "epithelial",
           "present", "show", "bilateral", "bronchitis", "configuration",
           "domes", "impression", "prominent", "pulmonary", "size", "vascular",
           "chest", "wall", "nitrogen", "urea", "abnormality", "aortic",
           "impression", "normal", "size", "wall", "cells", "epithelial", 
           "negative", "diaphragm", "domes", "heart", "chest", "lung",
           "absent", "hpf", "wbcpus", "nitrogen", "urea", "urobilinogen",
           "rbc","costophrenic", "bilateral", "pleural", "broncho","clinical",
           "limits", "recesses", "tcsqctransitionalsquamous","smear",
           "specimen", "hilar", "pus","notes$Status")
length(names)
## [1] 67

Build document term matrix for the whole dataset at 100% sparsity

require(tm)  # load text mining package

sd <- VectorSource(notes$description) # words vector

corpus <- Corpus(sd)  # build corpus

corpus <- tm_map(corpus, removeNumbers)  # remove numbers

corpus <- tm_map(corpus, removePunctuation) # remove puntucations

corpus <- tm_map(corpus, stripWhitespace) # remove  white spaces

corpus <- tm_map(corpus, removeWords, c(stopwords('english'), "and", "are", "the",
                                        "both", "appears", "within", "appear",
                                        "others", "clear", "right", "seen", 
                                        "well")) # remove stopwords

corpus <- tm_map(corpus, content_transformer(tolower))  # change to lower case


tdm <- DocumentTermMatrix(corpus)  # build document term matrix


tdm_dm <- as.data.frame(as.matrix(tdm)) # count matrix

tdm_df <- as.matrix((tdm_dm > 0) + 0) # binary instance matrix

tdm_df <- as.data.frame(tdm_df)

tdm_df <- cbind(tdm_df, notes$Status)

Select the column names from the above tdm that match with the key terms.

library(tidyverse)


final <- tdm_df %>% select(names)  # select column names matching with key terms

set.seed(1234)

s <- sample(1:nrow(final), nrow(final)*(0.70), replace = FALSE) # random sampling

train <- final[s,] # training set

test <- final[-s,] # testing set

Fit a gradient boosting model

require(h2o)

localH2O <- h2o.init(nthreads = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 minutes 23 seconds 
##     H2O cluster version:        3.14.0.3 
##     H2O cluster version age:    4 months and 29 days !!! 
##     H2O cluster name:           H2O_started_from_R_Karthik_zun522 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.70 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.1 (2017-06-30)
h2o.init() # Initialize h20
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 minutes 24 seconds 
##     H2O cluster version:        3.14.0.3 
##     H2O cluster version age:    4 months and 29 days !!! 
##     H2O cluster name:           H2O_started_from_R_Karthik_zun522 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.70 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.1 (2017-06-30)
train.h2o <- as.h2o(train) # Train set converted to h2o dataframe
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
test.h2o <- as.h2o(test) # Test set converted to h2o dataframe
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
y.dep <- 57# Dependent Variable

x.indep <- c(1:56) # Independent variables

gbm <- h2o.gbm(y=y.dep, x=x.indep, training_frame = train.h2o, 
               ntrees=50, learn_rate=0.1, stopping_rounds = 5, nfolds = 5, seed = 1234)  # 5 fold cross validation
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%

Model performance on train data

h2o.performance(gbm, train.h2o)
## H2OMultinomialMetrics: gbm
## 
## Test Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.1329738
## RMSE: (Extract with `h2o.rmse`) 0.3646557
## Logloss: (Extract with `h2o.logloss`) 0.4168374
## Mean Per-Class Error: 0.1917808
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##          Abnormal Normal Other  Error       Rate
## Abnormal       64      7     2 0.1233 =   9 / 73
## Normal         25     40     8 0.4521 =  33 / 73
## Other           0      0   102 0.0000 =  0 / 102
## Totals         89     47   112 0.1694 = 42 / 248
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.830645
## 2 2  0.991935
## 3 3  1.000000

Model performance on test data

h2o.performance(gbm, test.h2o)
## H2OMultinomialMetrics: gbm
## 
## Test Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.1881815
## RMSE: (Extract with `h2o.rmse`) 0.4337989
## Logloss: (Extract with `h2o.logloss`) 0.5595079
## Mean Per-Class Error: 0.2535714
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##          Abnormal Normal Other  Error       Rate
## Abnormal       20      6     2 0.2857 =   8 / 28
## Normal         15     21     4 0.4750 =  19 / 40
## Other           0      0    39 0.0000 =   0 / 39
## Totals         35     27    45 0.2523 = 27 / 107
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.747664
## 2 2  0.981308
## 3 3  1.000000