Objective: Based on description of each labtest, predict whether the outcome is normal/abnormal/other
library(readr)
notes <- read_csv("C:/Users/welcome/Desktop/notes_labled_data.csv")
notes <- notes[complete.cases(notes),] # remove missing values
notes <- notes[,-c(1,2)] # remove redundant columns
notes$Status <- as.factor(notes$Status) # convert Status column to factor
dim(notes) # shape of the dataset
## [1] 355 2
Class instances
table(notes$Status)
##
## Abnormal Normal Other
## 101 113 141
normal <- notes[grepl("Normal", notes$Status, ignore.case = T),]
abnormal <- notes[grepl("Abnormal", notes$Status, ignore.case = T),]
other <- notes[grepl("Other", notes$Status, ignore.case = T),]
require(tm) # load text mining package
sd1 <- VectorSource(normal$description) # words vector
corpus1 <- Corpus(sd1) # build corpus
corpus1 <- tm_map(corpus1, removePunctuation) # remove puntucations
corpus1 <- tm_map(corpus1, stripWhitespace) # remove white spaces
corpus1 <- tm_map(corpus1, removeWords, c(stopwords('english'), "and", "are", "the",
"both", "appears", "within", "appear",
"others", "clear", "right", "seen",
"well")) # remove stopwords
corpus1 <- tm_map(corpus1, content_transformer(tolower)) # change to lower case
tdm1 <- DocumentTermMatrix(corpus1) # build document term matrix
tdm_sparse1 <- removeSparseTerms(tdm1, 0.97) # remove sparse terms
tdm_dm1 <- as.data.frame(as.matrix(tdm_sparse1)) # count matrix
tdm_df1 <- as.matrix((tdm_dm1 > 0) + 0) # binary instance matrix
tdm_df1 <- as.data.frame(tdm_df1)
tdm_df1 <- cbind(tdm_df1, normal$Status) # append label column from original dataset
terms1 <- as.data.frame(sapply(tdm_df1[,-68], sum))
terms1 <- cbind(rownames(terms1), terms1)
terms1
## rownames(terms1)
## abnormality abnormality
## aortic aortic
## area area
## clinical clinical
## evidence evidence
## impression impression
## left left
## lesion lesion
## normal normal
## noted noted
## right right
## shows shows
## size size
## small small
## the the
## wall wall
## both both
## nil nil
## adequacy adequacy
## and and
## cells cells
## comments comments
## epithelial epithelial
## identified identified
## present present
## routine routine
## satisfactory satisfactory
## show show
## smear smear
## specimen specimen
## are are
## negative negative
## bilateral bilateral
## broncho broncho
## configuration configuration
## costophrenic costophrenic
## diaphragm diaphragm
## domes domes
## heart heart
## hila hila
## limits limits
## markings markings
## prominent prominent
## pulmonary pulmonary
## vascular vascular
## chest chest
## fields fields
## final final
## lung lung
## recesses recesses
## soft soft
## tissues tissues
## absent absent
## bilirubin bilirubin
## urobilinogen urobilinogen
## tcsqctransitionalsquamous tcsqctransitionalsquamous
## wbcpus wbcpus
## hpf hpf
## 12hpf 12hpf
## pus pus
## hilar hilar
## pleural pleural
## significant significant
## spaces spaces
## rbc rbc
## nitrogen nitrogen
## urea urea
## sapply(tdm_df1[, -68], sum)
## abnormality 21
## aortic 12
## area 8
## clinical 10
## evidence 7
## impression 22
## left 9
## lesion 8
## normal 45
## noted 10
## right 8
## shows 10
## size 17
## small 10
## the 26
## wall 12
## both 20
## nil 9
## adequacy 7
## and 15
## cells 43
## comments 7
## epithelial 33
## identified 8
## present 7
## routine 8
## satisfactory 7
## show 8
## smear 9
## specimen 8
## are 8
## negative 15
## bilateral 9
## broncho 7
## configuration 8
## costophrenic 10
## diaphragm 12
## domes 12
## heart 13
## hila 8
## limits 10
## markings 7
## prominent 8
## pulmonary 10
## vascular 8
## chest 12
## fields 10
## final 8
## lung 14
## recesses 8
## soft 8
## tissues 8
## absent 11
## bilirubin 7
## urobilinogen 7
## tcsqctransitionalsquamous 8
## wbcpus 12
## hpf 13
## 12hpf 9
## pus 9
## hilar 8
## pleural 10
## significant 8
## spaces 7
## rbc 7
## nitrogen 107
## urea 107
sd2 <- VectorSource(abnormal$description) # words vector
corpus2 <- Corpus(sd2) # build corpus
corpus2 <- tm_map(corpus2, removeNumbers) # remove numbers
corpus2 <- tm_map(corpus2, removePunctuation) # remove puntucations
corpus2 <- tm_map(corpus2, stripWhitespace) # remove white spaces
corpus2 <- tm_map(corpus2, removeWords, c(stopwords('english'), "and", "are", "the",
"both", "appears", "within", "appear",
"others", "clear", "right", "seen",
"well")) # remove stopwords
corpus2 <- tm_map(corpus2, content_transformer(tolower)) # change to lower case
tdm2 <- DocumentTermMatrix(corpus2) # build document term matrix
tdm_sparse2<- removeSparseTerms(tdm2, 0.97) # remove sparse terms
tdm_dm2 <- as.data.frame(as.matrix(tdm_sparse2)) # count matrix
tdm_df2 <- as.matrix((tdm_dm2 > 0) + 0) # binary instance matrix
tdm_df2 <- as.data.frame(tdm_df2)
tdm_df2 <- cbind(tdm_df2, abnormal$Status) # append label column from original dataset
terms2 <- as.data.frame(sapply(tdm_df2[,-84], sum))
terms2 <- cbind(rownames(terms2), terms2)
terms2
## rownames(terms2) sapply(tdm_df2[, -84], sum)
## abnormalities abnormalities 4
## adequacy adequacy 4
## and and 7
## area area 4
## cells cells 13
## cellular cellular 4
## cervical cervical 4
## clinical clinical 4
## comments comments 4
## composition composition 4
## covering covering 4
## epithelial epithelial 10
## few few 4
## history history 4
## identification identification 4
## identified identified 5
## information information 4
## interpretability interpretability 4
## pap pap 4
## patient patient 4
## present present 7
## preserved preserved 4
## provided provided 4
## routine routine 4
## sampled sampled 4
## sampling sampling 4
## satisfactory satisfactory 4
## seen seen 4
## show show 6
## slide slide 4
## smear smear 5
## source source 4
## specimen specimen 5
## technical technical 4
## than than 4
## transformation transformation 4
## with with 4
## zone zone 4
## are are 5
## evaluation evaluation 4
## for for 4
## the the 10
## bilateral bilateral 7
## both both 8
## bronchitis bronchitis 6
## broncho broncho 5
## cardiophrenic cardiophrenic 5
## configuration configuration 6
## costophrenic costophrenic 5
## diaphragm diaphragm 5
## domes domes 6
## heart heart 5
## hila hila 5
## impression impression 12
## limits limits 4
## markings markings 5
## normal normal 11
## prominent prominent 6
## pulmonary pulmonary 7
## size size 8
## vascular vascular 6
## abnormality abnormality 5
## arch arch 4
## chest chest 7
## final final 5
## lung lung 4
## recesses recesses 4
## shows shows 6
## soft soft 4
## tissues tissues 5
## wall wall 7
## noted noted 4
## hpf hpf 5
## pus pus 4
## left left 4
## lobe lobe 5
## pleural pleural 4
## right right 4
## thickening thickening 4
## upper upper 4
## evidence evidence 4
## nitrogen nitrogen 71
## urea urea 71
sd3 <- VectorSource(other$description) # words vector
corpus3 <- Corpus(sd3) # build corpus
corpus3 <- tm_map(corpus3, removeNumbers) # remove numbers
corpus3 <- tm_map(corpus3, removePunctuation) # remove puntucations
corpus3 <- tm_map(corpus3, stripWhitespace) # remove white spaces
corpus3 <- tm_map(corpus3, removeWords, c(stopwords('english'), "and", "are", "the",
"both", "appears", "within", "appear",
"others", "clear", "right", "seen",
"well")) # remove stopwords
corpus3 <- tm_map(corpus3, content_transformer(tolower)) # change to lower case
tdm3 <- DocumentTermMatrix(corpus3) # build document term matrix
tdm_sparse3 <- removeSparseTerms(tdm3, 0.95) # remove sparse terms
tdm_dm3 <- as.data.frame(as.matrix(tdm_sparse3)) # count matrix
tdm_df3 <- as.matrix((tdm_dm3 > 0) + 0) # binary instance matrix
tdm_df3 <- as.data.frame(tdm_df3)
tdm_df3 <- cbind(tdm_df3, other$Status) # append label column from original dataset
terms3 <- as.data.frame(sapply(tdm_df3[,-17], sum))
terms3 <- cbind(rownames(terms3), terms3)
terms3
## rownames(terms3) sapply(tdm_df3[, -17], sum)
## diet diet 37
## exercise exercise 29
## reduce reduce 10
## daily daily 16
## intake intake 8
## regular regular 15
## water water 10
## weight weight 17
## fat fat 12
## low low 19
## opinion opinion 8
## cold cold 12
## cough cough 12
## pain pain 14
## ear ear 11
## headache headache 8
names <- c("diet","exercise", "reduce", "daily", "intake",
"regular", "water", "weight", "fat", "low", "opinion", "cold",
"cough", "pain", "ear", "headache", "cells", "epithelial",
"present", "show", "bilateral", "bronchitis", "configuration",
"domes", "impression", "prominent", "pulmonary", "size", "vascular",
"chest", "wall", "nitrogen", "urea", "abnormality", "aortic",
"impression", "normal", "size", "wall", "cells", "epithelial",
"negative", "diaphragm", "domes", "heart", "chest", "lung",
"absent", "hpf", "wbcpus", "nitrogen", "urea", "urobilinogen",
"rbc","costophrenic", "bilateral", "pleural", "broncho","clinical",
"limits", "recesses", "tcsqctransitionalsquamous","smear",
"specimen", "hilar", "pus","notes$Status")
length(names)
## [1] 67
require(tm) # load text mining package
sd <- VectorSource(notes$description) # words vector
corpus <- Corpus(sd) # build corpus
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removePunctuation) # remove puntucations
corpus <- tm_map(corpus, stripWhitespace) # remove white spaces
corpus <- tm_map(corpus, removeWords, c(stopwords('english'), "and", "are", "the",
"both", "appears", "within", "appear",
"others", "clear", "right", "seen",
"well")) # remove stopwords
corpus <- tm_map(corpus, content_transformer(tolower)) # change to lower case
tdm <- DocumentTermMatrix(corpus) # build document term matrix
tdm_dm <- as.data.frame(as.matrix(tdm)) # count matrix
tdm_df <- as.matrix((tdm_dm > 0) + 0) # binary instance matrix
tdm_df <- as.data.frame(tdm_df)
tdm_df <- cbind(tdm_df, notes$Status)
library(tidyverse)
final <- tdm_df %>% select(names) # select column names matching with key terms
set.seed(1234)
s <- sample(1:nrow(final), nrow(final)*(0.70), replace = FALSE) # random sampling
train <- final[s,] # training set
test <- final[-s,] # testing set
require(h2o)
localH2O <- h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 minutes 23 seconds
## H2O cluster version: 3.14.0.3
## H2O cluster version age: 4 months and 29 days !!!
## H2O cluster name: H2O_started_from_R_Karthik_zun522
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.70 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.1 (2017-06-30)
h2o.init() # Initialize h20
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 minutes 24 seconds
## H2O cluster version: 3.14.0.3
## H2O cluster version age: 4 months and 29 days !!!
## H2O cluster name: H2O_started_from_R_Karthik_zun522
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.70 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.1 (2017-06-30)
train.h2o <- as.h2o(train) # Train set converted to h2o dataframe
##
|
| | 0%
|
|=================================================================| 100%
test.h2o <- as.h2o(test) # Test set converted to h2o dataframe
##
|
| | 0%
|
|=================================================================| 100%
y.dep <- 57# Dependent Variable
x.indep <- c(1:56) # Independent variables
gbm <- h2o.gbm(y=y.dep, x=x.indep, training_frame = train.h2o,
ntrees=50, learn_rate=0.1, stopping_rounds = 5, nfolds = 5, seed = 1234) # 5 fold cross validation
##
|
| | 0%
|
|==================================================== | 80%
|
|=================================================================| 100%
h2o.performance(gbm, train.h2o)
## H2OMultinomialMetrics: gbm
##
## Test Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.1329738
## RMSE: (Extract with `h2o.rmse`) 0.3646557
## Logloss: (Extract with `h2o.logloss`) 0.4168374
## Mean Per-Class Error: 0.1917808
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Abnormal Normal Other Error Rate
## Abnormal 64 7 2 0.1233 = 9 / 73
## Normal 25 40 8 0.4521 = 33 / 73
## Other 0 0 102 0.0000 = 0 / 102
## Totals 89 47 112 0.1694 = 42 / 248
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios:
## k hit_ratio
## 1 1 0.830645
## 2 2 0.991935
## 3 3 1.000000
h2o.performance(gbm, test.h2o)
## H2OMultinomialMetrics: gbm
##
## Test Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.1881815
## RMSE: (Extract with `h2o.rmse`) 0.4337989
## Logloss: (Extract with `h2o.logloss`) 0.5595079
## Mean Per-Class Error: 0.2535714
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## Abnormal Normal Other Error Rate
## Abnormal 20 6 2 0.2857 = 8 / 28
## Normal 15 21 4 0.4750 = 19 / 40
## Other 0 0 39 0.0000 = 0 / 39
## Totals 35 27 45 0.2523 = 27 / 107
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios:
## k hit_ratio
## 1 1 0.747664
## 2 2 0.981308
## 3 3 1.000000