Random Forest for Classification

This is the project for the National Syndromic Surveillance Program’s (NSSP) BioSense Platform. Data from this platform are emergency department records. We applied the Random Forest to chief complaints/diagnosis description, and see if it can successfully select the heat related illness.

Libraries

library(tm) ## text mining package, tm_map(), content_transformer()
library(SnowballC) ##used for stemming, stemDocument
library(RColorBrewer) 
library(wordcloud) ## wordcloud generator
library(randomForest) ## random forest
library(caret) ##ConfusionMatrix()
library(pROC) ## ROC and compute AUC
library(dplyr) ## data manipulation

Data Exploration

After removing the empty rows, the dataset has 2819 observations with 2 variables: Heat_Related_Illness (TRUE/FALSE), and CCUpdates (combination of chief complaints and diagnosis description). Some of CCUpdates are very long, so I display here only the first 10 words from CCUpdates for the first 6 rows of the data.

#The data is not allowed to open to public yet
rawdata<-read.csv("heatdata.csv",header=TRUE,na.strings=c("","NA"))
rawdata=rawdata[-1]%>%
  filter(is.na(CCUpdates)==FALSE)

show_data=data.frame(head(rawdata)[,1],gsub("^((\\w+\\W+){9}\\w+).*$","\\1",head(rawdata)$CCUpdates))
colnames(show_data)=c("Heat_Related_Illness","CCUpdates")
print(show_data)

##   Heat_Related_Illness
## 1                 TRUE
## 2                FALSE
## 3                FALSE
## 4                FALSE
## 5                 TRUE
## 6                FALSE
##                                                                             CCUpdates
## 1                                         Pt to ed by EMS; patient was walking in the
## 2                                                    H60.333 Swimmer's ear; bilateral
## 3 Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus
## 4                                                SOB I50.9 Heart failure; unspecified
## 5             dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA
## 6                                     pt reports that he is trying to detox from ETOH

Convert the logical variable to a factor (TRUE/FALSE), and show the count and relative frequency for TRUE/FALSE two levels

rawdata$Heat_Related_Illness=as.factor(rawdata$Heat_Related_Illness)
table(rawdata$Heat_Related_Illness)

## 
## FALSE  TRUE 
##  2500   319

round(prop.table(table(rawdata$Heat_Related_Illness)),2)

## 
## FALSE  TRUE 
##  0.89  0.11

Data Visualization

word cloud for TRUE and FALSE heat related illness.

heat_true<-subset(rawdata,Heat_Related_Illness=="TRUE")
wordcloud(heat_true$CCUpdates,max.words=100,color=brewer.pal(5,"Dark2"),random.order=FALSE)

heat_false<-subset(rawdata,Heat_Related_Illness=="FALSE")
wordcloud(heat_false$CCUpdates,max.words=100,color=brewer.pal(5,"Dark2"),random.order=FALSE)

Corpus Creation and cleasing

prepare a vector source object using VectorSource and supply the vector source to VCorpus

heat_corpus<-VCorpus(VectorSource(rawdata$CCUpdates))
#must use double bracket and as.character() to view a message
lapply(heat_corpus[1:5],as.character)

## $`1`
## [1] "Pt to ed by EMS; patient was walking in the heat when he had a syncopal episode and fell. Denies hitting his head; abrasions to left knee. States he feels dehydrated. R55 Syncope and collapse;T22.211A Burn of second degree of right forearm; initial encounter;T22.212A Burn of second degree of left forearm; initial encounter;T31.0 Burns involving less than 10% of body surface;X19.XXXA Contact with other heat and hot substances; initial encounter;Y92.480 Sidewalk as the place of occurrence of the external cause;T67.9XXA Effect of heat and light; unspecified; initial encounter;X30.XXXA Exposure to excessive natural heat; initial encounter;N17.9 Acute kidney failure; unspecified;D64.9 Anemia; unspecified;E87.6 Hypokalemia;E83.42 Hypomagnesemia;Z87.891 Personal history of nicotine dependence;F10.21 Alcohol dependence; in remission;I10 Essential (primary) hypertension"
## 
## $`2`
## [1] " H60.333 Swimmer's ear; bilateral"
## 
## $`3`
## [1] "Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus;G43.909 Migraine; unspecified; not intractable; without status migrainosus"
## 
## $`4`
## [1] "SOB I50.9 Heart failure; unspecified"
## 
## $`5`
## [1] "dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA EXPOSURE TO OTHER SPECIFIED FACTORS; INITIAL ENCOUNTER;R42 DIZZINESS AND GIDDINESS"

Corpus cleasing

## convert to lowercase
heat_corpus_clean<-tm_map(heat_corpus,content_transformer(tolower))
## remove numbers
heat_corpus_clean<-tm_map(heat_corpus_clean,content_transformer(removeNumbers))
## remove stop words, i.e., to, or, but, and.
heat_corpus_clean<-tm_map(heat_corpus_clean,removeWords,stopwords())
##remove punctutation, i.e "",.'``
heat_corpus_clean<-tm_map(heat_corpus_clean,removePunctuation)
## apply stemming
heat_corpus_clean<-tm_map(heat_corpus_clean,stemDocument)
## tripe additional whitespaces
heat_corpus_clean<-tm_map(heat_corpus_clean,stripWhitespace)

Data Preparation

Create a document term matrix

heat_dtm<-DocumentTermMatrix(heat_corpus_clean)
## it creates 5538 features
dim(heat_dtm)

## [1] 2819 5538

prepare training and test data set. I select 2000 observations out of 2918 as training set

set.seed(123)
train_sample<-sample(nrow(heat_dtm),size=2000,replace=FALSE)
heat_dtm_train<-heat_dtm[train_sample,]
heat_dtm_test<-heat_dtm[-train_sample,]

prepare training and test data lebels (Heat_Related_Illness=TRUE/FALSE)

heat_train_labels<-rawdata[train_sample,]$Heat_Related_Illness
heat_test_labels<-rawdata[-train_sample,]$Heat_Related_Illness

proportion for the train and test labels

#both set should roughly have the same proportion of ture and false
prop.table(table(heat_train_labels))

## heat_train_labels
## FALSE  TRUE 
## 0.886 0.114

prop.table(table(heat_test_labels))

## heat_test_labels
##     FALSE      TRUE 
## 0.8888889 0.1111111

finding words that appear at least 5 times

heat_freq_words<-findFreqTerms(heat_dtm_train,5)
## preview of most frequent words
str(heat_freq_words)

##  chr [1:1116] "abd" "abdomen" "abdomin" "abdpain" "abl" "abnorm" ...

filtering the DTM (Document Term Matrix) to only contain words with at least 5 occurances

#reducing the features in the DTM
heat_dtm_freq_train<-heat_dtm_train[,heat_freq_words]
heat_dtm_freq_test<-heat_dtm_test[,heat_freq_words]

Random Forest

## word "repeat" is removed since it causes issus (still working on it and havn't figured out yet)
train=as.data.frame(as.matrix(heat_dtm_freq_train))%>%
  select(-"repeat")
colnames(train)=make.names(colnames(train))
test<-as.data.frame(as.matrix(heat_dtm_freq_test))%>%
  select(-"repeat")

fit random forest model

random_forest<-randomForest(heat_train_labels~.,data=train)

predict for the test set

RF_pred<-predict(random_forest,newdata=test,type="prob")[,2]

Confusion Matrix and AUC

confusionMatrix(as.factor(RF_pred>0.5),heat_test_labels,positive="TRUE",dnn=c("predicted","actual"))

## Confusion Matrix and Statistics
## 
##          actual
## predicted FALSE TRUE
##     FALSE   725    9
##     TRUE      3   82
##                                           
##                Accuracy : 0.9853          
##                  95% CI : (0.9745, 0.9924)
##     No Information Rate : 0.8889          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9236          
##  Mcnemar's Test P-Value : 0.1489          
##                                           
##             Sensitivity : 0.9011          
##             Specificity : 0.9959          
##          Pos Pred Value : 0.9647          
##          Neg Pred Value : 0.9877          
##              Prevalence : 0.1111          
##          Detection Rate : 0.1001          
##    Detection Prevalence : 0.1038          
##       Balanced Accuracy : 0.9485          
##                                           
##        'Positive' Class : TRUE            
##

The sensitivity is 90.11% and the specificity is 99.59%, with accuracy 98.53% and precision 96.47.

ROC and AUC

ROCurve<-roc(heat_test_labels,RF_pred)
#plot(ROCurve)
## AUC is 0.971
auc(ROCurve)

## Area under the curve: 0.971