This is the project for the National Syndromic Surveillance Program’s (NSSP) BioSense Platform. Data from this platform are emergency department records. We applied the Random Forest to chief complaints/diagnosis description, and see if it can successfully select the heat related illness.

Libraries

library(tm) ## text mining package, tm_map(), content_transformer()
library(SnowballC) ##used for stemming, stemDocument
library(RColorBrewer) 
library(wordcloud) ## wordcloud generator
library(randomForest) ## random forest
library(caret) ##ConfusionMatrix()
library(pROC) ## ROC and compute AUC
library(dplyr) ## data manipulation

Data Exploration

After removing the empty rows, the dataset has 2819 observations with 2 variables: Heat_Related_Illness (TRUE/FALSE), and CCUpdates (combination of chief complaints and diagnosis description). Some of CCUpdates are very long, so I display here only the first 10 words from CCUpdates for the first 6 rows of the data.

#The data is not allowed to open to public yet
rawdata<-read.csv("heatdata.csv",header=TRUE,na.strings=c("","NA"))
rawdata=rawdata[-1]%>%
  filter(is.na(CCUpdates)==FALSE)

show_data=data.frame(head(rawdata)[,1],gsub("^((\\w+\\W+){9}\\w+).*$","\\1",head(rawdata)$CCUpdates))
colnames(show_data)=c("Heat_Related_Illness","CCUpdates")
print(show_data)
##   Heat_Related_Illness
## 1                 TRUE
## 2                FALSE
## 3                FALSE
## 4                FALSE
## 5                 TRUE
## 6                FALSE
##                                                                             CCUpdates
## 1                                         Pt to ed by EMS; patient was walking in the
## 2                                                    H60.333 Swimmer's ear; bilateral
## 3 Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus
## 4                                                SOB I50.9 Heart failure; unspecified
## 5             dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA
## 6                                     pt reports that he is trying to detox from ETOH

Convert the logical variable to a factor (TRUE/FALSE), and show the count and relative frequency for TRUE/FALSE two levels

rawdata$Heat_Related_Illness=as.factor(rawdata$Heat_Related_Illness)
table(rawdata$Heat_Related_Illness)
## 
## FALSE  TRUE 
##  2500   319
round(prop.table(table(rawdata$Heat_Related_Illness)),2)
## 
## FALSE  TRUE 
##  0.89  0.11

Data Visualization

Corpus Creation and cleasing

prepare a vector source object using VectorSource and supply the vector source to VCorpus

heat_corpus<-VCorpus(VectorSource(rawdata$CCUpdates))
#must use double bracket and as.character() to view a message
lapply(heat_corpus[1:5],as.character)
## $`1`
## [1] "Pt to ed by EMS; patient was walking in the heat when he had a syncopal episode and fell. Denies hitting his head; abrasions to left knee. States he feels dehydrated. R55 Syncope and collapse;T22.211A Burn of second degree of right forearm; initial encounter;T22.212A Burn of second degree of left forearm; initial encounter;T31.0 Burns involving less than 10% of body surface;X19.XXXA Contact with other heat and hot substances; initial encounter;Y92.480 Sidewalk as the place of occurrence of the external cause;T67.9XXA Effect of heat and light; unspecified; initial encounter;X30.XXXA Exposure to excessive natural heat; initial encounter;N17.9 Acute kidney failure; unspecified;D64.9 Anemia; unspecified;E87.6 Hypokalemia;E83.42 Hypomagnesemia;Z87.891 Personal history of nicotine dependence;F10.21 Alcohol dependence; in remission;I10 Essential (primary) hypertension"
## 
## $`2`
## [1] " H60.333 Swimmer's ear; bilateral"
## 
## $`3`
## [1] "Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus;G43.909 Migraine; unspecified; not intractable; without status migrainosus"
## 
## $`4`
## [1] "SOB I50.9 Heart failure; unspecified"
## 
## $`5`
## [1] "dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA EXPOSURE TO OTHER SPECIFIED FACTORS; INITIAL ENCOUNTER;R42 DIZZINESS AND GIDDINESS"

Corpus cleasing

## convert to lowercase
heat_corpus_clean<-tm_map(heat_corpus,content_transformer(tolower))
## remove numbers
heat_corpus_clean<-tm_map(heat_corpus_clean,content_transformer(removeNumbers))
## remove stop words, i.e., to, or, but, and.
heat_corpus_clean<-tm_map(heat_corpus_clean,removeWords,stopwords())
##remove punctutation, i.e "",.'``
heat_corpus_clean<-tm_map(heat_corpus_clean,removePunctuation)
## apply stemming
heat_corpus_clean<-tm_map(heat_corpus_clean,stemDocument)
## tripe additional whitespaces
heat_corpus_clean<-tm_map(heat_corpus_clean,stripWhitespace)

Data Preparation

Create a document term matrix

heat_dtm<-DocumentTermMatrix(heat_corpus_clean)
## it creates 5538 features
dim(heat_dtm)
## [1] 2819 5538

prepare training and test data set. I select 2000 observations out of 2918 as training set

set.seed(123)
train_sample<-sample(nrow(heat_dtm),size=2000,replace=FALSE)
heat_dtm_train<-heat_dtm[train_sample,]
heat_dtm_test<-heat_dtm[-train_sample,]

prepare training and test data lebels (Heat_Related_Illness=TRUE/FALSE)

heat_train_labels<-rawdata[train_sample,]$Heat_Related_Illness
heat_test_labels<-rawdata[-train_sample,]$Heat_Related_Illness

proportion for the train and test labels

#both set should roughly have the same proportion of ture and false
prop.table(table(heat_train_labels))
## heat_train_labels
## FALSE  TRUE 
## 0.886 0.114
prop.table(table(heat_test_labels))
## heat_test_labels
##     FALSE      TRUE 
## 0.8888889 0.1111111

finding words that appear at least 5 times

heat_freq_words<-findFreqTerms(heat_dtm_train,5)
## preview of most frequent words
str(heat_freq_words)
##  chr [1:1116] "abd" "abdomen" "abdomin" "abdpain" "abl" "abnorm" ...

filtering the DTM (Document Term Matrix) to only contain words with at least 5 occurances

#reducing the features in the DTM
heat_dtm_freq_train<-heat_dtm_train[,heat_freq_words]
heat_dtm_freq_test<-heat_dtm_test[,heat_freq_words]

Random Forest

## word "repeat" is removed since it causes issus (still working on it and havn't figured out yet)
train=as.data.frame(as.matrix(heat_dtm_freq_train))%>%
  select(-"repeat")
colnames(train)=make.names(colnames(train))
test<-as.data.frame(as.matrix(heat_dtm_freq_test))%>%
  select(-"repeat")

fit random forest model

random_forest<-randomForest(heat_train_labels~.,data=train)

predict for the test set

RF_pred<-predict(random_forest,newdata=test,type="prob")[,2]

Confusion Matrix and AUC

confusionMatrix(as.factor(RF_pred>0.5),heat_test_labels,positive="TRUE",dnn=c("predicted","actual"))
## Confusion Matrix and Statistics
## 
##          actual
## predicted FALSE TRUE
##     FALSE   725    9
##     TRUE      3   82
##                                           
##                Accuracy : 0.9853          
##                  95% CI : (0.9745, 0.9924)
##     No Information Rate : 0.8889          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9236          
##  Mcnemar's Test P-Value : 0.1489          
##                                           
##             Sensitivity : 0.9011          
##             Specificity : 0.9959          
##          Pos Pred Value : 0.9647          
##          Neg Pred Value : 0.9877          
##              Prevalence : 0.1111          
##          Detection Rate : 0.1001          
##    Detection Prevalence : 0.1038          
##       Balanced Accuracy : 0.9485          
##                                           
##        'Positive' Class : TRUE            
## 

The sensitivity is 90.11% and the specificity is 99.59%, with accuracy 98.53% and precision 96.47.

ROC and AUC

ROCurve<-roc(heat_test_labels,RF_pred)
#plot(ROCurve)
## AUC is 0.971
auc(ROCurve)
## Area under the curve: 0.971