This is the project for the National Syndromic Surveillance Program’s (NSSP) BioSense Platform. Data from this platform are emergency department records. We applied the Random Forest to chief complaints/diagnosis description, and see if it can successfully select the heat related illness.
library(tm) ## text mining package, tm_map(), content_transformer()
library(SnowballC) ##used for stemming, stemDocument
library(RColorBrewer)
library(wordcloud) ## wordcloud generator
library(randomForest) ## random forest
library(caret) ##ConfusionMatrix()
library(pROC) ## ROC and compute AUC
library(dplyr) ## data manipulation
After removing the empty rows, the dataset has 2819 observations with 2 variables: Heat_Related_Illness (TRUE/FALSE), and CCUpdates (combination of chief complaints and diagnosis description). Some of CCUpdates are very long, so I display here only the first 10 words from CCUpdates for the first 6 rows of the data.
#The data is not allowed to open to public yet
rawdata<-read.csv("heatdata.csv",header=TRUE,na.strings=c("","NA"))
rawdata=rawdata[-1]%>%
filter(is.na(CCUpdates)==FALSE)
show_data=data.frame(head(rawdata)[,1],gsub("^((\\w+\\W+){9}\\w+).*$","\\1",head(rawdata)$CCUpdates))
colnames(show_data)=c("Heat_Related_Illness","CCUpdates")
print(show_data)
## Heat_Related_Illness
## 1 TRUE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 TRUE
## 6 FALSE
## CCUpdates
## 1 Pt to ed by EMS; patient was walking in the
## 2 H60.333 Swimmer's ear; bilateral
## 3 Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus
## 4 SOB I50.9 Heart failure; unspecified
## 5 dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA
## 6 pt reports that he is trying to detox from ETOH
Convert the logical variable to a factor (TRUE/FALSE), and show the count and relative frequency for TRUE/FALSE two levels
rawdata$Heat_Related_Illness=as.factor(rawdata$Heat_Related_Illness)
table(rawdata$Heat_Related_Illness)
##
## FALSE TRUE
## 2500 319
round(prop.table(table(rawdata$Heat_Related_Illness)),2)
##
## FALSE TRUE
## 0.89 0.11
heat_corpus<-VCorpus(VectorSource(rawdata$CCUpdates))
#must use double bracket and as.character() to view a message
lapply(heat_corpus[1:5],as.character)
## $`1`
## [1] "Pt to ed by EMS; patient was walking in the heat when he had a syncopal episode and fell. Denies hitting his head; abrasions to left knee. States he feels dehydrated. R55 Syncope and collapse;T22.211A Burn of second degree of right forearm; initial encounter;T22.212A Burn of second degree of left forearm; initial encounter;T31.0 Burns involving less than 10% of body surface;X19.XXXA Contact with other heat and hot substances; initial encounter;Y92.480 Sidewalk as the place of occurrence of the external cause;T67.9XXA Effect of heat and light; unspecified; initial encounter;X30.XXXA Exposure to excessive natural heat; initial encounter;N17.9 Acute kidney failure; unspecified;D64.9 Anemia; unspecified;E87.6 Hypokalemia;E83.42 Hypomagnesemia;Z87.891 Personal history of nicotine dependence;F10.21 Alcohol dependence; in remission;I10 Essential (primary) hypertension"
##
## $`2`
## [1] " H60.333 Swimmer's ear; bilateral"
##
## $`3`
## [1] "Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus;G43.909 Migraine; unspecified; not intractable; without status migrainosus"
##
## $`4`
## [1] "SOB I50.9 Heart failure; unspecified"
##
## $`5`
## [1] "dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA EXPOSURE TO OTHER SPECIFIED FACTORS; INITIAL ENCOUNTER;R42 DIZZINESS AND GIDDINESS"
## convert to lowercase
heat_corpus_clean<-tm_map(heat_corpus,content_transformer(tolower))
## remove numbers
heat_corpus_clean<-tm_map(heat_corpus_clean,content_transformer(removeNumbers))
## remove stop words, i.e., to, or, but, and.
heat_corpus_clean<-tm_map(heat_corpus_clean,removeWords,stopwords())
##remove punctutation, i.e "",.'``
heat_corpus_clean<-tm_map(heat_corpus_clean,removePunctuation)
## apply stemming
heat_corpus_clean<-tm_map(heat_corpus_clean,stemDocument)
## tripe additional whitespaces
heat_corpus_clean<-tm_map(heat_corpus_clean,stripWhitespace)
heat_dtm<-DocumentTermMatrix(heat_corpus_clean)
## it creates 5538 features
dim(heat_dtm)
## [1] 2819 5538
prepare training and test data set. I select 2000 observations out of 2918 as training set
set.seed(123)
train_sample<-sample(nrow(heat_dtm),size=2000,replace=FALSE)
heat_dtm_train<-heat_dtm[train_sample,]
heat_dtm_test<-heat_dtm[-train_sample,]
prepare training and test data lebels (Heat_Related_Illness=TRUE/FALSE)
heat_train_labels<-rawdata[train_sample,]$Heat_Related_Illness
heat_test_labels<-rawdata[-train_sample,]$Heat_Related_Illness
proportion for the train and test labels
#both set should roughly have the same proportion of ture and false
prop.table(table(heat_train_labels))
## heat_train_labels
## FALSE TRUE
## 0.886 0.114
prop.table(table(heat_test_labels))
## heat_test_labels
## FALSE TRUE
## 0.8888889 0.1111111
heat_freq_words<-findFreqTerms(heat_dtm_train,5)
## preview of most frequent words
str(heat_freq_words)
## chr [1:1116] "abd" "abdomen" "abdomin" "abdpain" "abl" "abnorm" ...
#reducing the features in the DTM
heat_dtm_freq_train<-heat_dtm_train[,heat_freq_words]
heat_dtm_freq_test<-heat_dtm_test[,heat_freq_words]
## word "repeat" is removed since it causes issus (still working on it and havn't figured out yet)
train=as.data.frame(as.matrix(heat_dtm_freq_train))%>%
select(-"repeat")
colnames(train)=make.names(colnames(train))
test<-as.data.frame(as.matrix(heat_dtm_freq_test))%>%
select(-"repeat")
random_forest<-randomForest(heat_train_labels~.,data=train)
RF_pred<-predict(random_forest,newdata=test,type="prob")[,2]
confusionMatrix(as.factor(RF_pred>0.5),heat_test_labels,positive="TRUE",dnn=c("predicted","actual"))
## Confusion Matrix and Statistics
##
## actual
## predicted FALSE TRUE
## FALSE 725 9
## TRUE 3 82
##
## Accuracy : 0.9853
## 95% CI : (0.9745, 0.9924)
## No Information Rate : 0.8889
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9236
## Mcnemar's Test P-Value : 0.1489
##
## Sensitivity : 0.9011
## Specificity : 0.9959
## Pos Pred Value : 0.9647
## Neg Pred Value : 0.9877
## Prevalence : 0.1111
## Detection Rate : 0.1001
## Detection Prevalence : 0.1038
## Balanced Accuracy : 0.9485
##
## 'Positive' Class : TRUE
##
The sensitivity is 90.11% and the specificity is 99.59%, with accuracy 98.53% and precision 96.47.
ROCurve<-roc(heat_test_labels,RF_pred)
#plot(ROCurve)
## AUC is 0.971
auc(ROCurve)
## Area under the curve: 0.971