This is the project for the National Syndromic Surveillance Program’s (NSSP) BioSense Platform. Data from this platform are emergency department records. We applied the Naive Bayes Classifier to chief complaints/diagnosis description, and see if it can successfully select the heat related illness.
library(tm) ## text mining package, tm_map(), content_transformer()
library(SnowballC) ##used for stemming, stemDocument
library(RColorBrewer)
library(wordcloud) ## wordcloud generator
library(e1071) ## Naive Bayes
library(caret) ##ConfusionMatrix()
library(pROC) ## ROC and compute AUC
library(dplyr) ## data manipulation
After removing the empty rows, the dataset has 2819 observations with 2 variables: Heat_Related_Illness (TRUE/FALSE), and CCUpdates (combination of chief complaints and diagnosis description). Some of CCUpdates are very long, so I display here only the first 10 words from CCUpdates for the first 6 rows of the data.
#The data is not allowed to open to public yet
rawdata<-read.csv("heatdata.csv",header=TRUE,na.strings=c("","NA"))
rawdata=rawdata[-1]%>%
filter(is.na(CCUpdates)==FALSE)
show_data=data.frame(head(rawdata)[,1],gsub("^((\\w+\\W+){9}\\w+).*$","\\1",head(rawdata)$CCUpdates))
colnames(show_data)=c("Heat_Related_Illness","CCUpdates")
print(show_data)
## Heat_Related_Illness
## 1 TRUE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 TRUE
## 6 FALSE
## CCUpdates
## 1 Pt to ed by EMS; patient was walking in the
## 2 H60.333 Swimmer's ear; bilateral
## 3 Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus
## 4 SOB I50.9 Heart failure; unspecified
## 5 dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA
## 6 pt reports that he is trying to detox from ETOH
Convert the logical variable to a factor (TRUE/FALSE), and show the count and relative frequency for TRUE/FALSE two levels
rawdata$Heat_Related_Illness=as.factor(rawdata$Heat_Related_Illness)
table(rawdata$Heat_Related_Illness)
##
## FALSE TRUE
## 2500 319
round(prop.table(table(rawdata$Heat_Related_Illness)),2)
##
## FALSE TRUE
## 0.89 0.11
heat_corpus<-VCorpus(VectorSource(rawdata$CCUpdates))
#must use double bracket and as.character() to view a message
lapply(heat_corpus[1:5],as.character)
## $`1`
## [1] "Pt to ed by EMS; patient was walking in the heat when he had a syncopal episode and fell. Denies hitting his head; abrasions to left knee. States he feels dehydrated. R55 Syncope and collapse;T22.211A Burn of second degree of right forearm; initial encounter;T22.212A Burn of second degree of left forearm; initial encounter;T31.0 Burns involving less than 10% of body surface;X19.XXXA Contact with other heat and hot substances; initial encounter;Y92.480 Sidewalk as the place of occurrence of the external cause;T67.9XXA Effect of heat and light; unspecified; initial encounter;X30.XXXA Exposure to excessive natural heat; initial encounter;N17.9 Acute kidney failure; unspecified;D64.9 Anemia; unspecified;E87.6 Hypokalemia;E83.42 Hypomagnesemia;Z87.891 Personal history of nicotine dependence;F10.21 Alcohol dependence; in remission;I10 Essential (primary) hypertension"
##
## $`2`
## [1] " H60.333 Swimmer's ear; bilateral"
##
## $`3`
## [1] "Migraine G43.909 Migraine; unspecified; not intractable; without status migrainosus;G43.909 Migraine; unspecified; not intractable; without status migrainosus"
##
## $`4`
## [1] "SOB I50.9 Heart failure; unspecified"
##
## $`5`
## [1] "dizzy T67.5XXA HEAT EXHAUSTION; UNSPECIFIED; INITIAL ENCOUNTER;X58.XXXA EXPOSURE TO OTHER SPECIFIED FACTORS; INITIAL ENCOUNTER;R42 DIZZINESS AND GIDDINESS"
## convert to lowercase
heat_corpus_clean<-tm_map(heat_corpus,content_transformer(tolower))
## remove numbers
heat_corpus_clean<-tm_map(heat_corpus_clean,content_transformer(removeNumbers))
## remove stop words, i.e., to, or, but, and.
heat_corpus_clean<-tm_map(heat_corpus_clean,removeWords,stopwords())
##remove punctutation, i.e "",.'``
heat_corpus_clean<-tm_map(heat_corpus_clean,removePunctuation)
## apply stemming
heat_corpus_clean<-tm_map(heat_corpus_clean,stemDocument)
## tripe additional whitespaces
heat_corpus_clean<-tm_map(heat_corpus_clean,stripWhitespace)
heat_dtm<-DocumentTermMatrix(heat_corpus_clean)
## it creates 5538 features
dim(heat_dtm)
## [1] 2819 5538
prepare training and test data set. I select 2000 observations out of 2918 as training set
set.seed(123)
train_sample<-sample(nrow(heat_dtm),size=2000,replace=FALSE)
heat_dtm_train<-heat_dtm[train_sample,]
heat_dtm_test<-heat_dtm[-train_sample,]
prepare training and test data lebels (Heat_Related_Illness=TRUE/FALSE)
heat_train_labels<-rawdata[train_sample,]$Heat_Related_Illness
heat_test_labels<-rawdata[-train_sample,]$Heat_Related_Illness
proportion for the train and test labels
#both set should roughly have the same proportion of ture and false
prop.table(table(heat_train_labels))
## heat_train_labels
## FALSE TRUE
## 0.886 0.114
prop.table(table(heat_test_labels))
## heat_test_labels
## FALSE TRUE
## 0.8888889 0.1111111
heat_freq_words<-findFreqTerms(heat_dtm_train,5)
## preview of most frequent words
str(heat_freq_words)
## chr [1:1116] "abd" "abdomen" "abdomin" "abdpain" "abl" "abnorm" ...
#reducing the features in the DTM
heat_dtm_freq_train<-heat_dtm_train[,heat_freq_words]
heat_dtm_freq_test<-heat_dtm_test[,heat_freq_words]
#first convert to a categorical varialbe (Yes/No) so that Naive Bayes could recognize
convert_counts <- function(x){
x <- ifelse(x > 0, "Yes", "No")
}
#apply to train and test reduced DTMs, applying to column
heat_train <- apply(heat_dtm_freq_train, MARGIN = 2, convert_counts)
heat_test <- apply(heat_dtm_freq_test, MARGIN = 2, convert_counts)
nb_classifier<-naiveBayes(heat_train,heat_train_labels,laplace=0)
nb_pred<-predict(nb_classifier,heat_test)
confusionMatrix(nb_pred,heat_test_labels,positive="TRUE",dnn=c("predicted","actual"))
## Confusion Matrix and Statistics
##
## actual
## predicted FALSE TRUE
## FALSE 708 8
## TRUE 20 83
##
## Accuracy : 0.9658
## 95% CI : (0.951, 0.9772)
## No Information Rate : 0.8889
## P-Value [Acc > NIR] : 7.025e-16
##
## Kappa : 0.8364
## Mcnemar's Test P-Value : 0.03764
##
## Sensitivity : 0.9121
## Specificity : 0.9725
## Pos Pred Value : 0.8058
## Neg Pred Value : 0.9888
## Prevalence : 0.1111
## Detection Rate : 0.1013
## Detection Prevalence : 0.1258
## Balanced Accuracy : 0.9423
##
## 'Positive' Class : TRUE
##
The sensitivity is 91.21% and the specificity is 97.25%, with accuracy 96.58% and precision 80.58%.
ROC and AUC
ROCurve<-roc(heat_test_labels,as.numeric(nb_pred))
#plot(ROCurve)
## We got AUC of 0.9423
auc(ROCurve)
## Area under the curve: 0.9423