This project builds on the Medical Appointment No Shows dataset from Kaggle, which was collected from medical appointments in Brazil.
https://www.kaggle.com/joniarroba/noshowappointments
Reducing or accurately predicting appointment no-shows is valuable, since optimized scheduling can greatly improve the allocation of scarce physician time. Particularly in a developing country like Brazil, where qualified professionals are in short supply, better medical scheduling can lower healthcare costs, broaden access to care, and even save lives.
The goal of this assignment is to predict whether or not a patient will be a “no-show”, based on their medical profile.
Before we begin, we’ll load the necessary files and libraries into the environment.
# Load files and libraries
set.seed(1234)
noshow <- read.csv("noshow.csv")
summary(noshow)
## Age Gender AppointmentRegistration
## Min. : -2.00 F:200505 2015-04-15T14:19:34Z: 9
## 1st Qu.: 19.00 M: 99495 2014-03-19T17:10:21Z: 8
## Median : 38.00 2014-07-03T08:50:50Z: 8
## Mean : 37.81 2015-05-27T13:57:09Z: 8
## 3rd Qu.: 56.00 2013-12-23T12:40:00Z: 7
## Max. :113.00 2014-01-07T15:42:57Z: 7
## (Other) :299953
## ApointmentData DayOfTheWeek Status
## 2014-10-22T00:00:00Z: 759 Friday :52771 No-Show: 90731
## 2014-09-03T00:00:00Z: 756 Monday :59298 Show-Up:209269
## 2014-11-24T00:00:00Z: 752 Saturday : 1393
## 2014-11-19T00:00:00Z: 742 Sunday : 6
## 2015-05-25T00:00:00Z: 741 Thursday :60262
## 2014-09-29T00:00:00Z: 739 Tuesday :62775
## (Other) :295511 Wednesday:63495
## Diabetes Alcoolism HiperTension Handcap
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.07797 Mean :0.02501 Mean :0.2159 Mean :0.02052
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :4.00000
##
## Smokes Scholarship Tuberculosis Sms_Reminder
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.05237 Mean :0.0969 Mean :0.00045 Mean :0.5742
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :2.0000
##
## AwaitingTime
## Min. :-398.00
## 1st Qu.: -20.00
## Median : -8.00
## Mean : -13.84
## 3rd Qu.: -4.00
## Max. : -1.00
##
library(boot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(OneR)
## Warning: package 'OneR' was built under R version 3.3.3
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ade4)
## Warning: package 'ade4' was built under R version 3.3.3
# Create a new variable for advance notice: whole days between when the
# appointment was registered and when it took place. The difference is taken
# in hours, shifted by half a day, and divided by 24; because the result stays
# a difftime, its labels print as "hours" in the tables below even though the
# values are days.
noshow$AppointmentRegistration <- as.POSIXct(strptime(noshow$AppointmentRegistration, "%Y-%m-%dT%H:%M:%SZ"))
noshow$ApointmentData <- as.POSIXct(strptime(noshow$ApointmentData, "%Y-%m-%dT%H:%M:%SZ"))
noshow$AdvanceNotice <- ceiling((difftime(noshow$ApointmentData, noshow$AppointmentRegistration, units = "hours") - 12) / 24)
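As a quick sanity check, AdvanceNotice should roughly mirror the negated AwaitingTime column, which (judging from its summary above) records the days until the appointment as a negative number:
# Sanity check (a sketch; assumes AwaitingTime is the negative day count until
# the appointment, as its summary above suggests): the two should roughly cancel.
summary(as.numeric(noshow$AdvanceNotice) + noshow$AwaitingTime)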
# Split into training and test set
inTrain <- createDataPartition(y=noshow$Age, p=0.9, list=FALSE)
training <- noshow[inTrain,]
testing <- noshow[-inTrain,]
dim(training)
## [1] 270002 16
dim(testing)
## [1] 29998 16
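As an aside, the split above stratifies on Age; stratifying on the outcome instead would keep the No-Show/Show-Up proportions identical across the two sets. A minimal sketch (the inTrainStratified index is illustrative and unused below):
# Alternative split (sketch, unused below): stratify on the outcome so that
# class proportions match exactly between the training and test sets.
inTrainStratified <- createDataPartition(y=noshow$Status, p=0.9, list=FALSE)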
First, let’s examine some of the data’s characteristics.
# Check NA counts for each column
naCount <- data.frame(colName = names(training),
                      naCount = as.vector(apply(training, MARGIN=2, FUN=function(x){ sum(is.na(x)) })))
# Proportion of each Status within each (binned) variable
apply(training[,!(names(training) %in% c("Age", "AwaitingTime", "AppointmentRegistration", "ApointmentData"))],
      MARGIN=2,
      FUN=function(x, tmpStatus){ round(prop.table(table(bin(x, method = "content"), tmpStatus), 1), 3) },
      tmpStatus = training$Status)
## $Gender
## tmpStatus
## No-Show Show-Up
## F 0.299 0.701
## M 0.310 0.690
##
## $DayOfTheWeek
## tmpStatus
## No-Show Show-Up
## Friday 0.309 0.691
## Monday 0.322 0.678
## Saturday 0.357 0.643
## Sunday 0.167 0.833
## Thursday 0.297 0.703
## Tuesday 0.289 0.711
## Wednesday 0.298 0.702
##
## $Status
## tmpStatus
## No-Show Show-Up
## No-Show 1 0
## Show-Up 0 1
##
## $Diabetes
## tmpStatus
## No-Show Show-Up
## 0 0.307 0.693
## 1 0.254 0.746
##
## $Alcoolism
## tmpStatus
## No-Show Show-Up
## 0 0.301 0.699
## 1 0.375 0.625
##
## $HiperTension
## tmpStatus
## No-Show Show-Up
## 0 0.317 0.683
## 1 0.252 0.748
##
## $Handcap
## tmpStatus
## No-Show Show-Up
## 0 0.303 0.697
## 1 0.278 0.722
## 2 0.289 0.711
## 3 0.257 0.743
## 4 0.200 0.800
##
## $Smokes
## tmpStatus
## No-Show Show-Up
## 0 0.30 0.70
## 1 0.35 0.65
##
## $Scholarship
## tmpStatus
## No-Show Show-Up
## 0 0.297 0.703
## 1 0.360 0.640
##
## $Tuberculosis
## tmpStatus
## No-Show Show-Up
## 0 0.303 0.697
## 1 0.388 0.612
##
## $Sms_Reminder
## tmpStatus
## No-Show Show-Up
## 0 0.303 0.697
## 1 0.303 0.697
## 2 0.339 0.661
##
## $AdvanceNotice
## tmpStatus
## No-Show Show-Up
## 0 hours 0.228 0.772
## 1 hours 0.242 0.758
## 2 hours 0.269 0.731
## 3 hours 0.249 0.751
## 4 hours 0.255 0.745
## 5 hours 0.283 0.717
## 6 hours 0.296 0.704
## 7 hours 0.303 0.697
## 8 hours 0.300 0.700
## 9 hours 0.307 0.693
## 10 hours 0.335 0.665
## 11 hours 0.334 0.666
## 12 hours 0.344 0.656
## 13 hours 0.341 0.659
## 14 hours 0.333 0.667
## 15 hours 0.328 0.672
## 16 hours 0.346 0.654
## 17 hours 0.336 0.664
## 18 hours 0.363 0.637
## 19 hours 0.356 0.644
## 20 hours 0.347 0.653
## 21 hours 0.343 0.657
## 22 hours 0.352 0.648
## 23 hours 0.344 0.656
## 24 hours 0.369 0.631
## 25 hours 0.376 0.624
## 26 hours 0.348 0.652
## 27 hours 0.335 0.665
## 28 hours 0.344 0.656
## 29 hours 0.331 0.669
## 30 hours 0.332 0.668
## 31 hours 0.340 0.660
## 32 hours 0.351 0.649
## 33 hours 0.362 0.638
## 34 hours 0.352 0.648
## 35 hours 0.364 0.636
## 36 hours 0.339 0.661
## 37 hours 0.348 0.652
## 38 hours 0.385 0.615
## 39 hours 0.382 0.618
## 40 hours 0.351 0.649
## 41 hours 0.375 0.625
## 42 hours 0.355 0.645
## 43 hours 0.357 0.643
## 44 hours 0.367 0.633
## 45 hours 0.386 0.614
## 46 hours 0.375 0.625
## 47 hours 0.403 0.597
## 48 hours 0.357 0.643
## 49 hours 0.355 0.645
## 50 hours 0.377 0.623
## 51 hours 0.407 0.593
## 52 hours 0.363 0.637
## 53 hours 0.375 0.625
## 54 hours 0.403 0.597
## 55 hours 0.354 0.646
## 56 hours 0.338 0.662
## 57 hours 0.315 0.685
## 58 hours 0.428 0.572
## 59 hours 0.373 0.627
## 60 hours 0.337 0.663
## 61 hours 0.329 0.671
## 62 hours 0.370 0.630
## 63 hours 0.366 0.634
## 64 hours 0.389 0.611
## 65 hours 0.323 0.677
## 66 hours 0.411 0.589
## 67 hours 0.423 0.577
## 68 hours 0.382 0.618
## 69 hours 0.385 0.615
## 70 hours 0.331 0.669
## 71 hours 0.346 0.654
## 72 hours 0.379 0.621
## 73 hours 0.368 0.632
## 74 hours 0.474 0.526
## 75 hours 0.374 0.626
## 76 hours 0.324 0.676
## 77 hours 0.331 0.669
## 78 hours 0.339 0.661
## 79 hours 0.310 0.690
## 80 hours 0.354 0.646
## 81 hours 0.511 0.489
## 82 hours 0.351 0.649
## 83 hours 0.333 0.667
## 84 hours 0.310 0.690
## 85 hours 0.211 0.789
## 86 hours 0.306 0.694
## 87 hours 0.263 0.737
## 88 hours 0.326 0.674
## 89 hours 0.404 0.596
## 90 hours 0.359 0.641
## 91 hours 0.310 0.690
## 92 hours 0.315 0.685
## 93 hours 0.275 0.725
## 94 hours 0.361 0.639
## 95 hours 0.195 0.805
## 96 hours 0.279 0.721
## 97 hours 0.289 0.711
## 98 hours 0.375 0.625
## 99 hours 0.368 0.632
## 100 hours 0.200 0.800
## 101 hours 0.211 0.789
## 102 hours 0.333 0.667
## 103 hours 0.262 0.738
## 104 hours 0.333 0.667
## 105 hours 0.216 0.784
## 106 hours 0.481 0.519
## 107 hours 0.333 0.667
## 108 hours 0.118 0.882
## 109 hours 0.357 0.643
## 110 hours 0.250 0.750
## 111 hours 0.286 0.714
## 112 hours 0.333 0.667
## 113 hours 0.100 0.900
## 114 hours 0.429 0.571
## 115 hours 0.429 0.571
## 116 hours 0.167 0.833
## 117 hours 0.300 0.700
## 118 hours 0.438 0.562
## 119 hours 0.000 1.000
## 120 hours 1.000 0.000
## 121 hours 0.000 1.000
## 123 hours 0.400 0.600
## 124 hours 0.500 0.500
## 125 hours 0.750 0.250
## 126 hours 0.438 0.562
## 127 hours 0.400 0.600
## 128 hours 0.500 0.500
## 129 hours 0.500 0.500
## 130 hours 0.667 0.333
## 132 hours 1.000 0.000
## 133 hours 0.333 0.667
## 134 hours 0.333 0.667
## 139 hours 1.000 0.000
## 140 hours 0.167 0.833
## 142 hours 1.000 0.000
## 145 hours 0.000 1.000
## 146 hours 1.000 0.000
## 147 hours 0.333 0.667
## 149 hours 0.000 1.000
## 152 hours 0.000 1.000
## 154 hours 0.000 1.000
## 156 hours 0.000 1.000
## 160 hours 0.000 1.000
## 161 hours 0.000 1.000
## 162 hours 0.000 1.000
## 163 hours 0.000 1.000
## 168 hours 0.250 0.750
## 170 hours 0.250 0.750
## 172 hours 0.000 1.000
## 173 hours 0.667 0.333
## 174 hours 0.500 0.500
## 175 hours 0.429 0.571
## 176 hours 0.000 1.000
## 182 hours 0.273 0.727
## 184 hours 0.000 1.000
## 186 hours 1.000 0.000
## 189 hours 1.000 0.000
## 195 hours 0.000 1.000
## 202 hours 0.200 0.800
## 205 hours 1.000 0.000
## 206 hours 0.000 1.000
## 210 hours 0.000 1.000
## 216 hours 0.000 1.000
## 217 hours 0.333 0.667
## 224 hours 0.250 0.750
## 225 hours 0.000 1.000
## 226 hours 0.000 1.000
## 230 hours 0.000 1.000
## 231 hours 0.500 0.500
## 232 hours 0.400 0.600
## 237 hours 0.000 1.000
## 238 hours 0.286 0.714
## 245 hours 0.000 1.000
## 249 hours 0.000 1.000
## 252 hours 1.000 0.000
## 253 hours 0.000 1.000
## 257 hours 0.500 0.500
## 258 hours 0.500 0.500
## 261 hours 0.500 0.500
## 265 hours 0.500 0.500
## 266 hours 0.000 1.000
## 272 hours 0.500 0.500
## 274 hours 0.000 1.000
## 277 hours 0.000 1.000
## 278 hours 0.000 1.000
## 280 hours 0.500 0.500
## 281 hours 0.333 0.667
## 287 hours 0.333 0.667
## 294 hours 0.000 1.000
## 296 hours 0.000 1.000
## 299 hours 1.000 0.000
## 300 hours 0.000 1.000
## 301 hours 0.750 0.250
## 302 hours 1.000 0.000
## 303 hours 0.000 1.000
## 313 hours 0.000 1.000
## 314 hours 0.000 1.000
## 320 hours 0.000 1.000
## 321 hours 1.000 0.000
## 322 hours 0.000 1.000
## 329 hours 1.000 0.000
## 332 hours 0.667 0.333
## 333 hours 1.000 0.000
## 334 hours 0.000 1.000
## 343 hours 0.000 1.000
## 348 hours 1.000 0.000
## 350 hours 0.000 1.000
## 397 hours 1.000 0.000
Now let’s use more visual methods to evaluate each variable’s impact on appointment status.
# No-show by age
plot(aggregate(training$Status, by=list(Age=training$Age), function(x){ length(x[x %in% "No-Show"]) / length(x) }))
abline(h=mean(training$Status %in% "No-Show"))
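To make the age trend easier to read, a smoother can be layered onto the scatter (a sketch; ageRate is a hypothetical helper variable):
# Overlay a lowess smoother on the per-age no-show rates (sketch).
ageRate <- aggregate(training$Status, by=list(Age=training$Age), function(x){ mean(x %in% "No-Show") })
lines(lowess(ageRate$Age, ageRate$x), col="red")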
# Aggregate no-show ratios by week of registration, z-scored for comparability
normalizedAppointmentRegistration <- aggregate(training$Status, by=list(floor_date(training$AppointmentRegistration, "week")), function(x){ length(x[x %in% "No-Show"]) / length(x) })
normalizedAppointmentRegistration[,2] <- (normalizedAppointmentRegistration[,2] - mean(normalizedAppointmentRegistration[,2]))/sd(normalizedAppointmentRegistration[,2])
plot(normalizedAppointmentRegistration, ylim=c(-3,3))
# And by month of appointment, likewise z-scored
normalizedApointmentData <- aggregate(training$Status, by=list(floor_date(training$ApointmentData, "month")), function(x){ length(x[x %in% "No-Show"]) / length(x) })
normalizedApointmentData[,2] <- (normalizedApointmentData[,2] - mean(normalizedApointmentData[,2]))/sd(normalizedApointmentData[,2])
plot(normalizedApointmentData, ylim=c(-3,3))
# Plot variability of AppointmentRegistration by weekday
plot(aggregate(training$Status, by=list(as.factor(weekdays(training$AppointmentRegistration))), function(x){ length(x[x %in% "No-Show"]) / length(x) }))
# Normalized Weekday Plot
normAR <- aggregate(training$Status, by=list(as.factor(weekdays(training$AppointmentRegistration))), function(x){ length(x[x %in% "No-Show"]) / length(x) })
normAR[,2] <- (normAR[,2] - mean(normAR[,2])) / sd(normAR[,2])
plot(normAR)
# Plot variability of ApointmentData by weekday
plot(aggregate(training$Status, by=list(as.factor(weekdays(training$ApointmentData))), function(x){ length(x[x %in% "No-Show"]) / length(x) }))
# Normalized weekday plot
normAD <- aggregate(training$Status, by=list(as.factor(weekdays(training$ApointmentData))), function(x){ length(x[x %in% "No-Show"]) / length(x) })
normAD[,2] <- (normAD[,2] - mean(normAD[,2])) / sd(normAD[,2])
plot(normAD)
# Plot variability by month. Helper to order rows by calendar month:
sort.month <- function(x, dataframe = NULL, abbreviated = FALSE){
  y <- data.frame(m1 = month.name, m2 = month.abb, n = 1:12)
  z <- if (abbreviated) match(x, y[, "m2"]) else match(x, y[, "m1"])
  x <- if (is.null(dataframe)) x else dataframe
  h <- data.frame(z, x)
  h[order(z), ][, -1]
}
# Normalized monthly plot for AppointmentRegistration
byMonth <- aggregate(training$Status, by=list(as.factor(months(training$AppointmentRegistration))), function(x){ length(x[x %in% "No-Show"]) / length(x) })
names(byMonth) <- c("monthName", "NoShowRate")
byMonth$NoShowRate <- (byMonth$NoShowRate - mean(byMonth$NoShowRate)) / sd(byMonth$NoShowRate)
byMonth <- sort.month(byMonth$monthName, byMonth)
plot(byMonth)
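As an aside, the helper isn't strictly necessary: month.name is already in calendar order, so the same ordering can be expressed directly (a one-line sketch):
# Equivalent ordering without the helper, via the built-in month.name vector.
byMonth[order(match(byMonth$monthName, month.name)), ]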
# Normalized monthly plot for ApointmentData
byMonth <- aggregate(training$Status, by=list(as.factor(months(training$ApointmentData))), function(x){ length(x[x %in% "No-Show"]) / length(x) })
names(byMonth) <- c("monthName", "NoShowRate")
byMonth$NoShowRate <- (byMonth$NoShowRate - mean(byMonth$NoShowRate)) / sd(byMonth$NoShowRate)
byMonth <- sort.month(byMonth$monthName, byMonth)
plot(byMonth)
# Normalized plot by AwaitingTime (negative day counts, floored to whole days)
byAwaitingTime <- aggregate(training$Status, by=list(as.factor(floor(training$AwaitingTime))), function(x){ length(x[x %in% "No-Show"]) / length(x) })
names(byAwaitingTime) <- c("awaitingDays", "NoShowRate")
byAwaitingTime$NoShowRate <- (byAwaitingTime$NoShowRate - mean(byAwaitingTime$NoShowRate)) / sd(byAwaitingTime$NoShowRate)
plot(byAwaitingTime)
# And just for good measure, let's compute a correlation matrix for the numeric variables
training$AppointmentRegistrationTimestamp <- as.numeric(training$AppointmentRegistration)
training$ApointmentDataTimestamp <- as.numeric(training$ApointmentData)
cor(training[,names(training) %in% c("Age", "AwaitingTime", "ApointmentDataTimestamp", "AppointmentRegistrationTimestamp")])
## Age AwaitingTime
## Age 1.000000000 0.003880146
## AwaitingTime 0.003880146 1.000000000
## AppointmentRegistrationTimestamp 0.003597929 0.014286400
## ApointmentDataTimestamp 0.003310029 -0.061236447
## AppointmentRegistrationTimestamp
## Age 0.003597929
## AwaitingTime 0.014286400
## AppointmentRegistrationTimestamp 1.000000000
## ApointmentDataTimestamp 0.997146364
## ApointmentDataTimestamp
## Age 0.003310029
## AwaitingTime -0.061236447
## AppointmentRegistrationTimestamp 0.997146364
## ApointmentDataTimestamp 1.000000000
Now that we know which variables are most influential, we can build the training and testing feature sets, applying identical transformations to each. Note from the correlation matrix that the two timestamps are nearly collinear (r ≈ 0.997), so only the registration timestamp is carried forward.
One thing you’ll notice is that both Naive Bayes and the decision tree do a very poor job of classifying the appointments: neither model predicts a No-Show for even a single patient.
We get this result because no patient profile carries more than a 50% chance of being a No-Show. That is natural, since nearly all of these variables reflect permanent states of being that have no exclusive relationship with the current appointment alone. It also explains why the decision tree ends up with only a root node.
The current approach is like using someone’s hair color and zip code to determine whether they are hungry. It is extremely rare for a patient to keep making appointments without ever showing up for any of them. To predictively classify a No-Show, we would need more data about the patient’s history, along with information about competing priorities at the time of the appointment.
A better approach is a regression model that predicts the probability of a No-Show for any given patient.
# Break AppointmentRegistration down by month (one-hot dummy columns)
monthAD <- as.factor(months(training$AppointmentRegistration))
monthAD <- acm.disjonctif(as.data.frame(monthAD))
# Break AppointmentRegistration down by weekday (one-hot dummy columns)
weekdayAD <- as.factor(weekdays(training$AppointmentRegistration))
weekdayAD <- acm.disjonctif(as.data.frame(weekdayAD))
dim(weekdayAD)
## [1] 270002 7
# HiLoAge indicator: 1 for patients older than 42
training$HiLoAge <- 0
training$HiLoAge[training$Age > 42] <- 1
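A data-driven cut point could be used instead of the hard-coded 42. A minimal sketch (HiLoAgeMedian is a hypothetical column, not used below):
# Hypothetical alternative: split at the training median age rather than 42.
training$HiLoAgeMedian <- as.integer(training$Age > median(training$Age))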
# Combine columns
training <- cbind(training, monthAD, weekdayAD)
# Check column names
# names(training)
trainingCondensed <- training[,names(training) %in% c("Age", "HiLoAge", "AppointmentRegistrationTimestamp", "Gender", "weekdayAD.Saturday", "weekdayAD.Sunday", "monthAD.January", "monthAD.December", "monthAD.June", "Diabetes", "Alcoolism", "Handcap", "Tuberculosis", "HiperTension", "Status")]
# names(trainingCondensed)
dim(trainingCondensed)
## [1] 270002 15
# ###############################################
# Break AppointmentRegistration down by month (one-hot dummy columns)
monthAD <- as.factor(months(testing$AppointmentRegistration))
monthAD <- acm.disjonctif(as.data.frame(monthAD))
# Break AppointmentRegistration down by weekday (one-hot dummy columns)
weekdayAD <- as.factor(weekdays(testing$AppointmentRegistration))
weekdayAD <- acm.disjonctif(as.data.frame(weekdayAD))
dim(weekdayAD)
## [1] 29998 6
# The test split contains no Sunday registrations (hence only 6 weekday dummies
# above), so add the missing Sunday column manually
testing$weekdayAD.Sunday <- 0
# HiLoAge indicator: 1 for patients older than 42
testing$HiLoAge <- 0
testing$HiLoAge[testing$Age > 42] <- 1
# Combine columns
testing <- cbind(testing, monthAD, weekdayAD)
# Check column names
# names(testing)
testingCondensed <- testing[,names(testing) %in% c("Age", "HiLoAge", "AppointmentRegistrationTimestamp", "Gender", "weekdayAD.Saturday", "weekdayAD.Sunday", "monthAD.January", "monthAD.December", "monthAD.June", "Diabetes", "Alcoolism", "Handcap", "Tuberculosis", "HiperTension", "Status")]
# names(testingCondensed)
dim(testingCondensed)
## [1] 29998 14
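Incidentally, the column-count mismatch above (15 in trainingCondensed vs. 14 in testingCondensed) arises because AppointmentRegistrationTimestamp was added only to the training copy. Factoring the shared feature engineering into one helper would prevent this kind of drift; a minimal sketch, with addFeatures as a hypothetical name:
# Sketch of a shared feature-engineering helper so the training and testing
# pipelines cannot drift apart. Missing factor levels (e.g. no Sunday
# registrations in the test split) would still need manual handling.
addFeatures <- function(df) {
  monthAD <- acm.disjonctif(data.frame(monthAD = as.factor(months(df$AppointmentRegistration))))
  weekdayAD <- acm.disjonctif(data.frame(weekdayAD = as.factor(weekdays(df$AppointmentRegistration))))
  df$HiLoAge <- as.integer(df$Age > 42)
  df$AppointmentRegistrationTimestamp <- as.numeric(df$AppointmentRegistration)
  cbind(df, monthAD, weekdayAD)
}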
Since these classification models are expensive to fit, training can take a very long time. Because of this, we’ve opted to save the fitted models to files and load them as needed.
# Plot a classification tree model
rpartFit <- train(Status ~ Gender + Diabetes + HiperTension + HiLoAge + monthAD.December + monthAD.January + monthAD.June + weekdayAD.Saturday + weekdayAD.Sunday, method= "rpart",data=trainingCondensed)
print(rpartFit)
plot(rpartFit$finalModel, uniform=TRUE, main="Classification Tree")
text(rpartFit$finalModel, use.n=TRUE, cex=0.8)
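For a more legible rendering of the same tree, the rpart.plot package could be used (an optional aside; assumes the package is installed):
# Optional: prettier tree rendering, guarded so it only runs if available.
if (requireNamespace("rpart.plot", quietly = TRUE)) {
  rpart.plot::rpart.plot(rpartFit$finalModel)
}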
# Create a model using Naive Bayes
nbFit <- train(Status ~ Gender + Diabetes + HiperTension + HiLoAge + monthAD.December + monthAD.January + monthAD.June + weekdayAD.Saturday + weekdayAD.Sunday, method= "nb",data=trainingCondensed)
print(nbFit)
# These model fits take a long time to generate, so save them just in case.
saveRDS(rpartFit, "rpartFit.rds")
saveRDS(nbFit, "nbFit.rds")
rpartFit <- readRDS("rpartFit.rds")
nbFit <- readRDS("nbFit.rds")
# KNN and AdaBoost fits were also attempted, but they were taking forever and
# were cancelled; they would have given similar results.
# knnFit <- readRDS("knnFit.rds")
# Evaluate the fit of each model against the test data
rpartPred <- predict(rpartFit, testing)
testingCondensed$predRight <- rpartPred == testingCondensed$Status
## Loading required package: rpart
table(rpartPred, testingCondensed$Status)
##
## rpartPred No-Show Show-Up
## No-Show 0 0
## Show-Up 8983 21015
confusionMatrix(rpartPred, testingCondensed$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No-Show Show-Up
## No-Show 0 0
## Show-Up 8983 21015
##
## Accuracy : 0.7005
## 95% CI : (0.6953, 0.7057)
## No Information Rate : 0.7005
## P-Value [Acc > NIR] : 0.5029
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7005
## Prevalence : 0.2995
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No-Show
##
nbPred <- predict(nbFit, testing)
testing$predRight <- nbPred == testing$Status
## Loading required package: klaR
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
table(nbPred, testing$Status)
##
## nbPred No-Show Show-Up
## No-Show 0 0
## Show-Up 8983 21015
confusionMatrix(nbPred, testing$Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No-Show Show-Up
## No-Show 0 0
## Show-Up 8983 21015
##
## Accuracy : 0.7005
## 95% CI : (0.6953, 0.7057)
## No Information Rate : 0.7005
## P-Value [Acc > NIR] : 0.5029
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7005
## Prevalence : 0.2995
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No-Show
##
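One way to act on this observation while keeping the classifiers is to lower the decision cutoff: score class probabilities and flag a No-Show whenever the probability exceeds the training base rate (~0.30) rather than 0.5. A sketch, not part of the original analysis:
# Sketch: threshold Naive Bayes class probabilities at the training base rate
# instead of the default 0.5 cutoff, which never fires for No-Show here.
nbProb <- predict(nbFit, testing, type="prob")
baseRate <- mean(training$Status %in% "No-Show")
nbPredCutoff <- factor(ifelse(nbProb[,"No-Show"] > baseRate, "No-Show", "Show-Up"), levels=levels(testing$Status))
table(nbPredCutoff, testing$Status)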
Instead of classification, we’ll use the dummy variables from each record to build a hash key, calculate the empirical probability of a no-show for each hash value, and then fit a GLM that predicts that no-show probability for any given patient.
With this model, the RMSE on the test set is about half the standard deviation of the outcome. Although this is pretty good, I think it could be better with a few adjustments; from what I can see, there are a few potential areas for future improvement.
# Calculate the empirical no-show probability for each dummy hash
trainingCondensed$dummyHash <- with(trainingCondensed, paste0(Diabetes, Gender, HiperTension, HiLoAge, monthAD.December, monthAD.January, monthAD.June, weekdayAD.Saturday, weekdayAD.Sunday))
tmpHashNoShowProb <- with(trainingCondensed, aggregate(Status, by = list(dummyHash), FUN = function(x){ length(x[x %in% "No-Show"]) / length(x) }))
names(tmpHashNoShowProb) <- c("dummyHash", "noshowProb")
# Add these values to the data set
trainingCondensed <- merge(trainingCondensed, tmpHashNoShowProb, by="dummyHash")
# Check to make sure it all worked out okay.
# head(trainingCondensed[sample(1:nrow(trainingCondensed), 1000),c("noshowProb", "dummyHash")], 100)
# #############################################
# Calculate the empirical no-show probability for each dummy hash
testingCondensed$dummyHash <- with(testingCondensed, paste0(Diabetes, Gender, HiperTension, HiLoAge, monthAD.December, monthAD.January, monthAD.June, weekdayAD.Saturday, weekdayAD.Sunday))
tmpHashNoShowProb <- with(testingCondensed, aggregate(Status, by = list(dummyHash), FUN = function(x){ length(x[x %in% "No-Show"]) / length(x) }))
names(tmpHashNoShowProb) <- c("dummyHash", "noshowProb")
# Add these values to the data set
testingCondensed <- merge(testingCondensed, tmpHashNoShowProb, by="dummyHash")
# Check to make sure it all worked out okay.
# head(testingCondensed[sample(1:nrow(testingCondensed), 1000),c("noshowProb", "dummyHash")], 100)
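Note that in deployment the test set’s own no-show rates would not be available; one would join the training-derived hash probabilities onto the test records instead. A minimal sketch under that assumption (trainHashProb and testingScored are hypothetical names):
# Sketch: score test records with training-derived hash probabilities,
# falling back to the overall training rate for hashes unseen in training.
trainHashProb <- aggregate(Status ~ dummyHash, data=trainingCondensed, FUN=function(x){ mean(x %in% "No-Show") })
names(trainHashProb)[2] <- "trainNoshowProb"
testingScored <- merge(testingCondensed, trainHashProb, by="dummyHash", all.x=TRUE)
testingScored$trainNoshowProb[is.na(testingScored$trainNoshowProb)] <- mean(trainingCondensed$Status %in% "No-Show")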
# Fit a GLM to predict the probability of not showing up
glmFit <- train(noshowProb ~ Gender + Diabetes + HiperTension + HiLoAge + monthAD.December + monthAD.January + monthAD.June + weekdayAD.Saturday + weekdayAD.Sunday, method= "glm",data=trainingCondensed)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
caret emits this warning during the bootstrap resamples. The rank deficiency likely reflects collinearity among the dummy columns in some resamples, for instance a resample in which a rare level such as Sunday never occurs.
print(glmFit)
## Generalized Linear Model
##
## 270002 samples
## 9 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 270002, 270002, 270002, 270002, 270002, 270002, ...
## Resampling results:
##
## RMSE Rsquared
## 0.007619173 0.9771759
##
##
summary(glmFit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.45707 -0.00043 -0.00004 0.00004 0.69817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.408e-01 2.344e-05 14541.87 <2e-16 ***
## GenderM 5.335e-03 3.052e-05 174.84 <2e-16 ***
## Diabetes 2.509e-03 5.939e-05 42.25 <2e-16 ***
## HiperTension -1.079e-02 4.248e-05 -254.02 <2e-16 ***
## HiLoAge -9.275e-02 3.312e-05 -2800.39 <2e-16 ***
## monthAD.December 2.753e-02 5.884e-05 467.84 <2e-16 ***
## monthAD.January -1.442e-02 5.186e-05 -277.98 <2e-16 ***
## monthAD.June 3.195e-02 5.256e-05 607.90 <2e-16 ***
## weekdayAD.Saturday 7.899e-02 3.425e-04 230.65 <2e-16 ***
## weekdayAD.Sunday -3.408e-01 7.456e-03 -45.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.558509e-05)
##
## Null deviance: 687.500 on 270001 degrees of freedom
## Residual deviance: 15.008 on 269992 degrees of freedom
## AIC: -1879126
##
## Number of Fisher Scoring iterations: 2
# Run predictions
predicted <- predict(glmFit, testingCondensed)
# Check for missing values
length(predicted[is.na(predicted)])
## [1] 0
comparePredict <- data.frame(Original = testingCondensed$noshowProb, Guess = predicted)
# head(comparePredict[sample(1:nrow(comparePredict), 1000),], 100)
# Calculate RMSE (recent caret versions also export an RMSE() helper; we
# define our own explicitly here)
RMSE <- function(m, o){
  sqrt(mean((m - o)^2))
}
RMSE(comparePredict$Original, comparePredict$Guess)
## [1] 0.02496114
sd(comparePredict$Original)
## [1] 0.05924568
mean(comparePredict$Original)
## [1] 0.2994533
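The ratio below quantifies the “about half” observation, using the two statistics just computed:
# RMSE relative to the outcome's standard deviation (~0.42, i.e. roughly half).
RMSE(comparePredict$Original, comparePredict$Guess) / sd(comparePredict$Original)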