There are many reasons that patients fail to make their appointments and there are demographic similarities across patients who no-show more consistently than others. Practice managers work hard to reduce no-shows using a variety of strategies, but often those strategies involve manual processes or difficult-to-enforce policies, resulting in a low impact.
The aim of this project is to carry out an exploratory data analysis of the no-show appointment data.
setwd("C:\\Users\\USER\\Documents\\R")
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the dataset
dat <- read_csv("KaggleV2-May-2016.csv") # Importing the dataset
## Rows: 110527 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Gender, Neighbourhood, No-show
## dbl (9): PatientId, AppointmentID, Age, Scholarship, Hipertension, Diabetes...
## dttm (2): ScheduledDay, AppointmentDay
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(dat) # Viewing the data
# The dimension of the data
dim(dat)
## [1] 110527 14
# Summary of the data
summary(dat)
## PatientId AppointmentID Gender
## Min. :3.922e+04 Min. :5030230 Length:110527
## 1st Qu.:4.173e+12 1st Qu.:5640286 Class :character
## Median :3.173e+13 Median :5680573 Mode :character
## Mean :1.475e+14 Mean :5675305
## 3rd Qu.:9.439e+13 3rd Qu.:5725524
## Max. :1.000e+15 Max. :5790484
## ScheduledDay AppointmentDay
## Min. :2015-11-10 07:13:56.00 Min. :2016-04-29 00:00:00.00
## 1st Qu.:2016-04-29 10:27:01.00 1st Qu.:2016-05-09 00:00:00.00
## Median :2016-05-10 12:13:17.00 Median :2016-05-18 00:00:00.00
## Mean :2016-05-09 07:49:15.84 Mean :2016-05-19 00:57:50.00
## 3rd Qu.:2016-05-20 11:18:37.00 3rd Qu.:2016-05-31 00:00:00.00
## Max. :2016-06-08 20:07:23.00 Max. :2016-06-08 00:00:00.00
## Age Neighbourhood Scholarship Hipertension
## Min. : -1.00 Length:110527 Min. :0.00000 Min. :0.0000
## 1st Qu.: 18.00 Class :character 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 37.00 Mode :character Median :0.00000 Median :0.0000
## Mean : 37.09 Mean :0.09827 Mean :0.1972
## 3rd Qu.: 55.00 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :115.00 Max. :1.00000 Max. :1.0000
## Diabetes Alcoholism Handcap SMS_received
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000
## Mean :0.07186 Mean :0.0304 Mean :0.02225 Mean :0.321
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :4.00000 Max. :1.000
## No-show
## Length:110527
## Class :character
## Mode :character
##
##
##
# To check for missing values in the data
table(is.na(dat))
##
## FALSE
## 1547378
sum(is.na(dat)) #Summary of the missing value
## [1] 0
# Extracting the column of age from the dataset
table(dat$Age)
##
## -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 3539 2273 1618 1513 1299 1489 1521 1427 1424 1372 1274 1195 1092 1103 1118
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 1211 1402 1509 1487 1545 1437 1452 1376 1349 1242 1332 1283 1377 1448 1403 1521
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 1439 1505 1524 1526 1378 1580 1533 1629 1536 1402 1346 1272 1344 1487 1453 1460
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
## 1394 1399 1652 1613 1567 1746 1651 1530 1425 1635 1603 1469 1624 1411 1343 1312
## 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
## 1374 1331 1101 1187 973 1012 832 724 695 615 725 602 544 571 527 541
## 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
## 390 511 434 392 280 311 275 260 184 126 173 109 66 86 53 33
## 95 96 97 98 99 100 102 115
## 24 17 11 6 1 4 2 5
dat %>% filter(Age == -1)
## # A tibble: 1 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <dttm> <dttm> <dbl>
## 1 4.66e14 5775010 F 2016-06-06 08:58:13 2016-06-06 00:00:00 -1
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
# Filtering the column with -1
sort_df <- dat %>% filter(Age != -1)
View(sort_df)
summary(sort_df)
## PatientId AppointmentID Gender
## Min. :3.922e+04 Min. :5030230 Length:110526
## 1st Qu.:4.173e+12 1st Qu.:5640285 Class :character
## Median :3.173e+13 Median :5680572 Mode :character
## Mean :1.475e+14 Mean :5675304
## 3rd Qu.:9.439e+13 3rd Qu.:5725523
## Max. :1.000e+15 Max. :5790484
## ScheduledDay AppointmentDay
## Min. :2015-11-10 07:13:56.00 Min. :2016-04-29 00:00:00.00
## 1st Qu.:2016-04-29 10:27:01.00 1st Qu.:2016-05-09 00:00:00.00
## Median :2016-05-10 12:13:10.50 Median :2016-05-18 00:00:00.00
## Mean :2016-05-09 07:48:53.92 Mean :2016-05-19 00:57:35.97
## 3rd Qu.:2016-05-20 11:18:30.25 3rd Qu.:2016-05-31 00:00:00.00
## Max. :2016-06-08 20:07:23.00 Max. :2016-06-08 00:00:00.00
## Age Neighbourhood Scholarship Hipertension
## Min. : 0.00 Length:110526 Min. :0.00000 Min. :0.0000
## 1st Qu.: 18.00 Class :character 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 37.00 Mode :character Median :0.00000 Median :0.0000
## Mean : 37.09 Mean :0.09827 Mean :0.1972
## 3rd Qu.: 55.00 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :115.00 Max. :1.00000 Max. :1.0000
## Diabetes Alcoholism Handcap SMS_received
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000
## Mean :0.07187 Mean :0.0304 Mean :0.02225 Mean :0.321
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :4.00000 Max. :1.000
## No-show
## Length:110526
## Class :character
## Mode :character
##
##
##
## Renaming the columns that were wrongly spelt
colnames(sort_df)[c(9,12)] <- c("Hypertension", "Handicap")
View(sort_df)
head(sort_df)
## # A tibble: 6 × 14
## PatientId AppointmentID Gender ScheduledDay AppointmentDay Age
## <dbl> <dbl> <chr> <dttm> <dttm> <dbl>
## 1 2.99e13 5642903 F 2016-04-29 18:38:08 2016-04-29 00:00:00 62
## 2 5.59e14 5642503 M 2016-04-29 16:08:27 2016-04-29 00:00:00 56
## 3 4.26e12 5642549 F 2016-04-29 16:19:04 2016-04-29 00:00:00 62
## 4 8.68e11 5642828 F 2016-04-29 17:29:31 2016-04-29 00:00:00 8
## 5 8.84e12 5642494 F 2016-04-29 16:07:23 2016-04-29 00:00:00 56
## 6 9.60e13 5626772 F 2016-04-27 08:36:51 2016-04-29 00:00:00 76
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## # Hypertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handicap <dbl>,
## # SMS_received <dbl>, `No-show` <chr>
## Using the select function to remove the Appointment ID column
df <- sort_df %>% select(1,3:14)
View(df)
## Using the mutate function and substr to extract a string from a string
df <- df %>% mutate(ScheduledDay = substr(ScheduledDay,1,10))
summary(df)
## PatientId Gender ScheduledDay
## Min. :3.922e+04 Length:110526 Length:110526
## 1st Qu.:4.173e+12 Class :character Class :character
## Median :3.173e+13 Mode :character Mode :character
## Mean :1.475e+14
## 3rd Qu.:9.439e+13
## Max. :1.000e+15
## AppointmentDay Age Neighbourhood
## Min. :2016-04-29 00:00:00.00 Min. : 0.00 Length:110526
## 1st Qu.:2016-05-09 00:00:00.00 1st Qu.: 18.00 Class :character
## Median :2016-05-18 00:00:00.00 Median : 37.00 Mode :character
## Mean :2016-05-19 00:57:35.97 Mean : 37.09
## 3rd Qu.:2016-05-31 00:00:00.00 3rd Qu.: 55.00
## Max. :2016-06-08 00:00:00.00 Max. :115.00
## Scholarship Hypertension Diabetes Alcoholism
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.09827 Mean :0.1972 Mean :0.07187 Mean :0.0304
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## Handicap SMS_received No-show
## Min. :0.00000 Min. :0.000 Length:110526
## 1st Qu.:0.00000 1st Qu.:0.000 Class :character
## Median :0.00000 Median :0.000 Mode :character
## Mean :0.02225 Mean :0.321
## 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :4.00000 Max. :1.000
class(df$AppointmentDay)
## [1] "POSIXct" "POSIXt"
## Using the mutate function to convert the Appointment day to character
df <- df %>% mutate(AppointmentDay = as.character(AppointmentDay), AppointmentDay = substr(AppointmentDay,1,10))
df <- df %>% select(1:13)
class(df$`No-show`) ## To specify the type of column
## [1] "character"
## Using the mutate function to convert some of the columns to date, factor and logical
df <- df %>%
mutate(
ScheduledDay = as.Date(ScheduledDay),
AppointmentDay = as.Date(AppointmentDay),
Neighbourhood = as.factor(Neighbourhood),
Hypertension = as.logical(Hypertension),
Diabetes = as.logical(Diabetes),
Alcoholism = as.logical(Alcoholism),
Handicap = as.factor(Handicap),
SMS_received = as.logical(SMS_received),
`No-show` = recode(`No-show`, 'No'=0, 'Yes'=1),
`No-show`= as.logical(`No-show`),
Gender = as.factor(Gender),
Scholarship = as.logical(Scholarship)
)
View(df)
summary(df)
## PatientId Gender ScheduledDay AppointmentDay
## Min. :3.922e+04 F:71839 Min. :2015-11-10 Min. :2016-04-29
## 1st Qu.:4.173e+12 M:38687 1st Qu.:2016-04-29 1st Qu.:2016-05-09
## Median :3.173e+13 Median :2016-05-10 Median :2016-05-18
## Mean :1.475e+14 Mean :2016-05-08 Mean :2016-05-19
## 3rd Qu.:9.439e+13 3rd Qu.:2016-05-20 3rd Qu.:2016-05-31
## Max. :1.000e+15 Max. :2016-06-08 Max. :2016-06-08
##
## Age Neighbourhood Scholarship Hypertension
## Min. : 0.00 JARDIM CAMBURI : 7717 Mode :logical Mode :logical
## 1st Qu.: 18.00 MARIA ORTIZ : 5805 FALSE:99665 FALSE:88725
## Median : 37.00 RESISTÊNCIA : 4431 TRUE :10861 TRUE :21801
## Mean : 37.09 JARDIM DA PENHA: 3877
## 3rd Qu.: 55.00 ITARARÉ : 3514
## Max. :115.00 CENTRO : 3334
## (Other) :81848
## Diabetes Alcoholism Handicap SMS_received No-show
## Mode :logical Mode :logical 0:108285 Mode :logical Mode :logical
## FALSE:102583 FALSE:107166 1: 2042 FALSE:75044 FALSE:88207
## TRUE :7943 TRUE :3360 2: 183 TRUE :35482 TRUE :22319
## 3: 13
## 4: 3
##
##
###Question 1
# Does the Time Interval Between Scheduled Day and Appointment Day Impact the No-Show Rate?
# Calculate the time interval in days
df <- df %>% mutate(Time_Interval = as.numeric(AppointmentDay - ScheduledDay))
summary(df)
## PatientId Gender ScheduledDay AppointmentDay
## Min. :3.922e+04 F:71839 Min. :2015-11-10 Min. :2016-04-29
## 1st Qu.:4.173e+12 M:38687 1st Qu.:2016-04-29 1st Qu.:2016-05-09
## Median :3.173e+13 Median :2016-05-10 Median :2016-05-18
## Mean :1.475e+14 Mean :2016-05-08 Mean :2016-05-19
## 3rd Qu.:9.439e+13 3rd Qu.:2016-05-20 3rd Qu.:2016-05-31
## Max. :1.000e+15 Max. :2016-06-08 Max. :2016-06-08
##
## Age Neighbourhood Scholarship Hypertension
## Min. : 0.00 JARDIM CAMBURI : 7717 Mode :logical Mode :logical
## 1st Qu.: 18.00 MARIA ORTIZ : 5805 FALSE:99665 FALSE:88725
## Median : 37.00 RESISTÊNCIA : 4431 TRUE :10861 TRUE :21801
## Mean : 37.09 JARDIM DA PENHA: 3877
## 3rd Qu.: 55.00 ITARARÉ : 3514
## Max. :115.00 CENTRO : 3334
## (Other) :81848
## Diabetes Alcoholism Handicap SMS_received No-show
## Mode :logical Mode :logical 0:108285 Mode :logical Mode :logical
## FALSE:102583 FALSE:107166 1: 2042 FALSE:75044 FALSE:88207
## TRUE :7943 TRUE :3360 2: 183 TRUE :35482 TRUE :22319
## 3: 13
## 4: 3
##
##
## Time_Interval
## Min. : -6.00
## 1st Qu.: 0.00
## Median : 4.00
## Mean : 10.18
## 3rd Qu.: 15.00
## Max. :179.00
##
table(df$Time_Interval)
##
## -6 -1 0 1 2 3 4 5 6 7 8 9 10
## 1 4 38562 5213 6725 2737 5290 3277 4037 4906 2332 1605 1391
## 11 12 13 14 15 16 17 18 19 20 21 22 23
## 987 1115 1682 2913 1503 1151 1107 1021 1044 1187 1861 1173 822
## 24 25 26 27 28 29 30 31 32 33 34 35 36
## 622 637 731 1013 1706 1089 704 672 586 664 808 963 551
## 37 38 39 40 41 42 43 44 45 46 47 48 49
## 331 199 170 254 342 417 254 157 137 121 109 117 216
## 50 51 52 53 54 55 56 57 58 59 60 61 62
## 182 91 90 96 73 109 203 98 110 90 73 135 193
## 63 64 65 66 67 68 69 70 71 72 73 74 75
## 200 75 97 162 63 70 105 112 50 53 37 39 72
## 76 77 78 79 80 81 82 83 84 85 86 87 88
## 65 53 33 23 18 10 1 8 54 37 6 16 16
## 89 90 91 92 93 94 95 96 97 98 101 102 103
## 17 58 56 3 2 2 5 4 2 5 1 4 5
## 104 105 107 108 109 110 111 112 115 117 119 122 123
## 8 4 2 5 5 2 5 5 2 1 4 3 1
## 125 126 127 132 133 139 142 146 151 155 162 169 176
## 1 1 1 1 11 1 8 1 1 10 11 8 16
## 179
## 10
#Removing the negative values
df <- df %>% filter(Time_Interval != -6)
df <- df %>% filter(Time_Interval != -1)
## Plotting a histogram graph of time interval using ggplot
ggplot(df, aes(Time_Interval)) + geom_histogram(binwidth = 10, color = "red", fill = "blue")
##Create a new dataframe and group_by time interval
## Find the mean of the noshow that is TRUE
data <- df %>% group_by(Time_Interval) %>% summarise(No_show_rate = mean(`No-show` == TRUE))
## Visualizing the relationship between time interval and no-show rate
ggplot(data, aes(x=Time_Interval, y= No_show_rate)) + geom_line(color = "blue") +
labs(title= "Impact of the Time Interval on No-show Rate",
x = "Time Interval", y = "No-show Rate")
###QUESTION2
# Are there days when patients rarely visit the hospitals to meet up the appointment?
# Extract the day of the week from AppointmentDay
df <- df %>% mutate(Day_of_the_Week = weekdays(AppointmentDay))