Patient no-shows are a common issue in medical practices, affecting all types of specialties and locations. While these no-shows can create problems for practices, many don’t track how often they happen or understand the impact of just a few missed appointments on their operations and income.
Patients miss appointments for various reasons, and there are certain trends among those who tend to no-show more often. Practice managers try different strategies to reduce these no-shows, but many approaches involve manual work or hard-to-follow policies, which often aren’t very effective.
With advancements in electronic health records (EHRs) and healthcare innovation, there are now best practices that can help practice managers reduce no-shows by more than 50% while cutting down on manual tasks.
This guide delves into the reasons behind no-show behavior, outlines effective practices, and explains how these methods can lessen the workload for staff, improve patient care, and ultimately reduce no-show rates.
the aim of this project is to carry out an explanatory analysis of the no-show appointment data
library(magrittr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%||%() masks base::%||%()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
setwd("C:/Users/user/Desktop/OLADIPUPO")
library(readr)
datta <- read.csv("Kaggle.csv")
View(datta)
summary(datta)
## PatientId AppointmentID Gender ScheduledDay
## Min. :3.922e+04 Min. :5030230 Length:110527 Length:110527
## 1st Qu.:4.173e+12 1st Qu.:5640286 Class :character Class :character
## Median :3.173e+13 Median :5680573 Mode :character Mode :character
## Mean :1.475e+14 Mean :5675305
## 3rd Qu.:9.439e+13 3rd Qu.:5725524
## Max. :1.000e+15 Max. :5790484
## AppointmentDay Age Neighbourhood Scholarship
## Length:110527 Min. : -1.00 Length:110527 Min. :0.00000
## Class :character 1st Qu.: 18.00 Class :character 1st Qu.:0.00000
## Mode :character Median : 37.00 Mode :character Median :0.00000
## Mean : 37.09 Mean :0.09827
## 3rd Qu.: 55.00 3rd Qu.:0.00000
## Max. :115.00 Max. :1.00000
## Hipertension Diabetes Alcoholism Handcap
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.1972 Mean :0.07186 Mean :0.0304 Mean :0.02225
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :4.00000
## SMS_received No.show
## Min. :0.000 Length:110527
## 1st Qu.:0.000 Class :character
## Median :0.000 Mode :character
## Mean :0.321
## 3rd Qu.:1.000
## Max. :1.000
table(is.na(datta))
##
## FALSE
## 1547378
sum(is.na(datta)) # No missing value
## [1] 0
## %>% is use for joining two executable codes and is called forward pipe operator
### some names are not correct
datt <- datta %>% rename(Handicap=Handcap, Hypertension=Hipertension)
View(datt)
##### number of row and column
nrow(datt)
## [1] 110527
ncol(datt)
## [1] 14
### we have age of -1
which(datt$Age==-1)
## [1] 99833
# convert the days to date
datt$ScheduledDay <- as.Date(datt$ScheduledDay)
class(datt$ScheduledDay)
## [1] "Date"
datt$AppointmentDay <- as.Date(datt$AppointmentDay)
class(datt$AppointmentDay)
## [1] "Date"
datt$time_interval <- as.numeric(datt$AppointmentDay-datt$ScheduledDay)
summary(datt$time_interval)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.00 0.00 4.00 10.18 15.00 179.00
which(datt$time_interval =="-6") # which row has a negative 6 time interval
## [1] 71534
datt <- datt[- 71534,] # remove the entire row that is -6
which(datt$time_interval < "0") # which row of the time interval is less than 0
## [1] 27034 55227 64176 72362
datt <- datt[-c(27034, 55227, 64176, 72362),] # remove the entire row that is less 0
summary(datt)
## PatientId AppointmentID Gender ScheduledDay
## Min. :3.922e+04 Min. :5030230 Length:110522 Min. :2015-11-10
## 1st Qu.:4.173e+12 1st Qu.:5640284 Class :character 1st Qu.:2016-04-29
## Median :3.174e+13 Median :5680574 Mode :character Median :2016-05-10
## Mean :1.475e+14 Mean :5675305 Mean :2016-05-08
## 3rd Qu.:9.439e+13 3rd Qu.:5725525 3rd Qu.:2016-05-20
## Max. :1.000e+15 Max. :5790484 Max. :2016-06-08
## AppointmentDay Age Neighbourhood Scholarship
## Min. :2016-04-29 Min. : -1.00 Length:110522 Min. :0.00000
## 1st Qu.:2016-05-09 1st Qu.: 18.00 Class :character 1st Qu.:0.00000
## Median :2016-05-18 Median : 37.00 Mode :character Median :0.00000
## Mean :2016-05-19 Mean : 37.09 Mean :0.09827
## 3rd Qu.:2016-05-31 3rd Qu.: 55.00 3rd Qu.:0.00000
## Max. :2016-06-08 Max. :115.00 Max. :1.00000
## Hypertension Diabetes Alcoholism Handicap
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.1973 Mean :0.07187 Mean :0.0304 Mean :0.02223
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :4.00000
## SMS_received No.show time_interval
## Min. :0.000 Length:110522 Min. : 0.00
## 1st Qu.:0.000 Class :character 1st Qu.: 0.00
## Median :0.000 Mode :character Median : 4.00
## Mean :0.321 Mean : 10.18
## 3rd Qu.:1.000 3rd Qu.: 15.00
## Max. :1.000 Max. :179.00
table(datt$No.show)
##
## No Yes
## 88208 22314
# we need to convert it to numeric to be able to calculate the average
datt$No.show[which(datt$No.show=="Yes")] <- 1
datt$No.show[which(datt$No.show=="No")] <- 0
datt$No.show <- as.numeric(datt$No.show)
class(datt$No.show)
## [1] "numeric"
# group by time interval and calculate the average no show rate
no_show_rate <- datt %>%
group_by(time_interval) %>%
summarise(no_show_rate = mean(No.show))
view(no_show_rate)
# factor Yes and No
datt$No.show <- factor(datt$No.show, labels = c("Yes","No"))
no_show_rate_yes <- datt %>%
group_by(time_interval) %>%
summarise(no_show_rate = mean(No.show=="Yes"))
View(no_show_rate_yes)
no_show_rate_No <- datt %>%
group_by(time_interval) %>%
summarise(no_show_rate = mean(No.show=="No"))
View(no_show_rate_No)
b <- ggplot(no_show_rate_yes, aes(x = time_interval, y = no_show_rate )) +
geom_line(col = "red") +
geom_point() +
labs(title = "No-Show Rate by Time Interval",
x = "Time Interval (days)",
y = "No_show_rate_yes") +
theme_minimal()
c <- ggplot(no_show_rate_No, aes(x = time_interval, y = no_show_rate)) +
geom_line( colour = "blue") +
geom_point() +
labs(title = "No-Show Rate by Time Interval",
x = "Time Interval (days)",
y = "No_Show_Rate_No") +
theme_minimal()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(b,c,ncol=2)
From the analysis of time interval between ScheduledDay and
AppointmentDay, it can be observed that there is a strong relationship
with no-show rates. Both graphs are showing notable patterns where the
red graph shows no-shows-yes and the blue graph shows attended
appointments-no. It indicates that when the appointment has been
scheduled really far in advance - more than 100 days, then extreme
variability occurs with fluctuations between 0% to 100%. However, a
no-show rate within the first 50 booked days seems more stable within
the 25-40% range. This could be an indication that with closer proximity
between calling for an appointment and the appointment date, attendance
patterns could also be closer to consistency. The longer the time
intervals are, the less predictable the attendance outcome tends to be
due to forgotten appointments or the use of alternative
arrangements.
# appointment days to weekdays
datt$days_of_week <- weekdays(datt$AppointmentDay)
datt$days_of_week <- factor(datt$days_of_week,
levels = c("Monday", "Tuesday", "Wednesday", # for the days to be in orders
"Thursday", "Friday", "Saturday"))
table(datt$days_of_week)
##
## Monday Tuesday Wednesday Thursday Friday Saturday
## 22714 25638 25866 17246 19019 39
ggplot(datt, aes(x = days_of_week, fill = No.show)) +
geom_bar(position = "dodge") +
labs(title = "No-Show Rate vs Appointment Day",
x = "Appointment Day",
y = "No_show",
fill = "No.Show") +
theme_classic() + # to apply a classical theme we use theme_classic()
scale_fill_manual(values = c("No" = "maroon", "Yes" = "yellow4"))
Saturdays are days where patient rarely visit the hospital
## TO call diabetes column
D <- datt$Diabetes
which(is.na(datt$Diabetes)) # check if there is a missing value
## integer(0)
### To check people that have diabetes
YES.D <- which(datt$Diabetes==1)
## To check length of people that has Diabetes
length(YES.D)
## [1] 7943
## TO call Hypertension column
H <- datt$Hypertension
which(is.na(datt$Hypertension)) # check if there is a missing value
## integer(0)
### To check people that have Hypertension in that column
YES.H <-which(H==1)
## To check numbers of people that has Hypertension
length(YES.H)
## [1] 21801
## TO call Handicap column
Hh <- datt$Handicap
which(is.na(datt$Handicap)) # check if there is a missing value
## integer(0)
YES.Hh <- which(Hh==1) # checked the rows with people who have handicap
length(YES.Hh) # checked the length of people with handicap
## [1] 2040
## TO call Alcoholism column
A <- datt$Alcoholism
which(is.na(datt$Alcoholism)) # check if there is a missing value
## integer(0)
YES.A <- which(A==1) # checked the rows with people who have Alcoholism
length(YES.A) # checked the length of people with Alcoholism
## [1] 3360
# Calculate disease counts for no_show yes
# By calculating attendance and missed counts, you can:
# Determine the prevalence of no-shows for each disease.
#Analyze the relationship between disease type and attendance.
YES.D <- sum(datt$No.show == "Yes" & D== 1)
YES.D
## [1] 6513
YES.H <- sum(datt$No.show == "Yes" & H == 1)
YES.H
## [1] 18029
YES.Hh <- sum(datt$No.show == "Yes" & Hh== 1)
YES.Hh
## [1] 1676
YES.A <- sum(datt$No.show == "Yes" & A == 1)
YES.A
## [1] 2683
# Create a data frame
disease_counts <- data.frame(
Disease = c("Diabetes", "Hypertension", "Handicap", "Alcoholism"),
Total = c(length(YES.D), length(YES.H), length(YES.Hh), length(YES.A)),
No_Show = c(YES.D, YES.H, YES.Hh, YES.A)
)
# Create a bar chart
ggplot(disease_counts, aes(x = Disease, y = Total, fill = "Total")) +
geom_bar(stat = "identity", position = "dodge") +
geom_bar(aes(y = No_Show, fill = "No Show"), stat = "identity", position = "dodge") +
labs(title = "Disease Attendance", x = "Disease", y = "Count", fill = "Attendance") +
theme_classic()
Handicap Patients are likely to come less
# . Create age groups
datt$Age_Group <- cut(datt$Age, breaks = c(0, 18, 30, 50, 65, Inf),
labels = c("0-18", "19-30", "31-50", "51-65", "66-100"))
summary(datt$Age_Group)
## 0-18 19-30 31-50 51-65 66-100 NA's
## 25326 16763 29471 22122 13300 3540
# remove the NAs
age_summary <- datt %>% filter(Age_Group != "NA")
ggplot(age_summary, aes(x = Age_Group, fill = No.show)) +
geom_bar(position = "dodge") +
labs(title = "Age Group vs No-Show Rate", x = "Age Group", y = "No show rate")
# . Calculate the no-show rate by neighborhood
no_show_neighbor <- datt %>%
group_by(Neighbourhood) %>%
summarise(
Total_Appointments = n(),
No_Show_Count = sum(No.show == "Yes"),
No_Show_Rate = No_Show_Count / Total_Appointments
)
# Visualize the no-show rates by neighborhood
ggplot(no_show_neighbor, aes(x = Neighbourhood, y = No_Show_Rate)) +
geom_bar(stat = "identity") +
labs(title = "No-Show Rate by Neighborhood", x = "Neighborhood", y = "No-Show Rate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create a new variable for the number of diseases
datt <- datt %>%
mutate(Disease_Count = rowSums(select(., Hypertension, Diabetes, Alcoholism, Handicap), na.rm = TRUE))
## To the sum elements of Hypertension, Alcoholism, Handicap and Diabetes column
datt$Disease_Count <- rowSums(datt[, c("Hypertension",
"Diabetes", "Alcoholism", "Handicap")])
# Calculate no-show rate by disease count
disease_no_show_rate <- datt %>%
group_by(Disease_Count) %>%
summarise(No_Show_Count = sum(No.show == "Yes"))
# Create the bar plot
ggplot(disease_no_show_rate, aes(x = Disease_Count, y = No_Show_Count)) +
geom_bar(stat = "identity", fill = "blue4") +
labs(title = "No-Show Rate by Disease Count",
x = "Number of Diseases",
y = "No-Show Rate") +
theme_minimal()
#. Calculate the no-show rate based on SMS received
sms_no_show_rate <- datt %>%
group_by(SMS_received) %>%
summarise(
Total_Appointments = n(),
No_Show_Count = sum(No.show == "Yes"),
No_Show_Ratee = No_Show_Count / Total_Appointments
)
#
ggplot(sms_no_show_rate, aes(x = as.factor(SMS_received), y = No_Show_Ratee)) +
geom_bar(stat = "identity", aes(fill = as.factor(SMS_received))) +
labs(title = "No-Show Rate by SMS Received",
x = "SMS Received (1 = Yes, 0 = No)",
y = "No-Show Rate") +
scale_fill_manual(values = c("red4", "green4")) +
theme_classic() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "none"
)