Introduction

Patient no-shows are a common issue in medical practices, affecting all types of specialties and locations. While these no-shows can create problems for practices, many don’t track how often they happen or understand the impact of just a few missed appointments on their operations and income.

Patients miss appointments for various reasons, and there are certain trends among those who tend to no-show more often. Practice managers try different strategies to reduce these no-shows, but many approaches involve manual work or hard-to-follow policies, which often aren’t very effective.

With advancements in electronic health records (EHRs) and healthcare innovation, there are now best practices that can help practice managers reduce no-shows by more than 50% while cutting down on manual tasks.

This guide delves into the reasons behind no-show behavior, outlines effective practices, and explains how these methods can lessen the workload for staff, improve patient care, and ultimately reduce no-show rates.

Aims and Objectives

the aim of this project is to carry out an explanatory analysis of the no-show appointment data

Project Questions

  1. Does the Time Interval Between Scheduled Day and Appointment Day Impact the No-Show Rate? . Calculate the time interval in days
  2. Are there days when patients rarely visit the hospitals to meet up the appointment? . To Extract specific day of the week from AppointmentDay
  3. Which Patients With Particular Diseases is likely to Come less for Appointment?
  4. Does the Age Group Have Impact on the No-Show Rate? . Create age groups
  5. Does the Location Have Impact on the No-Show Rate? . Calculate the no-show rate by neighbourhood
  6. Do patients with more than one disease increase no-show rate? . Create a variable for the number of diseases
  7. Is the Follow-Up Scheme Used Appropriately? . Calculate the no-show rate based on SMS received

load some libraries

library(magrittr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%||%()      masks base::%||%()
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

set working directories

setwd("C:/Users/user/Desktop/OLADIPUPO")
library(readr)
datta <- read.csv("Kaggle.csv")

view the data

View(datta)

the summary of the datta

summary(datta)
##    PatientId         AppointmentID        Gender          ScheduledDay      
##  Min.   :3.922e+04   Min.   :5030230   Length:110527      Length:110527     
##  1st Qu.:4.173e+12   1st Qu.:5640286   Class :character   Class :character  
##  Median :3.173e+13   Median :5680573   Mode  :character   Mode  :character  
##  Mean   :1.475e+14   Mean   :5675305                                        
##  3rd Qu.:9.439e+13   3rd Qu.:5725524                                        
##  Max.   :1.000e+15   Max.   :5790484                                        
##  AppointmentDay          Age         Neighbourhood       Scholarship     
##  Length:110527      Min.   : -1.00   Length:110527      Min.   :0.00000  
##  Class :character   1st Qu.: 18.00   Class :character   1st Qu.:0.00000  
##  Mode  :character   Median : 37.00   Mode  :character   Median :0.00000  
##                     Mean   : 37.09                      Mean   :0.09827  
##                     3rd Qu.: 55.00                      3rd Qu.:0.00000  
##                     Max.   :115.00                      Max.   :1.00000  
##   Hipertension       Diabetes         Alcoholism        Handcap       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.1972   Mean   :0.07186   Mean   :0.0304   Mean   :0.02225  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :4.00000  
##   SMS_received     No.show         
##  Min.   :0.000   Length:110527     
##  1st Qu.:0.000   Class :character  
##  Median :0.000   Mode  :character  
##  Mean   :0.321                     
##  3rd Qu.:1.000                     
##  Max.   :1.000

to check for a missing value

table(is.na(datta))
## 
##   FALSE 
## 1547378
sum(is.na(datta))   # No missing value
## [1] 0

cleaning of the data

## %>% is use for joining two executable codes and is called forward pipe operator
### some names are not correct 
datt <- datta %>% rename(Handicap=Handcap, Hypertension=Hipertension)
View(datt)
##### number of row and column
nrow(datt)
## [1] 110527
ncol(datt)
## [1] 14

cleaning the age

### we have age of -1
which(datt$Age==-1)
## [1] 99833

1. Does the time interval between scheduled-Day and Appointment-Day impact the no-show rate?

# convert the days to date
datt$ScheduledDay <- as.Date(datt$ScheduledDay)
class(datt$ScheduledDay)
## [1] "Date"
datt$AppointmentDay <- as.Date(datt$AppointmentDay)
class(datt$AppointmentDay)
## [1] "Date"

calculate the time interval

datt$time_interval <- as.numeric(datt$AppointmentDay-datt$ScheduledDay)
summary(datt$time_interval)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -6.00    0.00    4.00   10.18   15.00  179.00

removing the negative rows

which(datt$time_interval =="-6") # which row has a negative 6 time interval
## [1] 71534
datt <- datt[- 71534,]   # remove the entire row that is -6
which(datt$time_interval < "0") # which row of the time interval is less than 0
## [1] 27034 55227 64176 72362
datt <- datt[-c(27034, 55227, 64176, 72362),] # remove the entire row that is less 0
summary(datt)
##    PatientId         AppointmentID        Gender           ScheduledDay       
##  Min.   :3.922e+04   Min.   :5030230   Length:110522      Min.   :2015-11-10  
##  1st Qu.:4.173e+12   1st Qu.:5640284   Class :character   1st Qu.:2016-04-29  
##  Median :3.174e+13   Median :5680574   Mode  :character   Median :2016-05-10  
##  Mean   :1.475e+14   Mean   :5675305                      Mean   :2016-05-08  
##  3rd Qu.:9.439e+13   3rd Qu.:5725525                      3rd Qu.:2016-05-20  
##  Max.   :1.000e+15   Max.   :5790484                      Max.   :2016-06-08  
##  AppointmentDay            Age         Neighbourhood       Scholarship     
##  Min.   :2016-04-29   Min.   : -1.00   Length:110522      Min.   :0.00000  
##  1st Qu.:2016-05-09   1st Qu.: 18.00   Class :character   1st Qu.:0.00000  
##  Median :2016-05-18   Median : 37.00   Mode  :character   Median :0.00000  
##  Mean   :2016-05-19   Mean   : 37.09                      Mean   :0.09827  
##  3rd Qu.:2016-05-31   3rd Qu.: 55.00                      3rd Qu.:0.00000  
##  Max.   :2016-06-08   Max.   :115.00                      Max.   :1.00000  
##   Hypertension       Diabetes         Alcoholism        Handicap      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.1973   Mean   :0.07187   Mean   :0.0304   Mean   :0.02223  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :4.00000  
##   SMS_received     No.show          time_interval   
##  Min.   :0.000   Length:110522      Min.   :  0.00  
##  1st Qu.:0.000   Class :character   1st Qu.:  0.00  
##  Median :0.000   Mode  :character   Median :  4.00  
##  Mean   :0.321                      Mean   : 10.18  
##  3rd Qu.:1.000                      3rd Qu.: 15.00  
##  Max.   :1.000                      Max.   :179.00

No.show to numeric

table(datt$No.show)
## 
##    No   Yes 
## 88208 22314
# we need to convert it to numeric to be able to calculate the average
datt$No.show[which(datt$No.show=="Yes")] <- 1
datt$No.show[which(datt$No.show=="No")] <- 0
datt$No.show <- as.numeric(datt$No.show)
class(datt$No.show)
## [1] "numeric"

Calculate the no-show rate by time interval

# group by time interval and calculate the average no show rate 
no_show_rate <- datt %>%
  group_by(time_interval) %>%
  summarise(no_show_rate = mean(No.show))

view(no_show_rate)

group by time interval and calculate the no show rate for yes and no

# factor Yes and No
datt$No.show <- factor(datt$No.show, labels = c("Yes","No"))

no_show_rate_yes <- datt %>%
  group_by(time_interval) %>%
  summarise(no_show_rate = mean(No.show=="Yes"))
View(no_show_rate_yes)

no_show_rate_No <- datt %>%
  group_by(time_interval) %>%
  summarise(no_show_rate = mean(No.show=="No"))
View(no_show_rate_No)

Plotting the no-show rate against time interval

b <- ggplot(no_show_rate_yes, aes(x = time_interval, y = no_show_rate )) +
  geom_line(col = "red") +
  geom_point() +
  labs(title = "No-Show Rate by Time Interval",
       x = "Time Interval (days)",
       y = "No_show_rate_yes") +
  theme_minimal()

c <- ggplot(no_show_rate_No, aes(x = time_interval, y = no_show_rate)) +
  geom_line( colour = "blue") +
  geom_point() +
  labs(title = "No-Show Rate by Time Interval",
       x = "Time Interval (days)",
       y = "No_Show_Rate_No") +
  theme_minimal()

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(b,c,ncol=2)

From the analysis of time interval between ScheduledDay and AppointmentDay, it can be observed that there is a strong relationship with no-show rates. Both graphs are showing notable patterns where the red graph shows no-shows-yes and the blue graph shows attended appointments-no. It indicates that when the appointment has been scheduled really far in advance - more than 100 days, then extreme variability occurs with fluctuations between 0% to 100%. However, a no-show rate within the first 50 booked days seems more stable within the 25-40% range. This could be an indication that with closer proximity between calling for an appointment and the appointment date, attendance patterns could also be closer to consistency. The longer the time intervals are, the less predictable the attendance outcome tends to be due to forgotten appointments or the use of alternative arrangements.

2. Are there days when patients rarely visit the hospitals to meet up the appointment?

# appointment days to weekdays
datt$days_of_week <- weekdays(datt$AppointmentDay)

datt$days_of_week <- factor(datt$days_of_week, 
                               levels = c("Monday", "Tuesday", "Wednesday", # for the days to be in orders
                                          "Thursday", "Friday", "Saturday"))
table(datt$days_of_week)
## 
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday 
##     22714     25638     25866     17246     19019        39

Plot no-show rate vs appointment day

ggplot(datt, aes(x = days_of_week, fill = No.show)) +
  geom_bar(position = "dodge") +
  labs(title = "No-Show Rate vs Appointment Day",
       x = "Appointment Day", 
       y = "No_show", 
       fill = "No.Show") + 
  theme_classic()  + # to apply a classical theme we use theme_classic()
  scale_fill_manual(values = c("No" = "maroon", "Yes" = "yellow4"))

Saturdays are days where patient rarely visit the hospital

3. Which Patients With Particular Diseases is likely to Come less for Appointment?

## TO call diabetes column
D <- datt$Diabetes
which(is.na(datt$Diabetes)) # check if there is a missing value
## integer(0)
### To check people that have diabetes
YES.D <- which(datt$Diabetes==1)  
## To check length of people that has Diabetes
length(YES.D)
## [1] 7943
## TO call Hypertension column
H <- datt$Hypertension

which(is.na(datt$Hypertension)) # check if there is a missing value
## integer(0)
### To check people that have Hypertension in that column
YES.H <-which(H==1) 

## To check numbers of people that has Hypertension
length(YES.H)
## [1] 21801
## TO call Handicap column
Hh <- datt$Handicap

which(is.na(datt$Handicap)) # check if there is a missing value
## integer(0)
YES.Hh <- which(Hh==1) # checked the rows with people who have  handicap

length(YES.Hh) # checked the length of people with handicap
## [1] 2040
## TO call Alcoholism column
A <- datt$Alcoholism

which(is.na(datt$Alcoholism)) # check if there is a missing value
## integer(0)
YES.A <- which(A==1) # checked the rows with people who have  Alcoholism

length(YES.A) # checked the length of people with Alcoholism
## [1] 3360
# Calculate disease counts for no_show yes
# By calculating attendance and missed counts, you can:
# Determine the prevalence of no-shows for each disease. 
#Analyze the relationship between disease type and attendance.
YES.D <- sum(datt$No.show == "Yes" & D== 1)
YES.D
## [1] 6513
YES.H <- sum(datt$No.show == "Yes" & H == 1)
YES.H
## [1] 18029
YES.Hh <- sum(datt$No.show == "Yes" & Hh== 1)
YES.Hh
## [1] 1676
YES.A <- sum(datt$No.show == "Yes" & A == 1)
YES.A
## [1] 2683
# Create a data frame
disease_counts <- data.frame(
  Disease = c("Diabetes", "Hypertension", "Handicap", "Alcoholism"),
  Total = c(length(YES.D), length(YES.H), length(YES.Hh), length(YES.A)),
  No_Show = c(YES.D, YES.H, YES.Hh, YES.A)
)
# Create a bar chart
ggplot(disease_counts, aes(x = Disease, y = Total, fill = "Total")) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_bar(aes(y = No_Show, fill = "No Show"), stat = "identity", position = "dodge") +
  labs(title = "Disease Attendance", x = "Disease", y = "Count", fill = "Attendance") +
  theme_classic()

Handicap Patients are likely to come less

4. Does the Age Group Have Impact on the No-Show Rate?

#  . Create age groups
datt$Age_Group <- cut(datt$Age, breaks = c(0, 18, 30, 50, 65, Inf),
                              labels = c("0-18", "19-30", "31-50", "51-65", "66-100"))

summary(datt$Age_Group)
##   0-18  19-30  31-50  51-65 66-100   NA's 
##  25326  16763  29471  22122  13300   3540
# remove the NAs 
age_summary <- datt %>% filter(Age_Group != "NA")


ggplot(age_summary, aes(x = Age_Group, fill = No.show)) + 
  geom_bar(position = "dodge") + 
  labs(title = "Age Group vs No-Show Rate", x = "Age Group", y = "No show rate")

5. Does the Location Have Impact on the No-Show Rate?

# . Calculate the no-show rate by neighborhood

no_show_neighbor <- datt %>%
  group_by(Neighbourhood) %>%
  summarise(
    Total_Appointments = n(),
    No_Show_Count = sum(No.show == "Yes"),
    No_Show_Rate = No_Show_Count / Total_Appointments
  )

# Visualize the no-show rates by neighborhood
ggplot(no_show_neighbor, aes(x = Neighbourhood, y = No_Show_Rate)) +
  geom_bar(stat = "identity") +
  labs(title = "No-Show Rate by Neighborhood", x = "Neighborhood", y = "No-Show Rate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

6. Do patient with more than one diseases increase no-show rate

# Create a new variable for the number of diseases

datt <- datt %>%
  mutate(Disease_Count = rowSums(select(., Hypertension, Diabetes, Alcoholism, Handicap), na.rm = TRUE))


## To the sum elements of Hypertension, Alcoholism, Handicap and Diabetes column
datt$Disease_Count <- rowSums(datt[, c("Hypertension", 
                                       "Diabetes", "Alcoholism", "Handicap")])


# Calculate no-show rate by disease count
disease_no_show_rate <- datt %>%
  group_by(Disease_Count) %>%
  summarise(No_Show_Count = sum(No.show == "Yes"))

# Create the bar plot
ggplot(disease_no_show_rate, aes(x = Disease_Count, y = No_Show_Count)) +
  geom_bar(stat = "identity", fill = "blue4") +
  labs(title = "No-Show Rate by Disease Count",
       x = "Number of Diseases",
       y = "No-Show Rate") +
  theme_minimal()

7. Is the Follow-Up Scheme Used Appropriately?

#. Calculate the no-show rate based on SMS received
sms_no_show_rate <- datt %>%
  group_by(SMS_received) %>%
  summarise(
    Total_Appointments = n(),
    No_Show_Count = sum(No.show == "Yes"),
    No_Show_Ratee = No_Show_Count / Total_Appointments
  )
#
ggplot(sms_no_show_rate, aes(x = as.factor(SMS_received), y = No_Show_Ratee)) +
  geom_bar(stat = "identity", aes(fill = as.factor(SMS_received))) +
  labs(title = "No-Show Rate by SMS Received", 
       x = "SMS Received (1 = Yes, 0 = No)", 
       y = "No-Show Rate") +
  scale_fill_manual(values = c("red4", "green4")) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )