Introduction

There are many reasons that patients fail to make their appointments and there are demographic similarities across patients who no-show more consistently than others. Practice managers work hard to reduce no-shows using a variety of strategies, but often those strategies involve manual processes or difficult-to-enforce policies, resulting in a low impact.

Aim of the Project

The aim of this project is to carry out an exploratory data analysis of the no-show appointment data.

Project Questions

Objectives of the Study

setwd("C:\\Users\\USER\\Documents\\R")
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Reading the NO-show dataset

# Read the dataset
dat <- read_csv("KaggleV2-May-2016.csv") # Importing the dataset
## Rows: 110527 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Gender, Neighbourhood, No-show
## dbl  (9): PatientId, AppointmentID, Age, Scholarship, Hipertension, Diabetes...
## dttm (2): ScheduledDay, AppointmentDay
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(dat) # Viewing the data
# The dimension of the data
dim(dat)
## [1] 110527     14
# Summary of the data
summary(dat)
##    PatientId         AppointmentID        Gender         
##  Min.   :3.922e+04   Min.   :5030230   Length:110527     
##  1st Qu.:4.173e+12   1st Qu.:5640286   Class :character  
##  Median :3.173e+13   Median :5680573   Mode  :character  
##  Mean   :1.475e+14   Mean   :5675305                     
##  3rd Qu.:9.439e+13   3rd Qu.:5725524                     
##  Max.   :1.000e+15   Max.   :5790484                     
##   ScheduledDay                    AppointmentDay                  
##  Min.   :2015-11-10 07:13:56.00   Min.   :2016-04-29 00:00:00.00  
##  1st Qu.:2016-04-29 10:27:01.00   1st Qu.:2016-05-09 00:00:00.00  
##  Median :2016-05-10 12:13:17.00   Median :2016-05-18 00:00:00.00  
##  Mean   :2016-05-09 07:49:15.84   Mean   :2016-05-19 00:57:50.00  
##  3rd Qu.:2016-05-20 11:18:37.00   3rd Qu.:2016-05-31 00:00:00.00  
##  Max.   :2016-06-08 20:07:23.00   Max.   :2016-06-08 00:00:00.00  
##       Age         Neighbourhood       Scholarship       Hipertension   
##  Min.   : -1.00   Length:110527      Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 18.00   Class :character   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median : 37.00   Mode  :character   Median :0.00000   Median :0.0000  
##  Mean   : 37.09                      Mean   :0.09827   Mean   :0.1972  
##  3rd Qu.: 55.00                      3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :115.00                      Max.   :1.00000   Max.   :1.0000  
##     Diabetes         Alcoholism        Handcap         SMS_received  
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.000  
##  Mean   :0.07186   Mean   :0.0304   Mean   :0.02225   Mean   :0.321  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :4.00000   Max.   :1.000  
##    No-show         
##  Length:110527     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# To check for missing values in the data
table(is.na(dat))
## 
##   FALSE 
## 1547378
sum(is.na(dat)) #Summary of the missing value
## [1] 0
# Extracting the column of age from the dataset
table(dat$Age)
## 
##   -1    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##    1 3539 2273 1618 1513 1299 1489 1521 1427 1424 1372 1274 1195 1092 1103 1118 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
## 1211 1402 1509 1487 1545 1437 1452 1376 1349 1242 1332 1283 1377 1448 1403 1521 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46 
## 1439 1505 1524 1526 1378 1580 1533 1629 1536 1402 1346 1272 1344 1487 1453 1460 
##   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62 
## 1394 1399 1652 1613 1567 1746 1651 1530 1425 1635 1603 1469 1624 1411 1343 1312 
##   63   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78 
## 1374 1331 1101 1187  973 1012  832  724  695  615  725  602  544  571  527  541 
##   79   80   81   82   83   84   85   86   87   88   89   90   91   92   93   94 
##  390  511  434  392  280  311  275  260  184  126  173  109   66   86   53   33 
##   95   96   97   98   99  100  102  115 
##   24   17   11    6    1    4    2    5

Finding the row with -1

dat  %>% filter(Age == -1)
## # A tibble: 1 × 14
##   PatientId AppointmentID Gender ScheduledDay        AppointmentDay        Age
##       <dbl>         <dbl> <chr>  <dttm>              <dttm>              <dbl>
## 1   4.66e14       5775010 F      2016-06-06 08:58:13 2016-06-06 00:00:00    -1
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hipertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handcap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
# Filtering the column with -1
sort_df <- dat  %>% filter(Age != -1) 
View(sort_df)
summary(sort_df)
##    PatientId         AppointmentID        Gender         
##  Min.   :3.922e+04   Min.   :5030230   Length:110526     
##  1st Qu.:4.173e+12   1st Qu.:5640285   Class :character  
##  Median :3.173e+13   Median :5680572   Mode  :character  
##  Mean   :1.475e+14   Mean   :5675304                     
##  3rd Qu.:9.439e+13   3rd Qu.:5725523                     
##  Max.   :1.000e+15   Max.   :5790484                     
##   ScheduledDay                    AppointmentDay                  
##  Min.   :2015-11-10 07:13:56.00   Min.   :2016-04-29 00:00:00.00  
##  1st Qu.:2016-04-29 10:27:01.00   1st Qu.:2016-05-09 00:00:00.00  
##  Median :2016-05-10 12:13:10.50   Median :2016-05-18 00:00:00.00  
##  Mean   :2016-05-09 07:48:53.92   Mean   :2016-05-19 00:57:35.97  
##  3rd Qu.:2016-05-20 11:18:30.25   3rd Qu.:2016-05-31 00:00:00.00  
##  Max.   :2016-06-08 20:07:23.00   Max.   :2016-06-08 00:00:00.00  
##       Age         Neighbourhood       Scholarship       Hipertension   
##  Min.   :  0.00   Length:110526      Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 18.00   Class :character   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median : 37.00   Mode  :character   Median :0.00000   Median :0.0000  
##  Mean   : 37.09                      Mean   :0.09827   Mean   :0.1972  
##  3rd Qu.: 55.00                      3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :115.00                      Max.   :1.00000   Max.   :1.0000  
##     Diabetes         Alcoholism        Handcap         SMS_received  
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.000  
##  Mean   :0.07187   Mean   :0.0304   Mean   :0.02225   Mean   :0.321  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :4.00000   Max.   :1.000  
##    No-show         
##  Length:110526     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
## Renaming the columns that were wrongly spelt
colnames(sort_df)[c(9,12)] <- c("Hypertension", "Handicap")
View(sort_df)
head(sort_df)
## # A tibble: 6 × 14
##   PatientId AppointmentID Gender ScheduledDay        AppointmentDay        Age
##       <dbl>         <dbl> <chr>  <dttm>              <dttm>              <dbl>
## 1   2.99e13       5642903 F      2016-04-29 18:38:08 2016-04-29 00:00:00    62
## 2   5.59e14       5642503 M      2016-04-29 16:08:27 2016-04-29 00:00:00    56
## 3   4.26e12       5642549 F      2016-04-29 16:19:04 2016-04-29 00:00:00    62
## 4   8.68e11       5642828 F      2016-04-29 17:29:31 2016-04-29 00:00:00     8
## 5   8.84e12       5642494 F      2016-04-29 16:07:23 2016-04-29 00:00:00    56
## 6   9.60e13       5626772 F      2016-04-27 08:36:51 2016-04-29 00:00:00    76
## # ℹ 8 more variables: Neighbourhood <chr>, Scholarship <dbl>,
## #   Hypertension <dbl>, Diabetes <dbl>, Alcoholism <dbl>, Handicap <dbl>,
## #   SMS_received <dbl>, `No-show` <chr>
## Using the select function to remove the Appointment ID column
df <- sort_df %>% select(1,3:14)
View(df)
## Using the mutate function and substr to extract a string from a string
df <- df %>% mutate(ScheduledDay = substr(ScheduledDay,1,10))
summary(df)
##    PatientId            Gender          ScheduledDay      
##  Min.   :3.922e+04   Length:110526      Length:110526     
##  1st Qu.:4.173e+12   Class :character   Class :character  
##  Median :3.173e+13   Mode  :character   Mode  :character  
##  Mean   :1.475e+14                                        
##  3rd Qu.:9.439e+13                                        
##  Max.   :1.000e+15                                        
##  AppointmentDay                        Age         Neighbourhood     
##  Min.   :2016-04-29 00:00:00.00   Min.   :  0.00   Length:110526     
##  1st Qu.:2016-05-09 00:00:00.00   1st Qu.: 18.00   Class :character  
##  Median :2016-05-18 00:00:00.00   Median : 37.00   Mode  :character  
##  Mean   :2016-05-19 00:57:35.97   Mean   : 37.09                     
##  3rd Qu.:2016-05-31 00:00:00.00   3rd Qu.: 55.00                     
##  Max.   :2016-06-08 00:00:00.00   Max.   :115.00                     
##   Scholarship       Hypertension       Diabetes         Alcoholism    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.09827   Mean   :0.1972   Mean   :0.07187   Mean   :0.0304  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     Handicap        SMS_received     No-show         
##  Min.   :0.00000   Min.   :0.000   Length:110526     
##  1st Qu.:0.00000   1st Qu.:0.000   Class :character  
##  Median :0.00000   Median :0.000   Mode  :character  
##  Mean   :0.02225   Mean   :0.321                     
##  3rd Qu.:0.00000   3rd Qu.:1.000                     
##  Max.   :4.00000   Max.   :1.000
class(df$AppointmentDay)
## [1] "POSIXct" "POSIXt"
## Using the mutate function to convert the Appointment day to character
df <- df %>% mutate(AppointmentDay = as.character(AppointmentDay), AppointmentDay = substr(AppointmentDay,1,10))
df <- df %>% select(1:13)

class(df$`No-show`) ## To specify the type of column
## [1] "character"
## Using the mutate function to convert some of the columns to date, factor and logical 
df <- df %>% 
  mutate(
    ScheduledDay = as.Date(ScheduledDay),
    AppointmentDay = as.Date(AppointmentDay),
    Neighbourhood = as.factor(Neighbourhood),
    Hypertension = as.logical(Hypertension),
    Diabetes = as.logical(Diabetes),
    Alcoholism = as.logical(Alcoholism),
    Handicap = as.factor(Handicap),
    SMS_received = as.logical(SMS_received),
    `No-show` = recode(`No-show`, 'No'=0, 'Yes'=1),
    `No-show`= as.logical(`No-show`),
    Gender = as.factor(Gender),
    Scholarship = as.logical(Scholarship)
  )
View(df)
summary(df)
##    PatientId         Gender     ScheduledDay        AppointmentDay      
##  Min.   :3.922e+04   F:71839   Min.   :2015-11-10   Min.   :2016-04-29  
##  1st Qu.:4.173e+12   M:38687   1st Qu.:2016-04-29   1st Qu.:2016-05-09  
##  Median :3.173e+13             Median :2016-05-10   Median :2016-05-18  
##  Mean   :1.475e+14             Mean   :2016-05-08   Mean   :2016-05-19  
##  3rd Qu.:9.439e+13             3rd Qu.:2016-05-20   3rd Qu.:2016-05-31  
##  Max.   :1.000e+15             Max.   :2016-06-08   Max.   :2016-06-08  
##                                                                         
##       Age                 Neighbourhood   Scholarship     Hypertension   
##  Min.   :  0.00   JARDIM CAMBURI : 7717   Mode :logical   Mode :logical  
##  1st Qu.: 18.00   MARIA ORTIZ    : 5805   FALSE:99665     FALSE:88725    
##  Median : 37.00   RESISTÊNCIA    : 4431   TRUE :10861     TRUE :21801    
##  Mean   : 37.09   JARDIM DA PENHA: 3877                                  
##  3rd Qu.: 55.00   ITARARÉ        : 3514                                  
##  Max.   :115.00   CENTRO         : 3334                                  
##                   (Other)        :81848                                  
##   Diabetes       Alcoholism      Handicap   SMS_received     No-show       
##  Mode :logical   Mode :logical   0:108285   Mode :logical   Mode :logical  
##  FALSE:102583    FALSE:107166    1:  2042   FALSE:75044     FALSE:88207    
##  TRUE :7943      TRUE :3360      2:   183   TRUE :35482     TRUE :22319    
##                                  3:    13                                  
##                                  4:     3                                  
##                                                                            
## 

###Question 1

# Does the Time Interval Between Scheduled Day and Appointment Day Impact the No-Show Rate?

# Calculate the time interval in days

df <- df %>% mutate(Time_Interval = as.numeric(AppointmentDay - ScheduledDay))
summary(df)
##    PatientId         Gender     ScheduledDay        AppointmentDay      
##  Min.   :3.922e+04   F:71839   Min.   :2015-11-10   Min.   :2016-04-29  
##  1st Qu.:4.173e+12   M:38687   1st Qu.:2016-04-29   1st Qu.:2016-05-09  
##  Median :3.173e+13             Median :2016-05-10   Median :2016-05-18  
##  Mean   :1.475e+14             Mean   :2016-05-08   Mean   :2016-05-19  
##  3rd Qu.:9.439e+13             3rd Qu.:2016-05-20   3rd Qu.:2016-05-31  
##  Max.   :1.000e+15             Max.   :2016-06-08   Max.   :2016-06-08  
##                                                                         
##       Age                 Neighbourhood   Scholarship     Hypertension   
##  Min.   :  0.00   JARDIM CAMBURI : 7717   Mode :logical   Mode :logical  
##  1st Qu.: 18.00   MARIA ORTIZ    : 5805   FALSE:99665     FALSE:88725    
##  Median : 37.00   RESISTÊNCIA    : 4431   TRUE :10861     TRUE :21801    
##  Mean   : 37.09   JARDIM DA PENHA: 3877                                  
##  3rd Qu.: 55.00   ITARARÉ        : 3514                                  
##  Max.   :115.00   CENTRO         : 3334                                  
##                   (Other)        :81848                                  
##   Diabetes       Alcoholism      Handicap   SMS_received     No-show       
##  Mode :logical   Mode :logical   0:108285   Mode :logical   Mode :logical  
##  FALSE:102583    FALSE:107166    1:  2042   FALSE:75044     FALSE:88207    
##  TRUE :7943      TRUE :3360      2:   183   TRUE :35482     TRUE :22319    
##                                  3:    13                                  
##                                  4:     3                                  
##                                                                            
##                                                                            
##  Time_Interval   
##  Min.   : -6.00  
##  1st Qu.:  0.00  
##  Median :  4.00  
##  Mean   : 10.18  
##  3rd Qu.: 15.00  
##  Max.   :179.00  
## 
table(df$Time_Interval)
## 
##    -6    -1     0     1     2     3     4     5     6     7     8     9    10 
##     1     4 38562  5213  6725  2737  5290  3277  4037  4906  2332  1605  1391 
##    11    12    13    14    15    16    17    18    19    20    21    22    23 
##   987  1115  1682  2913  1503  1151  1107  1021  1044  1187  1861  1173   822 
##    24    25    26    27    28    29    30    31    32    33    34    35    36 
##   622   637   731  1013  1706  1089   704   672   586   664   808   963   551 
##    37    38    39    40    41    42    43    44    45    46    47    48    49 
##   331   199   170   254   342   417   254   157   137   121   109   117   216 
##    50    51    52    53    54    55    56    57    58    59    60    61    62 
##   182    91    90    96    73   109   203    98   110    90    73   135   193 
##    63    64    65    66    67    68    69    70    71    72    73    74    75 
##   200    75    97   162    63    70   105   112    50    53    37    39    72 
##    76    77    78    79    80    81    82    83    84    85    86    87    88 
##    65    53    33    23    18    10     1     8    54    37     6    16    16 
##    89    90    91    92    93    94    95    96    97    98   101   102   103 
##    17    58    56     3     2     2     5     4     2     5     1     4     5 
##   104   105   107   108   109   110   111   112   115   117   119   122   123 
##     8     4     2     5     5     2     5     5     2     1     4     3     1 
##   125   126   127   132   133   139   142   146   151   155   162   169   176 
##     1     1     1     1    11     1     8     1     1    10    11     8    16 
##   179 
##    10
#Removing the negative values
df <- df %>% filter(Time_Interval != -6)
df <- df %>% filter(Time_Interval != -1)
## Plotting a histogram graph of time interval using ggplot
ggplot(df, aes(Time_Interval)) + geom_histogram(binwidth = 10, color = "red", fill = "blue")

##Create a new dataframe and group_by time interval 
## Find the mean of the noshow that is TRUE
data <- df %>% group_by(Time_Interval) %>% summarise(No_show_rate = mean(`No-show` == TRUE))
## Visualizing the relationship between time interval and no-show rate
ggplot(data, aes(x=Time_Interval, y= No_show_rate)) + geom_line(color = "blue") +
  labs(title= "Impact of the Time Interval on No-show Rate",
       x = "Time Interval", y = "No-show Rate")

###QUESTION2

# Are there days when patients rarely visit the hospitals to meet up the appointment?
# Extract the day of the week from AppointmentDay
df <- df %>% mutate(Day_of_the_Week = weekdays(AppointmentDay))