Presented By

Ahmed Abdallah Mohammed –1727138

Alaa Ahmed Yousef –1727040

Dalia Sarhan Mahmoud –1627252

Nada Shaaban –1727066

Omar Mohammed Abdel-Salam –1727234

Sarah Mahmoud Abdel-Aziz –1627262

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

We begin with loading the data

#reading the data from url
csv_url= 'https://d17h27t6h515a5.cloudfront.net/topher/2017/October/59dd2e9a_noshowappointments-kagglev2-may-2016/noshowappointments-kagglev2-may-2016.csv'
appointment_data=read.csv(csv_url)
head(appointment_data ,10) #showing first 10 records
##       PatientId AppointmentID Gender         ScheduledDay       AppointmentDay
## 1  2.987250e+13       5642903      F 2016-04-29T18:38:08Z 2016-04-29T00:00:00Z
## 2  5.589978e+14       5642503      M 2016-04-29T16:08:27Z 2016-04-29T00:00:00Z
## 3  4.262962e+12       5642549      F 2016-04-29T16:19:04Z 2016-04-29T00:00:00Z
## 4  8.679512e+11       5642828      F 2016-04-29T17:29:31Z 2016-04-29T00:00:00Z
## 5  8.841186e+12       5642494      F 2016-04-29T16:07:23Z 2016-04-29T00:00:00Z
## 6  9.598513e+13       5626772      F 2016-04-27T08:36:51Z 2016-04-29T00:00:00Z
## 7  7.336882e+14       5630279      F 2016-04-27T15:05:12Z 2016-04-29T00:00:00Z
## 8  3.449833e+12       5630575      F 2016-04-27T15:39:58Z 2016-04-29T00:00:00Z
## 9  5.639473e+13       5638447      F 2016-04-29T08:02:16Z 2016-04-29T00:00:00Z
## 10 7.812456e+13       5629123      F 2016-04-27T12:48:25Z 2016-04-29T00:00:00Z
##    Age     Neighbourhood Scholarship Hipertension Diabetes Alcoholism Handcap
## 1   62   JARDIM DA PENHA           0            1        0          0       0
## 2   56   JARDIM DA PENHA           0            0        0          0       0
## 3   62     MATA DA PRAIA           0            0        0          0       0
## 4    8 PONTAL DE CAMBURI           0            0        0          0       0
## 5   56   JARDIM DA PENHA           0            1        1          0       0
## 6   76        REPÚBLICA           0            1        0          0       0
## 7   23        GOIABEIRAS           0            0        0          0       0
## 8   39        GOIABEIRAS           0            0        0          0       0
## 9   21        ANDORINHAS           0            0        0          0       0
## 10  19         CONQUISTA           0            0        0          0       0
##    SMS_received No.show
## 1             0      No
## 2             0      No
## 3             0      No
## 4             0      No
## 5             0      No
## 6             0      No
## 7             0     Yes
## 8             0     Yes
## 9             0      No
## 10            0      No
dim(appointment_data)
## [1] 110527     14

the data frame has 110527 records and 14 columns

Wrangling & Cleaning Data

first of all, we see the structure of the data

str(appointment_data)
## 'data.frame':    110527 obs. of  14 variables:
##  $ PatientId     : num  2.99e+13 5.59e+14 4.26e+12 8.68e+11 8.84e+12 ...
##  $ AppointmentID : int  5642903 5642503 5642549 5642828 5642494 5626772 5630279 5630575 5638447 5629123 ...
##  $ Gender        : chr  "F" "M" "F" "F" ...
##  $ ScheduledDay  : chr  "2016-04-29T18:38:08Z" "2016-04-29T16:08:27Z" "2016-04-29T16:19:04Z" "2016-04-29T17:29:31Z" ...
##  $ AppointmentDay: chr  "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" ...
##  $ Age           : int  62 56 62 8 56 76 23 39 21 19 ...
##  $ Neighbourhood : chr  "JARDIM DA PENHA" "JARDIM DA PENHA" "MATA DA PRAIA" "PONTAL DE CAMBURI" ...
##  $ Scholarship   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hipertension  : int  1 0 0 0 1 1 0 0 0 0 ...
##  $ Diabetes      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Alcoholism    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handcap       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SMS_received  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.show       : chr  "No" "No" "No" "No" ...

checking if the data has duplicates or missing values

sum(duplicated(appointment_data))
## [1] 0
sapply(appointment_data,function(x)sum(is.na(x)))
##      PatientId  AppointmentID         Gender   ScheduledDay AppointmentDay 
##              0              0              0              0              0 
##            Age  Neighbourhood    Scholarship   Hipertension       Diabetes 
##              0              0              0              0              0 
##     Alcoholism        Handcap   SMS_received        No.show 
##              0              0              0              0

checking for irrational data

appointment_data[appointment_data$Age<0, ]
##          PatientId AppointmentID Gender         ScheduledDay
## 99833 4.659432e+14       5775010      F 2016-06-06T08:58:13Z
##             AppointmentDay Age Neighbourhood Scholarship Hipertension Diabetes
## 99833 2016-06-06T00:00:00Z  -1        ROMÃO           0            0        0
##       Alcoholism Handcap SMS_received No.show
## 99833          0       0            0      No

found a row with an irrational value in it, so we drop it

appointment_data <-appointment_data[!(appointment_data$Age<0),]

summarizing..

summary(appointment_data)
##    PatientId         AppointmentID        Gender          ScheduledDay      
##  Min.   :3.922e+04   Min.   :5030230   Length:110526      Length:110526     
##  1st Qu.:4.173e+12   1st Qu.:5640285   Class :character   Class :character  
##  Median :3.173e+13   Median :5680572   Mode  :character   Mode  :character  
##  Mean   :1.475e+14   Mean   :5675304                                        
##  3rd Qu.:9.439e+13   3rd Qu.:5725523                                        
##  Max.   :1.000e+15   Max.   :5790484                                        
##  AppointmentDay          Age         Neighbourhood       Scholarship     
##  Length:110526      Min.   :  0.00   Length:110526      Min.   :0.00000  
##  Class :character   1st Qu.: 18.00   Class :character   1st Qu.:0.00000  
##  Mode  :character   Median : 37.00   Mode  :character   Median :0.00000  
##                     Mean   : 37.09                      Mean   :0.09827  
##                     3rd Qu.: 55.00                      3rd Qu.:0.00000  
##                     Max.   :115.00                      Max.   :1.00000  
##   Hipertension       Diabetes         Alcoholism        Handcap       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.1972   Mean   :0.07187   Mean   :0.0304   Mean   :0.02225  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :4.00000  
##   SMS_received     No.show         
##  Min.   :0.000   Length:110526     
##  1st Qu.:0.000   Class :character  
##  Median :0.000   Mode  :character  
##  Mean   :0.321                     
##  3rd Qu.:1.000                     
##  Max.   :1.000

Now, having the summary in hand, we make some changes to the data types to make it easier to manipulate with and provide a categorical meaning of the data. So, we take(Gender, No.show, SMS_recieved, PatientId, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap) as factors, and (ScheduledDay, AppointmentDay) as dates

appointment_data <- mutate_at(appointment_data,vars(Gender, No.show, SMS_received, PatientId, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap),as.factor)
appointment_data <- mutate_at(appointment_data,vars(ScheduledDay, AppointmentDay),as.Date)

checking datatypes of columns

lapply(appointment_data, class)
## $PatientId
## [1] "factor"
## 
## $AppointmentID
## [1] "integer"
## 
## $Gender
## [1] "factor"
## 
## $ScheduledDay
## [1] "Date"
## 
## $AppointmentDay
## [1] "Date"
## 
## $Age
## [1] "integer"
## 
## $Neighbourhood
## [1] "factor"
## 
## $Scholarship
## [1] "factor"
## 
## $Hipertension
## [1] "factor"
## 
## $Diabetes
## [1] "factor"
## 
## $Alcoholism
## [1] "factor"
## 
## $Handcap
## [1] "factor"
## 
## $SMS_received
## [1] "factor"
## 
## $No.show
## [1] "factor"

Let’s go for some analytic questions..

#1#Which gender was most committed to the appointments?

appointment_data %>% 
  group_by(Gender) %>%
  count(No.show)
## # A tibble: 4 x 3
## # Groups:   Gender [2]
##   Gender No.show     n
##   <fct>  <fct>   <int>
## 1 F      No      57245
## 2 F      Yes     14594
## 3 M      No      30962
## 4 M      Yes      7725

#2#What is the difference in days(the appointment day and the scheduled day) in a way such that to know if the patient goes to the appointment or not?

#we first define a parameter which is the difference in days between scheduled and actual appointment
appointment_data$diff_in_days =as.numeric(difftime(appointment_data$AppointmentDay, appointment_data$ScheduledDay, units = "days"))
mean(appointment_data$diff_in_days)
## [1] 10.18379
#we check the class of the new column
class(appointment_data$diff_in_days)
## [1] "numeric"
mean(appointment_data$diff_in_days) #calculating mean of it
## [1] 10.18379
min(appointment_data$diff_in_days)
## [1] -6
#there are some negative values in diff_in_days
#this isn't valid as a patient cannot go for an appointment before scheduling it
appointment_data=filter(appointment_data,diff_in_days>=0)
#answering the question
appointment_data %>% 
  group_by(No.show) %>%
  summarize(m = mean(diff_in_days))
## # A tibble: 2 x 2
##   No.show     m
##   <fct>   <dbl>
## 1 No       8.75
## 2 Yes     15.8

#3#to know if the patient recieved an alerting message either they attended or not?

appointment_data %>% 
  group_by(No.show) %>%
  count(SMS_received)
## # A tibble: 4 x 3
## # Groups:   No.show [2]
##   No.show SMS_received     n
##   <fct>   <fct>        <int>
## 1 No      0            62509
## 2 No      1            25698
## 3 Yes     0            12530
## 4 Yes     1             9784

#4#What is the average ages who attend and who don’t?

appointment_data %>% 
  group_by(No.show) %>%
  mean(Age)
## Warning in mean.default(., Age): argument is not numeric or logical: returning
## NA
## [1] NA

####the results were not that much informative as the ages were too close.

#5#How many attended their appointments and who didn’t in each and every neighborhood?

appointment_data %>% 
  group_by(Neighbourhood) %>%
  count(No.show)
## # A tibble: 160 x 3
## # Groups:   Neighbourhood [81]
##    Neighbourhood       No.show     n
##    <fct>               <fct>   <int>
##  1 AEROPORTO           No          7
##  2 AEROPORTO           Yes         1
##  3 ANDORINHAS          No       1741
##  4 ANDORINHAS          Yes       521
##  5 ANTÔNIO HONÓRIO   No        221
##  6 ANTÔNIO HONÓRIO   Yes        50
##  7 ARIOVALDO FAVALESSA No        220
##  8 ARIOVALDO FAVALESSA Yes        62
##  9 BARRO VERMELHO      No        332
## 10 BARRO VERMELHO      Yes        91
## # ... with 150 more rows

Some Plots and Statistics

preparing ggplot library
library(ggplot2)
##Gender vs No Show
ggplot(appointment_data)+
  geom_bar(aes(x = Gender, fill = No.show))+
  ggtitle("Gender vs No Show ")+
  theme(plot.title = element_text(hjust = 0.5))+
  ylab("Count")+
  xlab("Gender")

##mean(diff_in_days) vs No.show
M= appointment_data%>% 
  group_by(No.show) %>%
  summarize(mean(diff_in_days))
barplot(M$`mean(diff_in_days)`,names.arg=M$No.show , xlab="Noshow",ylab="mean for diff_in_days",col="pink")

##No.show vs Age
ggplot(appointment_data, aes(x = No.show, y = Age, fill = No.show))+
  geom_boxplot()+ 
  ggtitle("Age vs No Show")+
  theme(plot.title = element_text(hjust = 0.5))

#statistics

select(appointment_data, Age, No.show) %>% 
    group_by(No.show) %>% 
    summarise(Age_mean = mean(Age))
## # A tibble: 2 x 2
##   No.show Age_mean
##   <fct>      <dbl>
## 1 No          37.8
## 2 Yes         34.3
##mean(Age) vs No.show
N=appointment_data %>% 
  group_by(No.show) %>%
  summarize(mean(Age))
barplot(N$`mean(Age)`,names.arg=N$No.show,xlab="Noshow",ylab="mean of Age",col="purple")