Ahmed Abdallah Mohammed –1727138
Alaa Ahmed Yousef –1727040
Dalia Sarhan Mahmoud –1627252
Nada Shaaban –1727066
Omar Mohammed Abdel-Salam –1727234
Sarah Mahmoud Abdel-Aziz –1627262
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#reading the data from url
csv_url= 'https://d17h27t6h515a5.cloudfront.net/topher/2017/October/59dd2e9a_noshowappointments-kagglev2-may-2016/noshowappointments-kagglev2-may-2016.csv'
appointment_data=read.csv(csv_url)
head(appointment_data ,10) #showing first 10 records
## PatientId AppointmentID Gender ScheduledDay AppointmentDay
## 1 2.987250e+13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:00Z
## 2 5.589978e+14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:00Z
## 3 4.262962e+12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:00Z
## 4 8.679512e+11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:00Z
## 5 8.841186e+12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:00Z
## 6 9.598513e+13 5626772 F 2016-04-27T08:36:51Z 2016-04-29T00:00:00Z
## 7 7.336882e+14 5630279 F 2016-04-27T15:05:12Z 2016-04-29T00:00:00Z
## 8 3.449833e+12 5630575 F 2016-04-27T15:39:58Z 2016-04-29T00:00:00Z
## 9 5.639473e+13 5638447 F 2016-04-29T08:02:16Z 2016-04-29T00:00:00Z
## 10 7.812456e+13 5629123 F 2016-04-27T12:48:25Z 2016-04-29T00:00:00Z
## Age Neighbourhood Scholarship Hipertension Diabetes Alcoholism Handcap
## 1 62 JARDIM DA PENHA 0 1 0 0 0
## 2 56 JARDIM DA PENHA 0 0 0 0 0
## 3 62 MATA DA PRAIA 0 0 0 0 0
## 4 8 PONTAL DE CAMBURI 0 0 0 0 0
## 5 56 JARDIM DA PENHA 0 1 1 0 0
## 6 76 REPÚBLICA 0 1 0 0 0
## 7 23 GOIABEIRAS 0 0 0 0 0
## 8 39 GOIABEIRAS 0 0 0 0 0
## 9 21 ANDORINHAS 0 0 0 0 0
## 10 19 CONQUISTA 0 0 0 0 0
## SMS_received No.show
## 1 0 No
## 2 0 No
## 3 0 No
## 4 0 No
## 5 0 No
## 6 0 No
## 7 0 Yes
## 8 0 Yes
## 9 0 No
## 10 0 No
dim(appointment_data)
## [1] 110527 14
the data frame has 110527 records and 14 columns
first of all, we see the structure of the data
str(appointment_data)
## 'data.frame': 110527 obs. of 14 variables:
## $ PatientId : num 2.99e+13 5.59e+14 4.26e+12 8.68e+11 8.84e+12 ...
## $ AppointmentID : int 5642903 5642503 5642549 5642828 5642494 5626772 5630279 5630575 5638447 5629123 ...
## $ Gender : chr "F" "M" "F" "F" ...
## $ ScheduledDay : chr "2016-04-29T18:38:08Z" "2016-04-29T16:08:27Z" "2016-04-29T16:19:04Z" "2016-04-29T17:29:31Z" ...
## $ AppointmentDay: chr "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" "2016-04-29T00:00:00Z" ...
## $ Age : int 62 56 62 8 56 76 23 39 21 19 ...
## $ Neighbourhood : chr "JARDIM DA PENHA" "JARDIM DA PENHA" "MATA DA PRAIA" "PONTAL DE CAMBURI" ...
## $ Scholarship : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Hipertension : int 1 0 0 0 1 1 0 0 0 0 ...
## $ Diabetes : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Alcoholism : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Handcap : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SMS_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.show : chr "No" "No" "No" "No" ...
checking if the data has duplicates or missing values
sum(duplicated(appointment_data))
## [1] 0
sapply(appointment_data,function(x)sum(is.na(x)))
## PatientId AppointmentID Gender ScheduledDay AppointmentDay
## 0 0 0 0 0
## Age Neighbourhood Scholarship Hipertension Diabetes
## 0 0 0 0 0
## Alcoholism Handcap SMS_received No.show
## 0 0 0 0
checking for irrational data
appointment_data[appointment_data$Age<0, ]
## PatientId AppointmentID Gender ScheduledDay
## 99833 4.659432e+14 5775010 F 2016-06-06T08:58:13Z
## AppointmentDay Age Neighbourhood Scholarship Hipertension Diabetes
## 99833 2016-06-06T00:00:00Z -1 ROMÃO 0 0 0
## Alcoholism Handcap SMS_received No.show
## 99833 0 0 0 No
found a row with an irrational value in it, so we drop it
appointment_data <-appointment_data[!(appointment_data$Age<0),]
summarizing..
summary(appointment_data)
## PatientId AppointmentID Gender ScheduledDay
## Min. :3.922e+04 Min. :5030230 Length:110526 Length:110526
## 1st Qu.:4.173e+12 1st Qu.:5640285 Class :character Class :character
## Median :3.173e+13 Median :5680572 Mode :character Mode :character
## Mean :1.475e+14 Mean :5675304
## 3rd Qu.:9.439e+13 3rd Qu.:5725523
## Max. :1.000e+15 Max. :5790484
## AppointmentDay Age Neighbourhood Scholarship
## Length:110526 Min. : 0.00 Length:110526 Min. :0.00000
## Class :character 1st Qu.: 18.00 Class :character 1st Qu.:0.00000
## Mode :character Median : 37.00 Mode :character Median :0.00000
## Mean : 37.09 Mean :0.09827
## 3rd Qu.: 55.00 3rd Qu.:0.00000
## Max. :115.00 Max. :1.00000
## Hipertension Diabetes Alcoholism Handcap
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.1972 Mean :0.07187 Mean :0.0304 Mean :0.02225
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :4.00000
## SMS_received No.show
## Min. :0.000 Length:110526
## 1st Qu.:0.000 Class :character
## Median :0.000 Mode :character
## Mean :0.321
## 3rd Qu.:1.000
## Max. :1.000
Now, having the summary in hand, we make some changes to the data types to make it easier to manipulate with and provide a categorical meaning of the data. So, we take(Gender, No.show, SMS_recieved, PatientId, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap) as factors, and (ScheduledDay, AppointmentDay) as dates
appointment_data <- mutate_at(appointment_data,vars(Gender, No.show, SMS_received, PatientId, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap),as.factor)
appointment_data <- mutate_at(appointment_data,vars(ScheduledDay, AppointmentDay),as.Date)
checking datatypes of columns
lapply(appointment_data, class)
## $PatientId
## [1] "factor"
##
## $AppointmentID
## [1] "integer"
##
## $Gender
## [1] "factor"
##
## $ScheduledDay
## [1] "Date"
##
## $AppointmentDay
## [1] "Date"
##
## $Age
## [1] "integer"
##
## $Neighbourhood
## [1] "factor"
##
## $Scholarship
## [1] "factor"
##
## $Hipertension
## [1] "factor"
##
## $Diabetes
## [1] "factor"
##
## $Alcoholism
## [1] "factor"
##
## $Handcap
## [1] "factor"
##
## $SMS_received
## [1] "factor"
##
## $No.show
## [1] "factor"
appointment_data %>%
group_by(Gender) %>%
count(No.show)
## # A tibble: 4 x 3
## # Groups: Gender [2]
## Gender No.show n
## <fct> <fct> <int>
## 1 F No 57245
## 2 F Yes 14594
## 3 M No 30962
## 4 M Yes 7725
#we first define a parameter which is the difference in days between scheduled and actual appointment
appointment_data$diff_in_days =as.numeric(difftime(appointment_data$AppointmentDay, appointment_data$ScheduledDay, units = "days"))
mean(appointment_data$diff_in_days)
## [1] 10.18379
#we check the class of the new column
class(appointment_data$diff_in_days)
## [1] "numeric"
mean(appointment_data$diff_in_days) #calculating mean of it
## [1] 10.18379
min(appointment_data$diff_in_days)
## [1] -6
#there are some negative values in diff_in_days
#this isn't valid as a patient cannot go for an appointment before scheduling it
appointment_data=filter(appointment_data,diff_in_days>=0)
#answering the question
appointment_data %>%
group_by(No.show) %>%
summarize(m = mean(diff_in_days))
## # A tibble: 2 x 2
## No.show m
## <fct> <dbl>
## 1 No 8.75
## 2 Yes 15.8
appointment_data %>%
group_by(No.show) %>%
count(SMS_received)
## # A tibble: 4 x 3
## # Groups: No.show [2]
## No.show SMS_received n
## <fct> <fct> <int>
## 1 No 0 62509
## 2 No 1 25698
## 3 Yes 0 12530
## 4 Yes 1 9784
appointment_data %>%
group_by(No.show) %>%
mean(Age)
## Warning in mean.default(., Age): argument is not numeric or logical: returning
## NA
## [1] NA
####the results were not that much informative as the ages were too close.
appointment_data %>%
group_by(Neighbourhood) %>%
count(No.show)
## # A tibble: 160 x 3
## # Groups: Neighbourhood [81]
## Neighbourhood No.show n
## <fct> <fct> <int>
## 1 AEROPORTO No 7
## 2 AEROPORTO Yes 1
## 3 ANDORINHAS No 1741
## 4 ANDORINHAS Yes 521
## 5 ANTÔNIO HONÓRIO No 221
## 6 ANTÔNIO HONÓRIO Yes 50
## 7 ARIOVALDO FAVALESSA No 220
## 8 ARIOVALDO FAVALESSA Yes 62
## 9 BARRO VERMELHO No 332
## 10 BARRO VERMELHO Yes 91
## # ... with 150 more rows
library(ggplot2)
##Gender vs No Show
ggplot(appointment_data)+
geom_bar(aes(x = Gender, fill = No.show))+
ggtitle("Gender vs No Show ")+
theme(plot.title = element_text(hjust = 0.5))+
ylab("Count")+
xlab("Gender")
##mean(diff_in_days) vs No.show
M= appointment_data%>%
group_by(No.show) %>%
summarize(mean(diff_in_days))
barplot(M$`mean(diff_in_days)`,names.arg=M$No.show , xlab="Noshow",ylab="mean for diff_in_days",col="pink")
##No.show vs Age
ggplot(appointment_data, aes(x = No.show, y = Age, fill = No.show))+
geom_boxplot()+
ggtitle("Age vs No Show")+
theme(plot.title = element_text(hjust = 0.5))
#statistics
select(appointment_data, Age, No.show) %>%
group_by(No.show) %>%
summarise(Age_mean = mean(Age))
## # A tibble: 2 x 2
## No.show Age_mean
## <fct> <dbl>
## 1 No 37.8
## 2 Yes 34.3
##mean(Age) vs No.show
N=appointment_data %>%
group_by(No.show) %>%
summarize(mean(Age))
barplot(N$`mean(Age)`,names.arg=N$No.show,xlab="Noshow",ylab="mean of Age",col="purple")