library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.1     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(gtools)
library(gmodels)
library(ggplot2)
library(class)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

1 Introduction

Data used in this analysis is taken from kaggle, it is data about people who make a doctor appointment and if they show up or not on the date of the appointment.

The Data is about 300k observations and contain 12 variables

PatientId - Identification of a patient
AppointmentID - Identification of each appointment
Gender = Male or Female . Female is the greater proportion, woman takes way more care of they health in comparison to man.
DataMarcacaoConsulta = The day of the actuall appointment, when they have to visit the doctor.
DataAgendamento = The day someone called or registered the appointment, this is before appointment of course.
Age = How old is the patient.
Neighbourhood = Where the appointment takes place. Scholarship = Ture of False .
Hipertension = True or False
Diabetes = True or False
Alcoholism = True or False
Handcap = True or False
SMS_received = 1 or more messages sent to the patient.
No-show = True or False.

1.1 Reading Data

data <- read.csv("KaggleV2-May-2016.csv")

1.2 Data Pre Processing

str(data)

## 'data.frame':    110527 obs. of  14 variables:
##  $ PatientId     : num  2.99e+13 5.59e+14 4.26e+12 8.68e+11 8.84e+12 ...
##  $ AppointmentID : int  5642903 5642503 5642549 5642828 5642494 5626772 5630279 5630575 5638447 5629123 ...
##  $ Gender        : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
##  $ ScheduledDay  : Factor w/ 103549 levels "2015-11-10T07:13:56Z",..: 27742 27504 27539 27709 27498 20074 21386 21496 24945 20895 ...
##  $ AppointmentDay: Factor w/ 27 levels "2016-04-29T00:00:00Z",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age           : int  62 56 62 8 56 76 23 39 21 19 ...
##  $ Neighbourhood : Factor w/ 81 levels "AEROPORTO","ANDORINHAS",..: 40 40 47 55 40 59 26 26 2 13 ...
##  $ Scholarship   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hipertension  : int  1 0 0 0 1 1 0 0 0 0 ...
##  $ Diabetes      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Alcoholism    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handcap       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SMS_received  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.show       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...

prop.table(table(data$No.show))

## 
##        No       Yes 
## 0.7980674 0.2019326

data <- data %>% 
  select(-PatientId,-AppointmentID) %>%
  mutate(ScheduledDay = as_datetime(ScheduledDay),
         AppointmentDay = as_datetime(AppointmentDay)) %>% 
     mutate_if(is.numeric, as.factor) %>% 
  mutate(Age = as.integer(Age))

data$Show <- as.factor(ifelse(data$No.show == "Yes","No","Yes"))

#Not Including Scheduled Day and AppointmentDay as we assyne that it is not relevant variables for model
data <- data %>% 
  select(-AppointmentDay, -ScheduledDay)

#Also not including neighbourhood as we assume that the amount factor levels are too many to be distinctive to model
data <- data %>% 
  select(-Neighbourhood, -No.show)

2 Classification Modeling

2.1 Classification with Logistic Regression

Read Data

df <- data

Data Exploration

#Checking NA data
colSums(is.na(df))

##       Gender          Age  Scholarship Hipertension     Diabetes 
##            0            0            0            0            0 
##   Alcoholism      Handcap SMS_received         Show 
##            0            0            0            0

#Checking Porporsi Target
prop.table(table(data$Show))

## 
##        No       Yes 
## 0.2019326 0.7980674

Cross Validation

set.seed(182)
index <- sample(nrow(df),0.75*nrow(df))

df.train <- df[index,]
df.test <- df[-index,]

#Proportion of train data
names(df.train)

## [1] "Gender"       "Age"          "Scholarship"  "Hipertension"
## [5] "Diabetes"     "Alcoholism"   "Handcap"      "SMS_received"
## [9] "Show"

prop.table(table(df.train$Show))

## 
##      No     Yes 
## 0.20152 0.79848

#UpSample for Factor "No"
df.train.new <- upSample(x = df.train[,-9], y = df.train[,9], yname = "Show")
prop.table(table(df.train.new$Show))

## 
##  No Yes 
## 0.5 0.5

Build Model

glm.model <- step(glm(Show~., df.train.new, family = "binomial"), direction = "backward")

## Start:  AIC=179619.6
## Show ~ Gender + Age + Scholarship + Hipertension + Diabetes + 
##     Alcoholism + Handcap + SMS_received
## 
##                Df Deviance    AIC
## - Handcap       4   179601 179617
## <none>              179596 179620
## - Gender        1   179599 179621
## - Hipertension  1   179601 179623
## - Alcoholism    1   179601 179623
## - Diabetes      1   179603 179625
## - Scholarship   1   179670 179692
## - Age           1   180159 180181
## - SMS_received  1   182552 182574
## 
## Step:  AIC=179617.3
## Show ~ Gender + Age + Scholarship + Hipertension + Diabetes + 
##     Alcoholism + SMS_received
## 
##                Df Deviance    AIC
## <none>              179601 179617
## - Gender        1   179605 179619
## - Hipertension  1   179607 179621
## - Alcoholism    1   179607 179621
## - Diabetes      1   179609 179623
## - Scholarship   1   179675 179689
## - Age           1   180166 180180
## - SMS_received  1   182559 182573

summary(glm.model)

## 
## Call:
## glm(formula = Show ~ Gender + Age + Scholarship + Hipertension + 
##     Diabetes + Alcoholism + SMS_received, family = "binomial", 
##     data = df.train.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.49511  -1.18389   0.01563   1.12990   1.53070  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.0262998  0.0134821  -1.951  0.05109 .  
## GenderM        0.0213942  0.0119707   1.787  0.07390 .  
## Age            0.0069245  0.0002921  23.703  < 2e-16 ***
## Scholarship1  -0.1591030  0.0185342  -8.584  < 2e-16 ***
## Hipertension1  0.0409965  0.0178323   2.299  0.02150 *  
## Diabetes1     -0.0682522  0.0246472  -2.769  0.00562 ** 
## Alcoholism1   -0.0786115  0.0332591  -2.364  0.01810 *  
## SMS_received1 -0.6290514  0.0116554 -53.971  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 183518  on 132379  degrees of freedom
## Residual deviance: 179601  on 132372  degrees of freedom
## AIC: 179617
## 
## Number of Fisher Scoring iterations: 4

Prediction

predict.prob <- predict(glm.model, df.test[,-9], type = "response")
predict <- as.factor(ifelse(predict.prob > 0.64, "No", "Yes"))
head(predict)

##   1   8  10  11  23  28 
## Yes Yes Yes Yes Yes Yes 
## Levels: No Yes

Model Performance

confusionMatrix(predict, df.test$Show, positive = "No")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No     74   390
##        Yes  5540 21628
##                                           
##                Accuracy : 0.7854          
##                  95% CI : (0.7805, 0.7902)
##     No Information Rate : 0.7968          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0069         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.013181        
##             Specificity : 0.982287        
##          Pos Pred Value : 0.159483        
##          Neg Pred Value : 0.796084        
##              Prevalence : 0.203170        
##          Detection Rate : 0.002678        
##    Detection Prevalence : 0.016792        
##       Balanced Accuracy : 0.497734        
##                                           
##        'Positive' Class : No              
##

Tuning Model Function to find a lower false positive value, we want to minimize the number of cases where we fail to predict Show = “No” while still maximizing accuracy

#Funtion to find the maximum neg pred value, we want to minimize the number of cases where we fail to predict Show = "No" while still maximizing accuracy

find.specitivity.value <- function(x){
  temp <- as.factor(ifelse(predict.prob>(x/100), "No","Yes"))
  a <- confusionMatrix(temp, df.test$Show, positive = "No")
  return(a$byClass[2])
}
find.specitivity.value(65)

## Specificity 
##   0.9940503

b <- NULL
for(x in (50:100)){
  b <- c(b,as.numeric(find.specitivity.value(x)))
  c <- cbind(50:100,b)
  as.data.frame(c)
}

plot(c[,1],c[,2])

2.2 Classification with KNN

Reading Data

df.knn <- data

str(df.knn)

## 'data.frame':    110527 obs. of  9 variables:
##  $ Gender      : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Age         : int  64 58 64 10 58 78 25 41 23 21 ...
##  $ Scholarship : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Hipertension: Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
##  $ Diabetes    : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ Alcoholism  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Handcap     : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SMS_received: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Show        : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 2 2 ...

Data Pre-Processing

#Make Dummy Variables from categorik Variables, which includes all variables except Age
df.knn.cat <- df.knn %>% 
  select(-Show)

dummy <- dummyVars(~., df.knn.cat, fullRank = T)
class(dummy)

## [1] "dummyVars"

dummy.df <- data.frame(predict(dummy,df.knn.cat))
class(dummy.df)

## [1] "data.frame"

df.knn.cat.new <- cbind("Show"=df.knn$Show, dummy.df)
head(df.knn.cat.new)

##   Show Gender.M Age Scholarship.1 Hipertension.1 Diabetes.1 Alcoholism.1
## 1  Yes        0  64             0              1          0            0
## 2  Yes        1  58             0              0          0            0
## 3  Yes        0  64             0              0          0            0
## 4  Yes        0  10             0              0          0            0
## 5  Yes        0  58             0              1          1            0
## 6  Yes        0  78             0              1          0            0
##   Handcap.1 Handcap.2 Handcap.3 Handcap.4 SMS_received.1
## 1         0         0         0         0              0
## 2         0         0         0         0              0
## 3         0         0         0         0              0
## 4         0         0         0         0              0
## 5         0         0         0         0              0
## 6         0         0         0         0              0

#Scaling Numerical Data
df.knn.cat.new.scale <- df.knn.cat.new %>% 
  mutate_if(is.numeric,scale)

Cross Validation

set.seed(182)
idx.knn <- sample(nrow(df.knn.cat.new.scale), 0.75*nrow(df.knn.cat.new.scale))
knn.train <- df.knn.cat.new.scale[index,]
knn.test <- df.knn.cat.new.scale[-index,]

#Checking Train Proportions
prop.table(table(knn.train$Show))

## 
##      No     Yes 
## 0.20152 0.79848

#Upsample Data train
knn.train.new <- downSample(x = knn.train[,-1], y = knn.train[,1], yname = "Show")
prop.table(table(knn.train.new$Show))

## 
##  No Yes 
## 0.5 0.5

str(knn.train.new)

## 'data.frame':    33410 obs. of  12 variables:
##  $ Gender.M      : num  -0.734 1.363 -0.734 1.363 1.363 ...
##  $ Age           : num  -1.345 -1.605 -0.134 0.689 -0.523 ...
##  $ Scholarship.1 : num  -0.33 -0.33 -0.33 -0.33 -0.33 ...
##  $ Hipertension.1: num  -0.496 -0.496 -0.496 -0.496 -0.496 ...
##  $ Diabetes.1    : num  -0.278 -0.278 -0.278 -0.278 -0.278 ...
##  $ Alcoholism.1  : num  -0.177 -0.177 -0.177 -0.177 -0.177 ...
##  $ Handcap.1     : num  -0.137 -0.137 -0.137 -0.137 -0.137 ...
##  $ Handcap.2     : num  -0.0407 -0.0407 -0.0407 -0.0407 -0.0407 ...
##  $ Handcap.3     : num  -0.0108 -0.0108 -0.0108 -0.0108 -0.0108 ...
##  $ Handcap.4     : num  -0.00521 -0.00521 -0.00521 -0.00521 -0.00521 ...
##  $ SMS_received.1: num  -0.688 1.454 1.454 1.454 1.454 ...
##  $ Show          : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

Build Model KNN

#Determining K-Value
sqrt(nrow(knn.train.new))

## [1] 182.784

K = 183 (odd number)

pred.knn <- knn(train = knn.train.new[,-12],test = knn.test[,-1],cl = knn.train.new[,12], k = 183)

Model Performance

confusionMatrix(pred.knn, knn.test$Show, positive = "No")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No   3046  8865
##        Yes  2568 13153
##                                           
##                Accuracy : 0.5862          
##                  95% CI : (0.5804, 0.5921)
##     No Information Rate : 0.7968          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0987          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5426          
##             Specificity : 0.5974          
##          Pos Pred Value : 0.2557          
##          Neg Pred Value : 0.8367          
##              Prevalence : 0.2032          
##          Detection Rate : 0.1102          
##    Detection Prevalence : 0.4311          
##       Balanced Accuracy : 0.5700          
##                                           
##        'Positive' Class : No              
##

Tuning Model

performance.knn <- function(x){
  temp <- knn(train = knn.train.new[,-12],test = knn.test[,-1],cl = knn.train.new[,12], k = x)
  a <- confusionMatrix(pred.knn,as.factor(knn.test[,1]), positive = "No")
  return(a$overall[1])
}

performance.knn(183)

##  Accuracy 
## 0.5862406

b <- NULL
for(x in c(151,171,191,201)){
  b <- c(b,as.numeric(performance.knn(x)))
  c <- cbind(c(151,171,191,201),b)
  as.data.frame(c)
}

plot(c[,1],c[,2])

We can observe that no visible changes to accuracy occurs to the changing of K value, we will try to tune the model with excluding the categorical variables.

glimpse(df.knn)

## Observations: 110,527
## Variables: 9
## $ Gender       <fct> F, M, F, F, F, F, F, F, F, F, F, M, F, M, F, F, M...
## $ Age          <int> 64, 58, 64, 10, 58, 78, 25, 41, 23, 21, 32, 31, 2...
## $ Scholarship  <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ Hipertension <fct> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Diabetes     <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Alcoholism   <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Handcap      <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ SMS_received <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0...
## $ Show         <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Y...

#Pre Processing Data
df.knn.num <- df.knn %>% 
  select_if(is.numeric)
df.knn.num$Show <- df.knn$Show
str(df.knn.num)

## 'data.frame':    110527 obs. of  2 variables:
##  $ Age : int  64 58 64 10 58 78 25 41 23 21 ...
##  $ Show: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 2 2 ...

#Scaling Numerical Data

df.knn.num <- df.knn.num %>% 
  mutate_if(is.numeric, scale)

#Cross Validation
knn.num.train <- df.knn.num[idx.knn,]
knn.num.test <- df.knn.num[-idx.knn,]

prop.table(table(knn.num.train$Show))

## 
##      No     Yes 
## 0.20152 0.79848

knn.num.train.new <- downSample(x = knn.num.train[,-2], y = knn.num.train[,2], yname = "Show")
prop.table(table(knn.num.train.new$Show))

## 
##  No Yes 
## 0.5 0.5

#Model
#pred.knn.num <- knn(train = knn.num.train.new[,-2],test = knn.num.test[,-2],cl = knn.num.train.new[,2], k = 1001)

From the data we are using, it seems that we are unable to perform a knn model using only numerical variables, because of only 1 variable being in numerical form, which leads to the error of to many ties in knn. It seems as there to many same values of nearest neighbours as so the model function cannot return the value of the specific K value. ### End tabset

3 Conclusion

The glm model obtained is

summary(glm.model)

## 
## Call:
## glm(formula = Show ~ Gender + Age + Scholarship + Hipertension + 
##     Diabetes + Alcoholism + SMS_received, family = "binomial", 
##     data = df.train.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.49511  -1.18389   0.01563   1.12990   1.53070  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.0262998  0.0134821  -1.951  0.05109 .  
## GenderM        0.0213942  0.0119707   1.787  0.07390 .  
## Age            0.0069245  0.0002921  23.703  < 2e-16 ***
## Scholarship1  -0.1591030  0.0185342  -8.584  < 2e-16 ***
## Hipertension1  0.0409965  0.0178323   2.299  0.02150 *  
## Diabetes1     -0.0682522  0.0246472  -2.769  0.00562 ** 
## Alcoholism1   -0.0786115  0.0332591  -2.364  0.01810 *  
## SMS_received1 -0.6290514  0.0116554 -53.971  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 183518  on 132379  degrees of freedom
## Residual deviance: 179601  on 132372  degrees of freedom
## AIC: 179617
## 
## Number of Fisher Scoring iterations: 4

With the confusion matrix of

confusionMatrix(predict, df.test$Show, positive = "No")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No     74   390
##        Yes  5540 21628
##                                           
##                Accuracy : 0.7854          
##                  95% CI : (0.7805, 0.7902)
##     No Information Rate : 0.7968          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0069         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.013181        
##             Specificity : 0.982287        
##          Pos Pred Value : 0.159483        
##          Neg Pred Value : 0.796084        
##              Prevalence : 0.203170        
##          Detection Rate : 0.002678        
##    Detection Prevalence : 0.016792        
##       Balanced Accuracy : 0.497734        
##                                           
##        'Positive' Class : No              
##

In which the increase in probabilty for each variable is

glm.coefficient.table <- cbind(names(glm.model$coefficients),round(inv.logit(glm.model$coefficients),2))
glm.coefficient.table

##               [,1]            [,2]  
## (Intercept)   "(Intercept)"   "0.49"
## GenderM       "GenderM"       "0.51"
## Age           "Age"           "0.5" 
## Scholarship1  "Scholarship1"  "0.46"
## Hipertension1 "Hipertension1" "0.51"
## Diabetes1     "Diabetes1"     "0.48"
## Alcoholism1   "Alcoholism1"   "0.48"
## SMS_received1 "SMS_received1" "0.35"

In which the first column is the coefficient names, and the second column are the probabilities respectively.

The KNN Model obtained resulted in the confusion Matrix of

confusionMatrix(pred.knn, knn.test$Show, positive = "No")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No   3046  8865
##        Yes  2568 13153
##                                           
##                Accuracy : 0.5862          
##                  95% CI : (0.5804, 0.5921)
##     No Information Rate : 0.7968          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0987          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5426          
##             Specificity : 0.5974          
##          Pos Pred Value : 0.2557          
##          Neg Pred Value : 0.8367          
##              Prevalence : 0.2032          
##          Detection Rate : 0.1102          
##    Detection Prevalence : 0.4311          
##       Balanced Accuracy : 0.5700          
##                                           
##        'Positive' Class : No              
##

From the 2 models obtained, we conclude that in this case it is better to use the GLM Model, as it has a higher accuracy and has the lesser False Positive value.
From the data we are using, it seems that we are unable to perform a knn model using only numerical variables, because of only 1 variable being in numerical form, which leads to the error of to many ties in knn. It seems as there to many same values of nearest neighbours as so the model function cannot return the value of the specific K value.

Classification with Logistic Regression

Thona Elisa

9/26/2019