## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Data used in this analysis is taken from Kaggle: records of people who made a doctor's appointment and whether or not they showed up on the appointment date.
The data contains about 110k observations and 14 variables.
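The package loading and data import are not echoed in this report; a minimal sketch of that setup, assuming the Kaggle CSV is saved locally under its usual distribution name KaggleV2-May-2016.csv, would be:

library(tidyverse)
library(caret)
library(lubridate)

# read the raw appointments data; read.csv() turns the character columns
# into factors, matching the str() output below
data <- read.csv("KaggleV2-May-2016.csv", stringsAsFactors = TRUE)
str(data)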
## 'data.frame': 110527 obs. of 14 variables:
## $ PatientId : num 2.99e+13 5.59e+14 4.26e+12 8.68e+11 8.84e+12 ...
## $ AppointmentID : int 5642903 5642503 5642549 5642828 5642494 5626772 5630279 5630575 5638447 5629123 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
## $ ScheduledDay : Factor w/ 103549 levels "2015-11-10T07:13:56Z",..: 27742 27504 27539 27709 27498 20074 21386 21496 24945 20895 ...
## $ AppointmentDay: Factor w/ 27 levels "2016-04-29T00:00:00Z",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 62 56 62 8 56 76 23 39 21 19 ...
## $ Neighbourhood : Factor w/ 81 levels "AEROPORTO","ANDORINHAS",..: 40 40 47 55 40 59 26 26 2 13 ...
## $ Scholarship : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Hipertension : int 1 0 0 0 1 1 0 0 0 0 ...
## $ Diabetes : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Alcoholism : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Handcap : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SMS_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.show : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
##
## No Yes
## 0.7980674 0.2019326
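The class balance above is presumably the proportion table of the raw target, e.g.:

prop.table(table(data$No.show))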
data <- data %>%
  select(-PatientId, -AppointmentID) %>%            # drop the ID columns
  mutate(ScheduledDay = as_datetime(ScheduledDay),  # parse the timestamps
         AppointmentDay = as_datetime(AppointmentDay)) %>%
  mutate_if(is.numeric, as.factor) %>%              # treat the 0/1 flags as factors
  mutate(Age = as.integer(as.character(Age)))       # recover the ages; as.integer() on a factor returns level codes, not values
# Show = "Yes" when the patient showed up, "No" otherwise
data$Show <- as.factor(ifelse(data$No.show == "Yes", "No", "Yes"))
#Excluding ScheduledDay and AppointmentDay, as we assume they are not relevant variables for the model
data <- data %>%
select(-AppointmentDay, -ScheduledDay)
#Also excluding Neighbourhood, as we assume it has too many factor levels to be informative for the model
data <- data %>%
  select(-Neighbourhood, -No.show)
## Gender Age Scholarship Hipertension Diabetes
## 0 0 0 0 0
## Alcoholism Handcap SMS_received Show
## 0 0 0 0
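The zero counts above read like a per-column missing-value check, presumably something like:

colSums(is.na(data))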
##
## No Yes
## 0.2019326 0.7980674
# df is the cleaned data frame (the assignment is not echoed above)
df <- data
set.seed(182)
index <- sample(nrow(df), 0.75 * nrow(df))
df.train <- df[index, ]
df.test <- df[-index, ]
#Column names and class proportions of the train data
names(df.train)
## [1] "Gender" "Age" "Scholarship" "Hipertension"
## [5] "Diabetes" "Alcoholism" "Handcap" "SMS_received"
## [9] "Show"
##
## No Yes
## 0.20152 0.79848
#Upsample the minority class ("No") so the train data is balanced
df.train.new <- upSample(x = df.train[,-9], y = df.train[,9], yname = "Show")
prop.table(table(df.train.new$Show))
##
## No Yes
## 0.5 0.5
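The stepwise trace below is consistent with backward elimination on the full logistic model; a sketch of the presumed call (the object name glm.model is taken from the prediction code further down):

# fit the full logistic regression on the balanced train set,
# then drop predictors by backward stepwise AIC
glm.model <- step(glm(Show ~ ., family = "binomial", data = df.train.new), direction = "backward")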
## Start: AIC=179619.6
## Show ~ Gender + Age + Scholarship + Hipertension + Diabetes +
## Alcoholism + Handcap + SMS_received
##
## Df Deviance AIC
## - Handcap 4 179601 179617
## <none> 179596 179620
## - Gender 1 179599 179621
## - Hipertension 1 179601 179623
## - Alcoholism 1 179601 179623
## - Diabetes 1 179603 179625
## - Scholarship 1 179670 179692
## - Age 1 180159 180181
## - SMS_received 1 182552 182574
##
## Step: AIC=179617.3
## Show ~ Gender + Age + Scholarship + Hipertension + Diabetes +
## Alcoholism + SMS_received
##
## Df Deviance AIC
## <none> 179601 179617
## - Gender 1 179605 179619
## - Hipertension 1 179607 179621
## - Alcoholism 1 179607 179621
## - Diabetes 1 179609 179623
## - Scholarship 1 179675 179689
## - Age 1 180166 180180
## - SMS_received 1 182559 182573
##
## Call:
## glm(formula = Show ~ Gender + Age + Scholarship + Hipertension +
## Diabetes + Alcoholism + SMS_received, family = "binomial",
## data = df.train.new)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.49511 -1.18389 0.01563 1.12990 1.53070
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0262998 0.0134821 -1.951 0.05109 .
## GenderM 0.0213942 0.0119707 1.787 0.07390 .
## Age 0.0069245 0.0002921 23.703 < 2e-16 ***
## Scholarship1 -0.1591030 0.0185342 -8.584 < 2e-16 ***
## Hipertension1 0.0409965 0.0178323 2.299 0.02150 *
## Diabetes1 -0.0682522 0.0246472 -2.769 0.00562 **
## Alcoholism1 -0.0786115 0.0332591 -2.364 0.01810 *
## SMS_received1 -0.6290514 0.0116554 -53.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 183518 on 132379 degrees of freedom
## Residual deviance: 179601 on 132372 degrees of freedom
## AIC: 179617
##
## Number of Fisher Scoring iterations: 4
predict.prob <- predict(glm.model, df.test[,-9], type = "response")
# label a case "No" when its predicted probability exceeds the 0.64 threshold (tuned below)
predict.class <- as.factor(ifelse(predict.prob > 0.64, "No", "Yes"))
head(predict.class)
## 1 8 10 11 23 28
## Yes Yes Yes Yes Yes Yes
## Levels: No Yes
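The confusion matrix below is consistent with caret's confusionMatrix() applied to these predictions, e.g.:

confusionMatrix(predict.class, df.test$Show, positive = "No")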
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 390
## Yes 5540 21628
##
## Accuracy : 0.7854
## 95% CI : (0.7805, 0.7902)
## No Information Rate : 0.7968
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0069
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.013181
## Specificity : 0.982287
## Pos Pred Value : 0.159483
## Neg Pred Value : 0.796084
## Prevalence : 0.203170
## Detection Rate : 0.002678
## Detection Prevalence : 0.016792
## Balanced Accuracy : 0.497734
##
## 'Positive' Class : No
##
#Function to find the specificity at a given threshold; we want to minimize the cases where we fail to predict Show = "No" while still keeping accuracy high
find.specificity.value <- function(x){
  temp <- as.factor(ifelse(predict.prob > (x/100), "No", "Yes"))
  a <- confusionMatrix(temp, df.test$Show, positive = "No")
  return(a$byClass[2])  # byClass[2] is the specificity
}
find.specificity.value(65)
## Specificity
## 0.9940503
spec <- NULL
for(x in 50:100){
  spec <- c(spec, as.numeric(find.specificity.value(x)))
}
# collect the thresholds and their specificities, then plot the curve
threshold.grid <- as.data.frame(cbind(50:100, spec))
plot(threshold.grid[,1], threshold.grid[,2])
## 'data.frame': 110527 obs. of 9 variables:
## $ Gender : Factor w/ 2 levels "F","M": 1 2 1 1 1 1 1 1 1 1 ...
## $ Age : int 64 58 64 10 58 78 25 41 23 21 ...
## $ Scholarship : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hipertension: Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
## $ Diabetes : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ Alcoholism : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Handcap : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ SMS_received: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Show : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 2 2 ...
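The df.knn data frame appears here without its definition being echoed; given the identical structure, it is presumably just a copy of the cleaned data:

# assumed definition, inferred from the matching str() output above
df.knn <- data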
#Make dummy variables from the categorical variables, i.e. all predictors except Age
df.knn.cat <- df.knn %>%
  select(-Show)
dummy <- dummyVars(~., df.knn.cat, fullRank = T)
class(dummy)
## [1] "dummyVars"
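The construction of the dummy data frame and its scaled version is not echoed; a sketch of the presumed steps (the names df.knn.cat.new and df.knn.cat.new.scale are inferred from the output and the split below):

# encode the factors as dummy columns and re-attach the target,
# then scale every numeric column
df.knn.cat.new <- data.frame(Show = df.knn$Show, predict(dummy, newdata = df.knn.cat))
df.knn.cat.new.scale <- df.knn.cat.new %>% mutate_if(is.numeric, scale)
class(df.knn.cat.new)
head(df.knn.cat.new)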
## [1] "data.frame"
## Show Gender.M Age Scholarship.1 Hipertension.1 Diabetes.1 Alcoholism.1
## 1 Yes 0 64 0 1 0 0
## 2 Yes 1 58 0 0 0 0
## 3 Yes 0 64 0 0 0 0
## 4 Yes 0 10 0 0 0 0
## 5 Yes 0 58 0 1 1 0
## 6 Yes 0 78 0 1 0 0
## Handcap.1 Handcap.2 Handcap.3 Handcap.4 SMS_received.1
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
set.seed(182)
idx.knn <- sample(nrow(df.knn.cat.new.scale), 0.75 * nrow(df.knn.cat.new.scale))
knn.train <- df.knn.cat.new.scale[idx.knn, ]
knn.test <- df.knn.cat.new.scale[-idx.knn, ]
#Checking the train proportions
prop.table(table(knn.train$Show))
##
## No Yes
## 0.20152 0.79848
#Downsample the majority class so the train data is balanced
knn.train.new <- downSample(x = knn.train[,-1], y = knn.train[,1], yname = "Show")
prop.table(table(knn.train.new$Show))
##
## No Yes
## 0.5 0.5
## 'data.frame': 33410 obs. of 12 variables:
## $ Gender.M : num -0.734 1.363 -0.734 1.363 1.363 ...
## $ Age : num -1.345 -1.605 -0.134 0.689 -0.523 ...
## $ Scholarship.1 : num -0.33 -0.33 -0.33 -0.33 -0.33 ...
## $ Hipertension.1: num -0.496 -0.496 -0.496 -0.496 -0.496 ...
## $ Diabetes.1 : num -0.278 -0.278 -0.278 -0.278 -0.278 ...
## $ Alcoholism.1 : num -0.177 -0.177 -0.177 -0.177 -0.177 ...
## $ Handcap.1 : num -0.137 -0.137 -0.137 -0.137 -0.137 ...
## $ Handcap.2 : num -0.0407 -0.0407 -0.0407 -0.0407 -0.0407 ...
## $ Handcap.3 : num -0.0108 -0.0108 -0.0108 -0.0108 -0.0108 ...
## $ Handcap.4 : num -0.00521 -0.00521 -0.00521 -0.00521 -0.00521 ...
## $ SMS_received.1: num -0.688 1.454 1.454 1.454 1.454 ...
## $ Show : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## [1] 182.784
The value above is the square root of the number of training rows, a common rule of thumb for K; we take K = 183, rounding to an odd number so the two-class vote cannot tie.
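The kNN fit producing the confusion matrix below is not echoed; a sketch of the presumed call, mirroring the performance.knn() helper further down:

library(class)
# fit kNN on the balanced, scaled train set with K = 183
pred.knn <- knn(train = knn.train.new[,-12], test = knn.test[,-1], cl = knn.train.new[,12], k = 183)
confusionMatrix(pred.knn, as.factor(knn.test[,1]), positive = "No")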
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3046 8865
## Yes 2568 13153
##
## Accuracy : 0.5862
## 95% CI : (0.5804, 0.5921)
## No Information Rate : 0.7968
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0987
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5426
## Specificity : 0.5974
## Pos Pred Value : 0.2557
## Neg Pred Value : 0.8367
## Prevalence : 0.2032
## Detection Rate : 0.1102
## Detection Prevalence : 0.4311
## Balanced Accuracy : 0.5700
##
## 'Positive' Class : No
##
performance.knn <- function(x){
  temp <- knn(train = knn.train.new[,-12], test = knn.test[,-1], cl = knn.train.new[,12], k = x)
  a <- confusionMatrix(temp, as.factor(knn.test[,1]), positive = "No")  # use the freshly computed predictions
  return(a$overall[1])  # overall[1] is the accuracy
}
performance.knn(183)
## Accuracy
## 0.5862406
b <- NULL
k.grid <- c(151, 171, 191, 201)
for(x in k.grid){
  b <- c(b, as.numeric(performance.knn(x)))
}
# collect the K values and their accuracies, then plot
k.results <- as.data.frame(cbind(k.grid, b))
plot(k.results[,1], k.results[,2])
We observe no visible change in accuracy across these K values, so we will next try tuning the model by excluding the categorical variables.
## Observations: 110,527
## Variables: 9
## $ Gender <fct> F, M, F, F, F, F, F, F, F, F, F, M, F, M, F, F, M...
## $ Age <int> 64, 58, 64, 10, 58, 78, 25, 41, 23, 21, 32, 31, 2...
## $ Scholarship <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ Hipertension <fct> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Diabetes <fct> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Alcoholism <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Handcap <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ SMS_received <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0...
## $ Show <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Y...
#Pre-processing the data
df.knn.num <- df.knn %>%
  select_if(is.numeric)
df.knn.num$Show <- df.knn$Show
str(df.knn.num)
## 'data.frame': 110527 obs. of 2 variables:
## $ Age : int 64 58 64 10 58 78 25 41 23 21 ...
## $ Show: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 2 2 ...
#Scaling Numerical Data
df.knn.num <- df.knn.num %>%
mutate_if(is.numeric, scale)
#Train-test split (reusing the earlier indices)
knn.num.train <- df.knn.num[idx.knn,]
knn.num.test <- df.knn.num[-idx.knn,]
prop.table(table(knn.num.train$Show))
##
## No Yes
## 0.20152 0.79848
knn.num.train.new <- downSample(x = knn.num.train[,-2], y = knn.num.train[,2], yname = "Show")
prop.table(table(knn.num.train.new$Show))
##
## No Yes
## 0.5 0.5
#Model
#pred.knn.num <- knn(train = knn.num.train.new[,-2], test = knn.num.test[,-2], cl = knn.num.train.new[,2], k = 1001)
With the data we are using, it seems we cannot fit a kNN model on the numerical variables alone. Age is the only numeric predictor, so many observations share identical values and distances, and knn() aborts with a "too many ties" error: with that many equidistant nearest neighbours, the function cannot resolve the vote for the chosen K.

### End tabset
##
## Call:
## glm(formula = Show ~ Gender + Age + Scholarship + Hipertension +
## Diabetes + Alcoholism + SMS_received, family = "binomial",
## data = df.train.new)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.49511 -1.18389 0.01563 1.12990 1.53070
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0262998 0.0134821 -1.951 0.05109 .
## GenderM 0.0213942 0.0119707 1.787 0.07390 .
## Age 0.0069245 0.0002921 23.703 < 2e-16 ***
## Scholarship1 -0.1591030 0.0185342 -8.584 < 2e-16 ***
## Hipertension1 0.0409965 0.0178323 2.299 0.02150 *
## Diabetes1 -0.0682522 0.0246472 -2.769 0.00562 **
## Alcoholism1 -0.0786115 0.0332591 -2.364 0.01810 *
## SMS_received1 -0.6290514 0.0116554 -53.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 183518 on 132379 degrees of freedom
## Residual deviance: 179601 on 132372 degrees of freedom
## AIC: 179617
##
## Number of Fisher Scoring iterations: 4
The model yields the following confusion matrix:
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 390
## Yes 5540 21628
##
## Accuracy : 0.7854
## 95% CI : (0.7805, 0.7902)
## No Information Rate : 0.7968
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0069
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.013181
## Specificity : 0.982287
## Pos Pred Value : 0.159483
## Neg Pred Value : 0.796084
## Prevalence : 0.203170
## Detection Rate : 0.002678
## Detection Prevalence : 0.016792
## Balanced Accuracy : 0.497734
##
## 'Positive' Class : No
##
Applying the inverse logit to each coefficient gives:
glm.coefficient.table <- cbind(names(glm.model$coefficients), round(inv.logit(glm.model$coefficients), 2))
glm.coefficient.table
## [,1] [,2]
## (Intercept) "(Intercept)" "0.49"
## GenderM "GenderM" "0.51"
## Age "Age" "0.5"
## Scholarship1 "Scholarship1" "0.46"
## Hipertension1 "Hipertension1" "0.51"
## Diabetes1 "Diabetes1" "0.48"
## Alcoholism1 "Alcoholism1" "0.48"
## SMS_received1 "SMS_received1" "0.35"
The first column holds the coefficient names and the second the corresponding inverse-logit values.
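Strictly speaking, the inverse logit of a single coefficient is not a change in probability; a more conventional reading is the odds ratio, e.g.:

# exp(coefficient) is the multiplicative change in the odds of showing up
round(exp(coef(glm.model)), 2)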
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3046 8865
## Yes 2568 13153
##
## Accuracy : 0.5862
## 95% CI : (0.5804, 0.5921)
## No Information Rate : 0.7968
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0987
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5426
## Specificity : 0.5974
## Pos Pred Value : 0.2557
## Neg Pred Value : 0.8367
## Prevalence : 0.2032
## Detection Rate : 0.1102
## Detection Prevalence : 0.4311
## Balanced Accuracy : 0.5700
##
## 'Positive' Class : No
##
From the two models obtained, we conclude that the GLM model is the better choice here, as it has higher accuracy and fewer false positives.
As noted earlier, a kNN model on the numerical variables alone is not feasible: with Age as the sole numeric predictor there are too many tied nearest neighbours, and knn() fails with a "too many ties" error.