As the “Eradicate Children Stunting” campaign remains central to Indonesia’s health and human development agenda, this issue will not expire any time soon. A simple automated measure of child growth could help many institutions and health experts handle children’s data across the country.
This report compares two models, Logistic Regression and k-Nearest Neighbor (kNN), to find the better predictor of stunting status. To keep the picture simple, it uses IFLS 5 data from RAND that was already pre-processed in Python (see the repository here).
Without further ado, let’s get started!
pidlink: [int] ID per household
height: [double] Height measure (in cm)
weight: [double] Weight measure (in kg)
sex: [factor] gender
month: [int] Children’s age (in month)
zlen: [double] Height-for-age z-score (dropped before modelling, since it determines the label directly and would make the prediction trivial), with > -2 = normal, -2 to -3 = moderately stunting, < -3 = severely stunting; a sketch of this mapping follows the list
result: [factor] Stunting status, three classes from normal to severely stunting (categorical)
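For reference, the result label can be rebuilt from zlen with cut(). A minimal sketch, assuming the cut-offs above (it can only be run once the data is loaded below):
# Hypothetical check: rebuild the label from the z-score cut-offs
result_check <- cut(stunt$zlen,
                    breaks = c(-Inf, -3, -2, Inf),
                    labels = c("Severely Stunting", "Moderately Stunting", "Normal"))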
library(tidyverse)
library(dplyr)
library(caret)
library(class)

stunt <- read.csv("data/Stunt.csv", stringsAsFactors = TRUE)
str(stunt)
## 'data.frame': 6205 obs. of  7 variables:
## $ pidlink: int 1060010 1060011 1065104 1085107 1224106 1240022 1240023 1250011 1250012 1250013 ...
## $ height : num 89.1 61.9 80.6 87 69.9 ...
## $ weight : num 13.1 5.3 10 11.5 8.5 15.2 8.3 15.8 12.7 9.2 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 1 2 1 ...
## $ month : num 36 11 35 36 0 59 0 59 36 0 ...
## $ zlen : num -1.89 -5.43 -3.65 -2.45 10.57 ...
## $ result : Factor w/ 3 levels "Moderately Stunting",..: 2 3 3 1 2 1 2 2 2 2 ...
stunting <- stunt %>%
  select(-c(zlen, pidlink)) %>%
  mutate(result = car::recode(result, "c('Moderately Stunting', 'Severely Stunting')='Stunting'"))

colSums(is.na(stunting))
## height weight    sex  month result
##      0      0      0      0      0
stunting$result %>%
  table() %>%
  prop.table()
## .
##    Normal  Stunting
## 0.8949234 0.1050766
The classes are clearly imbalanced: normal children make up roughly 89% of the observations, and stunting only about 11%.
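One common remedy, not applied in the rest of this report, is to rebalance the data before training. A minimal sketch with caret::downSample:
# Optional sketch: down-sample the majority class so both classes are equal
set.seed(156)
stunt_down <- downSample(x = stunting %>% select(-result),
                         y = stunting$result,
                         yname = "result")
table(stunt_down$result)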
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(156)
split <- sample(nrow(stunting), nrow(stunting) * 0.75) # 75% of the rows
train <- stunting[split, ]  # 75% training data
test  <- stunting[-split, ] # 25% test data

Check that the class balance is preserved in both subsets:
train$result %>% table() %>% prop.table()
## .
## Normal Stunting
## 0.8957662 0.1042338
test$result %>% table() %>% prop.table()
## .
## Normal Stunting
## 0.8923969 0.1076031
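The random split happens to preserve the proportions well. A stratified split would guarantee this by construction; a minimal sketch with caret::createDataPartition (the plain random split above is kept for the rest of the report):
# Sketch: stratified 75/25 split that preserves the class proportions
set.seed(156)
idx <- createDataPartition(stunting$result, p = 0.75, list = FALSE)
train_strat <- stunting[idx, ]
test_strat  <- stunting[-idx, ]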
lr <- glm(formula = result ~ ., family = "binomial",
          data = train)
summary(lr)
##
## Call:
## glm(formula = result ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.09786 -0.31848 -0.14137 -0.06165 2.56688
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.88147 0.85173 20.994 < 2e-16 ***
## height -0.34601 0.01623 -21.313 < 2e-16 ***
## weight -0.02997 0.03522 -0.851 0.395
## sexM 0.65014 0.12428 5.231 1.68e-07 ***
## month 0.29229 0.01176 24.865 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3110.9 on 4652 degrees of freedom
## Residual deviance: 1812.7 on 4648 degrees of freedom
## AIC: 1822.7
##
## Number of Fisher Scoring iterations: 7
The predictors look promising: height, sex, and age in months are all highly significant, although weight is not (p ≈ 0.40) once height and age are in the model.
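Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to read:
# Interpret coefficients as odds ratios
exp(coef(lr))
# e.g. exp(0.65014) ~ 1.92: holding the other predictors constant,
# boys have roughly 1.9 times the odds of stunting compared to girls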
Note that the classification cutoff is set to 0.05 instead of the default 0.5; this deliberately favours recall on the rare Stunting class at the expense of precision.
test$prob <- round(predict(lr, type = "response", newdata = test), 2)
test$pred <- factor(ifelse(test$prob > 0.05, "Stunting", "Normal"))
test[1:10, c("pred", "result")]

conflr <- confusionMatrix(test$pred, test$result, positive = "Stunting")
conflr
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Stunting
## Normal 1050 6
## Stunting 335 161
##
## Accuracy : 0.7803
## 95% CI : (0.7588, 0.8007)
## No Information Rate : 0.8924
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.387
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9641
## Specificity : 0.7581
## Pos Pred Value : 0.3246
## Neg Pred Value : 0.9943
## Prevalence : 0.1076
## Detection Rate : 0.1037
## Detection Prevalence : 0.3196
## Balanced Accuracy : 0.8611
##
## 'Positive' Class : Stunting
##
aspect <- c("Accuracy", "Recall", "Precision", "Specificity")
logit <- c(conflr$overall[1], conflr$byClass[1], conflr$byClass[3], conflr$byClass[2])
data.frame(aspect, logit)

Precision is low (about 0.32): the 0.05 cutoff flags many normal children as stunting.
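To see the trade-off directly, one can sweep the cutoff and recompute recall and precision at each value. A minimal sketch:
# Sketch: precision/recall trade-off across candidate cutoffs
cutoffs <- seq(0.05, 0.50, by = 0.05)
sapply(cutoffs, function(ct) {
  pred <- factor(ifelse(test$prob > ct, "Stunting", "Normal"),
                 levels = levels(test$result))
  cm <- confusionMatrix(pred, test$result, positive = "Stunting")
  c(cutoff = ct,
    recall = unname(cm$byClass["Sensitivity"]),
    precision = unname(cm$byClass["Pos Pred Value"]))
})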
dm <- dummyVars( ~sex, data = stunting)
res <- data.frame(predict(dm, newdata = stunting))
stunt_dm <- cbind(res, stunting)
str(stunt_dm)
## 'data.frame': 6205 obs. of  7 variables:
## $ sex.F : num 0 0 1 0 0 0 1 1 0 1 ...
## $ sex.M : num 1 1 0 1 1 1 0 0 1 0 ...
## $ height: num 89.1 61.9 80.6 87 69.9 ...
## $ weight: num 13.1 5.3 10 11.5 8.5 15.2 8.3 15.8 12.7 9.2 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 1 2 2 2 1 1 2 1 ...
## $ month : num 36 11 35 36 0 59 0 59 36 0 ...
## $ result: Factor w/ 2 levels "Normal","Stunting": 1 2 2 2 1 2 1 1 1 1 ...
stunt_dm <- stunt_dm %>%
  select(-sex)

set.seed(156)
dm_train <- stunt_dm[split,1:5]
dm_test <- stunt_dm[-split,1:5]
dm_train_label <- stunt_dm[split,6]
dm_test_label <- stunt_dm[-split,6]
train_k<-stunt_dm[split,]
test_k <- stunt_dm[-split, ]

# scale train_x data
dm_train <- scale(dm_train)
# scale test_x data
dm_test <- scale(dm_test,
                 center = attr(dm_train, "scaled:center"),
                 scale = attr(dm_train, "scaled:scale"))

set.seed(156)
ctrl <- trainControl(method="repeatedcv",repeats = 3)
knnFit <- train(result ~ ., data = train_k, method = "knn", trControl = ctrl, preProcess = c("center","scale"),tuneLength = 20)
knnFit
## k-Nearest Neighbors
##
## 4653 samples
## 5 predictor
## 2 classes: 'Normal', 'Stunting'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 4188, 4188, 4188, 4188, 4188, 4187, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9807285 0.8923847
## 7 0.9792250 0.8822822
## 9 0.9764308 0.8656095
## 11 0.9753586 0.8590905
## 13 0.9732801 0.8467934
## 15 0.9718470 0.8359798
## 17 0.9699843 0.8233522
## 19 0.9686231 0.8137722
## 21 0.9671897 0.8030137
## 23 0.9654709 0.7903116
## 25 0.9640388 0.7803930
## 27 0.9619612 0.7670303
## 29 0.9603859 0.7559968
## 31 0.9592394 0.7481305
## 33 0.9581651 0.7399397
## 35 0.9572334 0.7326834
## 37 0.9557300 0.7217378
## 39 0.9548707 0.7141857
## 41 0.9533656 0.7011375
## 43 0.9517890 0.6891457
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
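The tuning profile can also be inspected directly from the caret object:
plot(knnFit)      # accuracy across the candidate k values
knnFit$bestTune   # the chosen k (here, k = 5)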
knn_5 <- knn(train = dm_train, test = dm_test, cl = dm_train_label, k = 5)

confkn <- confusionMatrix(knn_5, as.factor(dm_test_label), positive = "Stunting")
confkn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Stunting
## Normal 1377 26
## Stunting 8 141
##
## Accuracy : 0.9781
## 95% CI : (0.9695, 0.9848)
## No Information Rate : 0.8924
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8803
##
## Mcnemar's Test P-Value : 0.003551
##
## Sensitivity : 0.84431
## Specificity : 0.99422
## Pos Pred Value : 0.94631
## Neg Pred Value : 0.98147
## Prevalence : 0.10760
## Detection Rate : 0.09085
## Detection Prevalence : 0.09601
## Balanced Accuracy : 0.91927
##
## 'Positive' Class : Stunting
##
knn <- c(confkn$overall[1], confkn$byClass[1], confkn$byClass[3], confkn$byClass[2])
data.frame(aspect, logit, knn)

Side by side, kNN wins on accuracy (0.978 vs 0.780), precision (0.946 vs 0.325), and specificity, while logistic regression at its 0.05 cutoff achieves the higher recall (0.964 vs 0.844). If missing a stunted child is the costlier error, the logistic model is the safer screen; otherwise kNN is clearly the stronger classifier overall.