survey 에서 성별을 추측해보는 모델로 작업을 해봅니다.
library(MASS)
survey_tmp = survey
survey_tmp$Sex_Female = (survey_tmp$Sex == "Female")*1
input_ones = survey_tmp[which(survey_tmp$Sex == "Female"),]
input_zeros = survey_tmp[which(survey_tmp$Sex == "Male"),]
저렇게 하면 ones 에 여성이, zeros 에 남성이 들어갑니다
set.seed(100)
input_ones_training_rows = sample(1:nrow(input_ones), 0.7*nrow(input_ones))
input_zeros_training_rows = sample(1:nrow(input_zeros), 0.7*nrow(input_ones))
training_ones = input_ones[input_ones_training_rows, ]
training_zeros = input_zeros[input_zeros_training_rows, ]
trainingData = rbind(training_ones, training_zeros)
test_ones = input_ones[-input_ones_training_rows, ]
test_zeros = input_zeros[-input_zeros_training_rows, ]
testData = rbind(test_ones, test_zeros)
신기하네
이렇게 만들어서 테스트 데이터 는 test_ones 와 test_zeros 를 섞어서 넣고 training_ones 와 zeros 를 만들었다
library(smbinning)
factor_vars = c("W.Hnd", "Fold", "Clap", "Exer", "Smoke", "M.I")
continuous_vars = c("Wr.Hnd", "NW.Hnd", "Pulse", "Height", "Age")
iv_df = data.frame(VARS=c(factor_vars, continuous_vars), IV=numeric(11))
for(factor_var in factor_vars) {
smb = smbinning.factor(trainingData, y="Sex_Female", x=factor_var)
if ( class(smb) != "character") {
iv_df[iv_df$VARS == factor_var, "IV"] = smb$iv
}
}
강제형변환에 의해 생성된 NA 입니다강제형변환에 의해 생성된 NA 입니다
for(continuous_var in continuous_vars ) {
smb = smbinning(trainingData, y="Sex_Female", x=continuous_var)
if ( class(smb) != "character") {
iv_df[iv_df$VARS == continuous_var, "IV"] = smb$iv
}
}
iv_df = iv_df[order(-iv_df$IV), ]
iv_df
몇 가지 정리를 하면..
smbinning 을 돌렸을 때 리턴되는 smb 는, 딱히 매치가 안 된다면 여기서 “No significant split” 이라는 메시지가 나오고, 이것은 class() 를 넣으면 “character” 가 나온다. 말하자면 return -1 같은 느낌으로..
factor 랑 continuous 값들을 나눠서 각각을 따로 분류해서 IV값을 측정하는데, 그건 smbinning.factor() 나 smbinning() 으로 작업하면 각각을 만들어낼 수 있음. 만들어진 iv 값을 기준으로 어떤 값들이 더 가치있는 평가기준인지 확인할 수 있음.
원래대로라면 얘들이 서로서로 관련이 없다는 게 증명되는 부분이 추가되어야 하지 않나?
table(trainingData$Sex_Female)
0 1
82 82
균형은 잘 맞고
plot(density(survey_tmp$Sex_Female, na.rm=TRUE))

그래프를 보면 binomial 인게 확실하니까, 이건 glm 에서 binomial 로 + link 는 logit 으로 파악하면 된다. IV 테이블 상위 5개는 아래와 같으니 얘들을 그대로 쓰기로 하고
10 Height 2.3670
5 Smoke 0.0858
2 Fold 0.0244
3 Clap 0.0236
4 Exer 0.0094
model = glm( Sex_Female ~ Height + Smoke + Fold + Clap + Exer, data=trainingData, family=binomial(link="logit"))
testData$predicted = predict(model, testData, type="response")
summary(model)
Call:
glm(formula = Sex_Female ~ Height + Smoke + Fold + Clap + Exer,
family = binomial(link = "logit"), data = trainingData)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.19285 -0.36062 0.03434 0.52549 1.93776
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 57.28964 10.42630 5.495 0.0000000391 ***
Height -0.32045 0.05791 -5.534 0.0000000313 ***
SmokeNever -0.75935 1.08505 -0.700 0.4840
SmokeOccas -1.76592 1.36317 -1.295 0.1952
SmokeRegul -3.78333 3.48797 -1.085 0.2781
FoldNeither -2.28074 1.11378 -2.048 0.0406 *
FoldR on L 0.22817 0.54579 0.418 0.6759
ClapNeither -0.93497 0.86461 -1.081 0.2795
ClapRight -1.02665 0.79276 -1.295 0.1953
ExerNone -2.24143 0.87394 -2.565 0.0103 *
ExerSome -0.65517 0.64503 -1.016 0.3098
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 201.006 on 144 degrees of freedom
Residual deviance: 99.102 on 134 degrees of freedom
(19 observations deleted due to missingness)
AIC: 121.1
Number of Fisher Scoring iterations: 7
library(car)
sqrt(vif(model)) > 2
GVIF Df GVIF^(1/(2*Df))
Height FALSE FALSE FALSE
Smoke FALSE FALSE FALSE
Fold FALSE FALSE FALSE
Clap FALSE FALSE FALSE
Exer FALSE FALSE FALSE
아까 3번에서 관계있는 거 가려내야 한다 했는데, 보니 쌍방 딱히 선형관계는 없는 모양이다.
library(InformationValue)
not_na_testData = testData[ complete.cases(testData['predicted']) , ]
threshold = optimalCutoff(not_na_testData$Sex_Female, not_na_testData$predicted)
misClassError(not_na_testData$Sex_Female, not_na_testData$predicted, threshold = threshold)
[1] 0.129
plotROC(testData$Sex_Female, testData$predicted)

커브가 대충 저렇게 나온다 뭐 이런 결론이 나옵니다.
AUROC 는 ROC 커브 아랫쪽 면적의 계산 (그러니까, 82%정도 맞추네요)

그럼 저기서 결론은
Height, SmokeNever, SmokeOccas, SmokeRegul, FoldNeither ExerNone ExerSome NW.Hnd
이렇게가 의미가 있다는 얘긴 것 같고
model = glm( Sex_Female ~ Height + Smoke + Fold + Exer + NW.Hnd, data=trainingData, family=binomial(link="logit"))
testData$predicted = predict(model, testData, type="response")
library(InformationValue)
plotROC(testData$Sex_Female, testData$predicted)

오 진짜네 조금 올랐네
참고로 기존에는 0.8271 이었습니다 8271 > 841
역시 사람보다 기계가 해야 (…)
LS0tCnRpdGxlOiAiUiBTdHVkeSA1IC0gR0xNIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpzdXJ2ZXkg7JeQ7IScIOyEseuzhOydhCDstpTsuKHtlbTrs7TripQg66qo642466GcIOyekeyXheydhCDtlbTrtIXri4jri6QuCgpgYGB7cn0KbGlicmFyeShNQVNTKQpzdXJ2ZXlfdG1wID0gc3VydmV5CnN1cnZleV90bXAkU2V4X0ZlbWFsZSA9IChzdXJ2ZXlfdG1wJFNleCA9PSAiRmVtYWxlIikqMQppbnB1dF9vbmVzID0gc3VydmV5X3RtcFt3aGljaChzdXJ2ZXlfdG1wJFNleCA9PSAiRmVtYWxlIiksXQppbnB1dF96ZXJvcyA9IHN1cnZleV90bXBbd2hpY2goc3VydmV5X3RtcCRTZXggPT0gIk1hbGUiKSxdCmBgYAoK7KCA66CH6rKMIO2VmOuptCBvbmVzIOyXkCDsl6zshLHsnbQsIHplcm9zIOyXkCDrgqjshLHsnbQg65Ok7Ja06rCR64uI64ukCgpgYGB7cn0Kc2V0LnNlZWQoMTAwKQppbnB1dF9vbmVzX3RyYWluaW5nX3Jvd3MgPSBzYW1wbGUoMTpucm93KGlucHV0X29uZXMpLCAwLjcqbnJvdyhpbnB1dF9vbmVzKSkKaW5wdXRfemVyb3NfdHJhaW5pbmdfcm93cyA9IHNhbXBsZSgxOm5yb3coaW5wdXRfemVyb3MpLCAwLjcqbnJvdyhpbnB1dF9vbmVzKSkKdHJhaW5pbmdfb25lcyA9IGlucHV0X29uZXNbaW5wdXRfb25lc190cmFpbmluZ19yb3dzLCBdCnRyYWluaW5nX3plcm9zID0gaW5wdXRfemVyb3NbaW5wdXRfemVyb3NfdHJhaW5pbmdfcm93cywgXQp0cmFpbmluZ0RhdGEgPSByYmluZCh0cmFpbmluZ19vbmVzLCB0cmFpbmluZ196ZXJvcykKCnRlc3Rfb25lcyA9IGlucHV0X29uZXNbLWlucHV0X29uZXNfdHJhaW5pbmdfcm93cywgXQp0ZXN0X3plcm9zID0gaW5wdXRfemVyb3NbLWlucHV0X3plcm9zX3RyYWluaW5nX3Jvd3MsIF0KdGVzdERhdGEgPSByYmluZCh0ZXN0X29uZXMsIHRlc3RfemVyb3MpCmBgYAoK7Iug6riw7ZWY64SkCgrsnbTroIfqsowg66eM65Ok7Ja07IScIO2FjOyKpO2KuCDrjbDsnbTthLAg64qUIHRlc3Rfb25lcyDsmYAgdGVzdF96ZXJvcyDrpbwg7ISe7Ja07IScIOuEo+qzoAp0cmFpbmluZ19vbmVzIOyZgCB6ZXJvcyDrpbwg66eM65Ok7JeI64ukIAoKYGBge3J9CmxpYnJhcnkoc21iaW5uaW5nKQpmYWN0b3JfdmFycyA9IGMoIlcuSG5kIiwgIkZvbGQiLCAiQ2xhcCIsICJFeGVyIiwgIlNtb2tlIiwgIk0uSSIpCmNvbnRpbnVvdXNfdmFycyA9IGMoIldyLkhuZCIsICJOVy5IbmQiLCAiUHVsc2UiLCAiSGVpZ2h0IiwgIkFnZSIpCml2X2RmID0gZGF0YS5mcmFtZShWQVJTPWMoZmFjdG9yX3ZhcnMsIGNvbnRpbnVvdXNfdmFycyksIElWPW51bWVyaWMoMTEpKQoKZm9yKGZhY3Rvcl92YXIgaW4gZmFjdG9yX3ZhcnMpIHsKICBzbWIgPSBzbWJpbm5pbmcuZmFjdG9yKHRyYWluaW5nRGF0YSwgeT0iU2V4X0ZlbWFsZSIsIHg9ZmFjdG9yX3ZhcikKICBpZiAoIGNsYXNzKHNtYikgIT0gImNoYXJhY3RlciIpIHsKICAgIGl2X2RmW2l2X2RmJFZBUlMgPT0gZmFjdG9yX3ZhciwgIklWIl0gPSBzbWIkaXYKICB9Cn0KCmZvcihjb250aW51b3VzX3ZhciBpbiBjb250aW51b3VzX3ZhcnMgKSB7CiAgc21iID0gc21iaW5uaW5nKHRyYWluaW5nRGF0YSwgeT0iU2V4X0ZlbWFsZSIsIHg9Y29udGludW91c192YXIpIAogIGlmICggY2xhc3Moc21iKSAhPSAiY2hhcmFjdGVyIikgeyAKICAgIGl2X2RmW2l2X2RmJFZBUlMgPT0gY29udGludW91c192YXIsICJJViJdID0gc21iJGl2CiAgfQp9Cgppdl9kZiA9IGl2X2RmW29yZGVyKC1pdl9kZiRJViksIF0KaXZfZGYKCmBgYAoK66qHIOqwgOyngCDsoJXrpqzrpbwg7ZWY66m0Li4KCjEpIHNtYmlubmluZyDsnYQg64+M66C47J2EIOuVjCDrpqzthLTrkJjripQgc21iIOuKlCwg65Sx7Z6IIOunpOy5mOqwgCDslYgg65Cc64uk66m0IOyXrOq4sOyEnCAiTm8gc2lnbmlmaWNhbnQgc3BsaXQiIOydtOudvOuKlCDrqZTsi5zsp4DqsIAg64KY7Jik6rOgLCDsnbTqsoPsnYAgY2xhc3MoKSDrpbwg64Sj7Jy866m0ICJjaGFyYWN0ZXIiIOqwgCDrgpjsmKjri6QuIOunkO2VmOyekOuptCByZXR1cm4gLTEg6rCZ7J2AIOuKkOuCjOycvOuhnC4uCgoyKSBmYWN0b3Ig656RIGNvbnRpbnVvdXMg6rCS65Ok7J2EIOuCmOuIoOyEnCDqsIHqsIHsnYQg65Sw66GcIOu2hOulmO2VtOyEnCBJVuqwkuydhCDsuKHsoJXtlZjripTrjbAsIOq3uOqxtCBzbWJpbm5pbmcuZmFjdG9yKCkg64KYIHNtYmlubmluZygpIOycvOuhnCDsnpHsl4XtlZjrqbQg6rCB6rCB7J2EIOunjOuTpOyWtOuCvCDsiJgg7J6I7J2MLiDrp4zrk6TslrTsp4QgaXYg6rCS7J2EIOq4sOykgOycvOuhnCDslrTrlqQg6rCS65Ok7J20IOuNlCDqsIDsuZjsnojripQg7Y+J6rCA6riw7KSA7J247KeAIO2ZleyduO2VoCDsiJgg7J6I7J2MLgoKMykg7JuQ656Y64yA66Gc652866m0IOyWmOuTpOydtCDshJzroZzshJzroZwg6rSA66Co7J20IOyXhuuLpOuKlCDqsowg7Kad66qF65CY64qUIOu2gOu2hOydtCDstpTqsIDrkJjslrTslbwg7ZWY7KeAIOyViuuCmD8gCgpgYGB7cn0KdGFibGUodHJhaW5pbmdEYXRhJFNleF9GZW1hbGUpCmBgYAoK6reg7ZiV7J2AIOyemCDrp57qs6AgCgpgYGB7cn0KcGxvdChkZW5zaXR5KHN1cnZleV90bXAkU2V4X0ZlbWFsZSwgbmEucm09VFJVRSkpCmBgYAoK6re4656Y7ZSE66W8IOuztOuptCBiaW5vbWlhbCDsnbjqsowg7ZmV7Iuk7ZWY64uI6rmMLCDsnbTqsbQgZ2xtIOyXkOyEnCBiaW5vbWlhbCDroZwgKyBsaW5rIOuKlCBsb2dpdCDsnLzroZwg7YyM7JWF7ZWY66m0IOuQnOuLpC4gSVYg7YWM7J2067iUIOyDgeychCA16rCc64qUIOyVhOuemOyZgCDqsJnsnLzri4gg7JaY65Ok7J2EIOq3uOuMgOuhnCDsk7DquLDroZwg7ZWY6rOgIAogCjEwCUhlaWdodAkyLjM2NzAJCQo1CVNtb2tlCTAuMDg1OAkJCjIJRm9sZAkwLjAyNDQJCQozCUNsYXAJMC4wMjM2CQkKNAlFeGVyCTAuMDA5NAkKCmBgYHtyfQptb2RlbCA9IGdsbSggU2V4X0ZlbWFsZSB+IEhlaWdodCArIFNtb2tlICsgRm9sZCArIENsYXAgKyBFeGVyLCBkYXRhPXRyYWluaW5nRGF0YSwgZmFtaWx5PWJpbm9taWFsKGxpbms9ImxvZ2l0IikpCnRlc3REYXRhJHByZWRpY3RlZCA9IHByZWRpY3QobW9kZWwsIHRlc3REYXRhLCB0eXBlPSJyZXNwb25zZSIpCnN1bW1hcnkobW9kZWwpCmBgYAoKYGBge3J9CmxpYnJhcnkoY2FyKQpzcXJ0KHZpZihtb2RlbCkpID4gMiAKYGBgCgrslYTquYwgM+uyiOyXkOyEnCDqtIDqs4TsnojripQg6rGwIOqwgOugpOuCtOyVvCDtlZzri6Qg7ZaI64qU642wLCDrs7Tri4gg7IyN67CpIOuUse2eiCDshKDtmJXqtIDqs4TripQg7JeG64qUIOuqqOyWkeydtOuLpC4KCmBgYHtyfQpsaWJyYXJ5KEluZm9ybWF0aW9uVmFsdWUpCm5vdF9uYV90ZXN0RGF0YSA9IHRlc3REYXRhWyBjb21wbGV0ZS5jYXNlcyh0ZXN0RGF0YVsncHJlZGljdGVkJ10pICwgXQp0aHJlc2hvbGQgPSBvcHRpbWFsQ3V0b2ZmKG5vdF9uYV90ZXN0RGF0YSRTZXhfRmVtYWxlLCBub3RfbmFfdGVzdERhdGEkcHJlZGljdGVkKQptaXNDbGFzc0Vycm9yKG5vdF9uYV90ZXN0RGF0YSRTZXhfRmVtYWxlLCBub3RfbmFfdGVzdERhdGEkcHJlZGljdGVkLCB0aHJlc2hvbGQgPSB0aHJlc2hvbGQpCmBgYAoKYGBge3J9CnBsb3RST0ModGVzdERhdGEkU2V4X0ZlbWFsZSwgdGVzdERhdGEkcHJlZGljdGVkKQpgYGAKCuy7pOu4jOqwgCDrjIDstqkg7KCA66CH6rKMIOuCmOyYqOuLpCDrrZAg7J2065+wIOqysOuhoOydtCDrgpjsmLXri4jri6QuIAoKCkFVUk9DIOuKlCBST0Mg7Luk67iMIOyVhOueq+yqvSDrqbTsoIHsnZgg6rOE7IKwICjqt7jrn6zri4jquYwsIDgyJeygleuPhCDrp57stpTrhKTsmpQpCgoKYGBge3J9CmxpYnJhcnkobGVhcHMpCmFsbF9zdWJzZXRfcmVncmVzc2lvbiA8LSByZWdzdWJzZXRzKAogIFNleF9GZW1hbGUgfiBIZWlnaHQgKyBTbW9rZSArIEZvbGQgKyBDbGFwICsgRXhlciArIFcuSG5kICsgTS5JICsgV3IuSG5kICsgTlcuSG5kICsgUHVsc2UgKyBBZ2UsIGRhdGE9dHJhaW5pbmdEYXRhLCBuYmVzdCA9IDIKKQpwbG90KGFsbF9zdWJzZXRfcmVncmVzc2lvbiwgc2NhbGU9ImFkanIyIikKYGBgCgrqt7jrn7wg7KCA6riw7IScIOqysOuhoOydgAoKSGVpZ2h0LCBTbW9rZU5ldmVyLCBTbW9rZU9jY2FzLCBTbW9rZVJlZ3VsLCBGb2xkTmVpdGhlciBFeGVyTm9uZSBFeGVyU29tZSBOVy5IbmQKCuydtOugh+qyjOqwgCDsnZjrr7jqsIAg7J6I64uk64qUIOyWmOq4tCDqsoMg6rCZ6rOgIAoKYGBge3J9Cm1vZGVsID0gZ2xtKCBTZXhfRmVtYWxlIH4gSGVpZ2h0ICsgU21va2UgKyBGb2xkICsgRXhlciArIE5XLkhuZCwgZGF0YT10cmFpbmluZ0RhdGEsIGZhbWlseT1iaW5vbWlhbChsaW5rPSJsb2dpdCIpKQp0ZXN0RGF0YSRwcmVkaWN0ZWQgPSBwcmVkaWN0KG1vZGVsLCB0ZXN0RGF0YSwgdHlwZT0icmVzcG9uc2UiKQpsaWJyYXJ5KEluZm9ybWF0aW9uVmFsdWUpCnBsb3RST0ModGVzdERhdGEkU2V4X0ZlbWFsZSwgdGVzdERhdGEkcHJlZGljdGVkKQpgYGAKCuyYpCDsp4Tsp5zrhKQg7KGw6riIIOyYrOuekOuEpCAKCuywuOqzoOuhnCDquLDsobTsl5DripQgMC44MjcxIOydtOyXiOyKteuLiOuLpCA4MjcxID4gODQxIAoK7Jet7IucIOyCrOuejOuztOuLpCDquLDqs4TqsIAg7ZW07JW8ICguLi4pCgo=