지도학습(Supervised Leaning): 주어진 설명변수로부터 반응변수를 예측해내는 작업분류분석(Classification): 주어진 입력변수에 근거하여 범주형 반응변수를 예측하는 작업이다.회귀분서(Regression Prediction): 연속형과 수치형 반응변수를 예측하는 작업분류분석 문제 접근법 요약
분류분석 문제 접근법 세부
분류분석에 필요한 패키지는 다음과 같다
#install.packages(c("dplyr", "ggplot2", "ISLR", "MASS", "glmnet","randomeForest", "gbm", "rpart", "boot"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ISLR)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(rpart)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:survival':
##
## aml
관측치: n = 32561, 변수개수: 15이고 결측치를 포함한 데이터다. 분류분석의 목적은 13개 설명변수에 근거해서 연소득 $50k가 넘는지를 예측해내는 것이다.
#url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
adult <- read.csv("data/adult.data", header = F, strip.white = T)
# strip.white = T 옵션은 데이터 파일에서 콤마 다음의 공백문자를 제거한다.
#url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
# adult_names <- read.csv(url, header = F, strip.white = T)
names(adult) <- c("age", "workclass", "fnlwgt", "education",
"education_num", "marial_status", "occupation",
"relationship", "race", "sex", "capital_gain",
"capital_loss", "hours_per_week", "natice_country", "wage")
glimpse(adult)
## Observations: 32,561
## Variables: 15
## $ age <int> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30,...
## $ workclass <fctr> State-gov, Self-emp-not-inc, Private, Private,...
## $ fnlwgt <int> 77516, 83311, 215646, 234721, 338409, 284582, 1...
## $ education <fctr> Bachelors, Bachelors, HS-grad, 11th, Bachelors...
## $ education_num <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13,...
## $ marial_status <fctr> Never-married, Married-civ-spouse, Divorced, M...
## $ occupation <fctr> Adm-clerical, Exec-managerial, Handlers-cleane...
## $ relationship <fctr> Not-in-family, Husband, Not-in-family, Husband...
## $ race <fctr> White, White, White, Black, Black, White, Blac...
## $ sex <fctr> Male, Male, Male, Male, Female, Female, Female...
## $ capital_gain <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0...
## $ capital_loss <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hours_per_week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40,...
## $ natice_country <fctr> United-States, United-States, United-States, U...
## $ wage <fctr> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, <=50...
summary(adult)
## age workclass fnlwgt
## Min. :17.00 Private :22696 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2541 1st Qu.: 117827
## Median :37.00 Local-gov : 2093 Median : 178356
## Mean :38.58 ? : 1836 Mean : 189778
## 3rd Qu.:48.00 State-gov : 1298 3rd Qu.: 237051
## Max. :90.00 Self-emp-inc : 1116 Max. :1484705
## (Other) : 981
## education education_num marial_status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## occupation relationship race
## Prof-specialty :4140 Husband :13193 Amer-Indian-Eskimo: 311
## Craft-repair :4099 Not-in-family : 8305 Asian-Pac-Islander: 1039
## Exec-managerial:4066 Other-relative: 981 Black : 3124
## Adm-clerical :3770 Own-child : 5068 Other : 271
## Sales :3650 Unmarried : 3446 White :27816
## Other-service :3295 Wife : 1568
## (Other) :9541
## sex capital_gain capital_loss hours_per_week
## Female:10771 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :21790 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1078 Mean : 87.3 Mean :40.44
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :4356.0 Max. :99.00
##
## natice_country wage
## United-States:29170 <=50K:24720
## Mexico : 643 >50K : 7841
## ? : 583
## Philippines : 198
## Germany : 137
## Canada : 121
## (Other) : 1709
levels(adult$wage)
## [1] "<=50K" ">50K"
levels(adult$race)
## [1] "Amer-Indian-Eskimo" "Asian-Pac-Islander" "Black"
## [4] "Other" "White"
levels(adult$sex)
## [1] "Female" "Male"
R에서 model.matrix 함수를 사용하면 주어진 범주형 변수로부터 모형행렬을 생성할 수 있다.
x <- model.matrix( ~ race + sex + age, adult)
glimpse(x)
## num [1:32561, 1:7] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:32561] "1" "2" "3" "4" ...
## ..$ : chr [1:7] "(Intercept)" "raceAsian-Pac-Islander" "raceBlack" "raceOther" ...
## - attr(*, "assign")= int [1:7] 0 1 1 1 1 2 3
## - attr(*, "contrasts")=List of 2
## ..$ race: chr "contr.treatment"
## ..$ sex : chr "contr.treatment"
colnames(x)
## [1] "(Intercept)" "raceAsian-Pac-Islander"
## [3] "raceBlack" "raceOther"
## [5] "raceWhite" "sexMale"
## [7] "age"
x_orig <- adult %>% dplyr::select(sex, race, age)
#View(x_orig)
head(x_orig)
## sex race age
## 1 Male White 39
## 2 Male White 50
## 3 Male White 38
## 4 Male Black 53
## 5 Female Black 28
## 6 Female White 37
x_mod <- model.matrix( ~ sex + race + age, adult)
#View(x_mod)
head(x_mod)
## (Intercept) sexMale raceAsian-Pac-Islander raceBlack raceOther raceWhite
## 1 1 1 0 0 0 1
## 2 1 1 0 0 0 1
## 3 1 1 0 0 0 1
## 4 1 1 0 1 0 0
## 5 1 0 0 1 0 0
## 6 1 0 0 0 0 1
## age
## 1 39
## 2 50
## 3 38
## 4 53
## 5 28
## 6 37
model.matrix를 사요하면 절편항(intercept)이 자동을 생성된다. race 변수 자체는 첫 번째 레벨인 Amr-Indian_Eskimo를 제외한 나머지 k - 1 = 4 레벨들로 이루어진 열들이 생성되었다. sex 변수도 처 번재 레벨인 Female을 제외한 k - 1 = 1 레벨로 이루어진 열이 생성되었다. 하지만, 수치형 변수였던 age는 그대로 하나의 열로 표현된다.
여기서 실제 p를 찾아내도록 하자. model.matrix 함수를 직접 실행하면 된다.
x <- model.matrix( ~ . - wage, adult)
dim(x)
## [1] 32561 101
즉, adult 데이터의 실제 p = 101 이다.
예측 모형의 일반화 능력을 제대로 평가하기 위해서는 테스트세트가 필요하며, 이 테스트세트는 최종적으로 모형의 성능평가에만 사용되어야 하고, 모형 개발 과정에서는 절대 사용되어서는 안된다.
adult 데이터를 훈련/검증/테스트 세트로 나누는 작업
set.seed(1601)
n <- nrow(adult)
idx <- 1:n
training_idx <- sample(idx, n * .60)
idx <- setdiff(idx, training_idx)
validate_idx <-sample(idx, n * .20)
test_idx <- setdiff(idx, validate_idx)
length(training_idx)
## [1] 19536
length(validate_idx)
## [1] 6512
length(test_idx)
## [1] 6513
training <- adult[training_idx, ]
validation <- adult[validate_idx, ]
test <- adult[test_idx, ]
set.seed() 명령을 사용한 이유는, 다시 실행해도 동일한 훈련/검증/테스트세트를 얻기 위해서다. 재현 가능한(reporducible) 연구의 기본이다. 시드 값 자체는 큰 의미가 없다. (예: 연구 시의 연도/월수의 결합으로 사용 1711(2017년 11월))
나이와 중산층 여부의 관계시각화
training %>%
ggplot(aes(age, fill=wage)) + geom_density(alpha=.5) +
ggtitle("나이와 중산층 여부의 관게") + xlab("연령") + ylab("밀도") +
theme(plot.title = element_text(hjust = 0.5))
중산층 이상의 수입 여부와 나이의 관계는 꼭 선형적이 않다. 이런 비선형적인 관계를 반영하는 방법은 크게 랜덤 포레스트나 gbm 같은 비모수적인 방법을 사용하거나 또는 로지스티이나 라쏘등의 선형 모형을 사용하되, 비선형 함수로 변환된 혹은 범주화된(10대, 20대, 30대…등) 설명변수를 사용하는 것이다. 이렇듯 시각화는 설명변수와 반응변수 간의 비선형적 관계를 찾아내는 데 사용될 수 있다
인종, 셩별, 나이 세 변수와 중산층 여부와의 관계 시각화
library(trelliscopejs)
training %>%
filter(race %in% c("Black", "White")) %>%
ggplot(aes(age, fill = wage)) +
geom_density(alpha=.5) + ylim(0, .1) +
# facet_grid(race ~ sex, scales = "free_y") +
facet_trelliscope(race ~ sex, scales = "free_y",
nrow = 2, ncol = 2, self_contained = T) +
ggtitle("인종, 성별 나이와 중산층여부의 관게") +
xlab("연령") + ylab("밀도") +
theme(plot.title = element_text(hjust = 0.5))
## using data from the first layer
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2017c.1.0/
## zoneinfo/Asia/Seoul'
이 시각화를 통한 시사점
총 교육기간에 따른 중산층 관계 시각화
training %>%
ggplot(aes(education_num, fill = wage)) + geom_bar() +
ggtitle("총 교육기간에 따른 중산층 관계") +
xlab("교육기간") + ylab("분포") +
theme(plot.title = element_text(hjust = 0.5))
설명변수가 하나일 때의 분류분석을 위한 로지스틱 회귀 모형은 glm(, family = binomial) 사용
ad_glm_full <- glm(wage ~ ., data = training, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(ad_glm_full)
##
## Call:
## glm(formula = wage ~ ., family = binomial(), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2870 -0.5041 -0.1828 -0.0207 3.4502
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -8.734e+00 5.640e-01 -15.486
## age 2.575e-02 2.120e-03 12.143
## workclassFederal-gov 9.437e-01 1.954e-01 4.830
## workclassLocal-gov 2.282e-01 1.785e-01 1.278
## workclassNever-worked -1.015e+01 1.141e+03 -0.009
## workclassPrivate 5.057e-01 1.582e-01 3.197
## workclassSelf-emp-inc 6.243e-01 1.899e-01 3.287
## workclassSelf-emp-not-inc -2.928e-02 1.743e-01 -0.168
## workclassState-gov 6.758e-02 1.960e-01 0.345
## workclassWithout-pay -1.429e+01 7.135e+02 -0.020
## fnlwgt 6.439e-07 2.221e-07 2.899
## education11th 2.584e-01 2.619e-01 0.987
## education12th 4.079e-01 3.414e-01 1.195
## education1st-4th -1.778e-01 5.627e-01 -0.316
## education5th-6th 6.362e-02 4.099e-01 0.155
## education7th-8th -4.160e-01 3.048e-01 -1.365
## education9th -1.140e-02 3.352e-01 -0.034
## educationAssoc-acdm 1.349e+00 2.235e-01 6.036
## educationAssoc-voc 1.362e+00 2.142e-01 6.358
## educationBachelors 1.934e+00 1.992e-01 9.711
## educationDoctorate 2.984e+00 2.776e-01 10.748
## educationHS-grad 8.666e-01 1.940e-01 4.467
## educationMasters 2.292e+00 2.131e-01 10.753
## educationPreschool -1.266e+01 3.786e+02 -0.033
## educationProf-school 2.858e+00 2.576e-01 11.093
## educationSome-college 1.201e+00 1.971e-01 6.096
## education_num NA NA NA
## marial_statusMarried-AF-spouse 2.049e+00 7.307e-01 2.804
## marial_statusMarried-civ-spouse 1.905e+00 3.458e-01 5.508
## marial_statusMarried-spouse-absent 1.724e-01 2.731e-01 0.632
## marial_statusNever-married -4.719e-01 1.134e-01 -4.160
## marial_statusSeparated -3.848e-01 2.433e-01 -1.582
## marial_statusWidowed 2.715e-01 1.882e-01 1.443
## occupationAdm-clerical 2.391e-01 1.276e-01 1.874
## occupationArmed-Forces -1.389e+01 8.417e+02 -0.017
## occupationCraft-repair 2.444e-01 1.093e-01 2.236
## occupationExec-managerial 8.752e-01 1.125e-01 7.778
## occupationFarming-fishing -8.930e-01 1.840e-01 -4.854
## occupationHandlers-cleaners -4.670e-01 1.843e-01 -2.534
## occupationMachine-op-inspct -1.403e-01 1.368e-01 -1.025
## occupationOther-service -9.013e-01 1.677e-01 -5.376
## occupationPriv-house-serv -1.388e+01 2.226e+02 -0.062
## occupationProf-specialty 7.090e-01 1.212e-01 5.851
## occupationProtective-serv 6.134e-01 1.681e-01 3.648
## occupationSales 4.767e-01 1.161e-01 4.107
## occupationTech-support 6.970e-01 1.566e-01 4.452
## occupationTransport-moving NA NA NA
## relationshipNot-in-family 2.645e-01 3.417e-01 0.774
## relationshipOther-relative -6.659e-01 3.157e-01 -2.109
## relationshipOwn-child -9.208e-01 3.479e-01 -2.647
## relationshipUnmarried 5.198e-02 3.627e-01 0.143
## relationshipWife 1.416e+00 1.330e-01 10.640
## raceAsian-Pac-Islander 6.903e-01 3.392e-01 2.035
## raceBlack 1.943e-01 2.969e-01 0.654
## raceOther -9.615e-02 4.648e-01 -0.207
## raceWhite 4.758e-01 2.813e-01 1.692
## sexMale 8.071e-01 1.024e-01 7.885
## capital_gain 3.175e-04 1.325e-05 23.964
## capital_loss 6.921e-04 4.923e-05 14.059
## hours_per_week 3.173e-02 2.110e-03 15.040
## natice_countryCambodia 1.120e+00 7.861e-01 1.425
## natice_countryCanada 5.295e-01 3.867e-01 1.369
## natice_countryChina -5.297e-01 5.361e-01 -0.988
## natice_countryColumbia -1.409e+01 3.272e+02 -0.043
## natice_countryCuba 6.667e-01 4.097e-01 1.627
## natice_countryDominican-Republic -1.342e+01 3.095e+02 -0.043
## natice_countryEcuador 1.627e-01 1.091e+00 0.149
## natice_countryEl-Salvador -4.377e-01 6.882e-01 -0.636
## natice_countryEngland 8.166e-01 4.302e-01 1.898
## natice_countryFrance 9.328e-01 7.589e-01 1.229
## natice_countryGermany 3.355e-01 3.666e-01 0.915
## natice_countryGreece -7.823e-01 7.519e-01 -1.040
## natice_countryGuatemala 9.323e-01 7.937e-01 1.175
## natice_countryHaiti 9.005e-01 1.004e+00 0.897
## natice_countryHoland-Netherlands -1.237e+01 2.400e+03 -0.005
## natice_countryHonduras -1.233e+01 8.411e+02 -0.015
## natice_countryHong 3.550e-01 8.971e-01 0.396
## natice_countryHungary 4.433e-01 8.424e-01 0.526
## natice_countryIndia -2.182e-01 4.244e-01 -0.514
## natice_countryIran 3.411e-01 5.899e-01 0.578
## natice_countryIreland 5.652e-03 1.260e+00 0.004
## natice_countryItaly 7.555e-01 4.445e-01 1.700
## natice_countryJamaica -6.460e-01 8.046e-01 -0.803
## natice_countryJapan 1.006e+00 5.325e-01 1.889
## natice_countryLaos -3.027e-01 9.142e-01 -0.331
## natice_countryMexico -3.843e-01 3.239e-01 -1.187
## natice_countryNicaragua -3.550e-01 1.091e+00 -0.325
## natice_countryOutlying-US(Guam-USVI-etc) -1.344e+01 7.305e+02 -0.018
## natice_countryPeru -1.940e-01 1.146e+00 -0.169
## natice_countryPhilippines 8.714e-01 3.647e-01 2.389
## natice_countryPoland 4.134e-01 5.189e-01 0.797
## natice_countryPortugal -5.458e-01 8.782e-01 -0.622
## natice_countryPuerto-Rico 4.654e-01 4.680e-01 0.994
## natice_countryScotland 8.010e-01 1.113e+00 0.720
## natice_countrySouth 1.200e-01 5.241e-01 0.229
## natice_countryTaiwan 3.794e-01 6.336e-01 0.599
## natice_countryThailand -3.116e-01 1.125e+00 -0.277
## natice_countryTrinadad&Tobago 1.004e+00 1.083e+00 0.927
## natice_countryUnited-States 4.244e-01 1.766e-01 2.403
## natice_countryVietnam -1.389e+00 8.446e-01 -1.644
## natice_countryYugoslavia -7.384e-01 1.190e+00 -0.620
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## workclassFederal-gov 1.37e-06 ***
## workclassLocal-gov 0.201247
## workclassNever-worked 0.992904
## workclassPrivate 0.001388 **
## workclassSelf-emp-inc 0.001013 **
## workclassSelf-emp-not-inc 0.866614
## workclassState-gov 0.730183
## workclassWithout-pay 0.984026
## fnlwgt 0.003742 **
## education11th 0.323823
## education12th 0.232279
## education1st-4th 0.752045
## education5th-6th 0.876661
## education7th-8th 0.172277
## education9th 0.972876
## educationAssoc-acdm 1.58e-09 ***
## educationAssoc-voc 2.04e-10 ***
## educationBachelors < 2e-16 ***
## educationDoctorate < 2e-16 ***
## educationHS-grad 7.94e-06 ***
## educationMasters < 2e-16 ***
## educationPreschool 0.973319
## educationProf-school < 2e-16 ***
## educationSome-college 1.09e-09 ***
## education_num NA
## marial_statusMarried-AF-spouse 0.005046 **
## marial_statusMarried-civ-spouse 3.64e-08 ***
## marial_statusMarried-spouse-absent 0.527708
## marial_statusNever-married 3.18e-05 ***
## marial_statusSeparated 0.113731
## marial_statusWidowed 0.149064
## occupationAdm-clerical 0.060936 .
## occupationArmed-Forces 0.986830
## occupationCraft-repair 0.025374 *
## occupationExec-managerial 7.38e-15 ***
## occupationFarming-fishing 1.21e-06 ***
## occupationHandlers-cleaners 0.011282 *
## occupationMachine-op-inspct 0.305346
## occupationOther-service 7.64e-08 ***
## occupationPriv-house-serv 0.950290
## occupationProf-specialty 4.88e-09 ***
## occupationProtective-serv 0.000264 ***
## occupationSales 4.02e-05 ***
## occupationTech-support 8.52e-06 ***
## occupationTransport-moving NA
## relationshipNot-in-family 0.438956
## relationshipOther-relative 0.034952 *
## relationshipOwn-child 0.008133 **
## relationshipUnmarried 0.886041
## relationshipWife < 2e-16 ***
## raceAsian-Pac-Islander 0.041840 *
## raceBlack 0.512897
## raceOther 0.836119
## raceWhite 0.090714 .
## sexMale 3.15e-15 ***
## capital_gain < 2e-16 ***
## capital_loss < 2e-16 ***
## hours_per_week < 2e-16 ***
## natice_countryCambodia 0.154165
## natice_countryCanada 0.170891
## natice_countryChina 0.323111
## natice_countryColumbia 0.965642
## natice_countryCuba 0.103643
## natice_countryDominican-Republic 0.965422
## natice_countryEcuador 0.881497
## natice_countryEl-Salvador 0.524847
## natice_countryEngland 0.057706 .
## natice_countryFrance 0.219003
## natice_countryGermany 0.360173
## natice_countryGreece 0.298166
## natice_countryGuatemala 0.240151
## natice_countryHaiti 0.369685
## natice_countryHoland-Netherlands 0.995885
## natice_countryHonduras 0.988306
## natice_countryHong 0.692336
## natice_countryHungary 0.598716
## natice_countryIndia 0.607215
## natice_countryIran 0.563094
## natice_countryIreland 0.996421
## natice_countryItaly 0.089215 .
## natice_countryJamaica 0.422073
## natice_countryJapan 0.058938 .
## natice_countryLaos 0.740581
## natice_countryMexico 0.235330
## natice_countryNicaragua 0.744821
## natice_countryOutlying-US(Guam-USVI-etc) 0.985324
## natice_countryPeru 0.865554
## natice_countryPhilippines 0.016878 *
## natice_countryPoland 0.425700
## natice_countryPortugal 0.534235
## natice_countryPuerto-Rico 0.320034
## natice_countryScotland 0.471667
## natice_countrySouth 0.818812
## natice_countryTaiwan 0.549243
## natice_countryThailand 0.781861
## natice_countryTrinadad&Tobago 0.353853
## natice_countryUnited-States 0.016262 *
## natice_countryVietnam 0.100101
## natice_countryYugoslavia 0.534939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21431 on 19535 degrees of freedom
## Residual deviance: 12304 on 19437 degrees of freedom
## AIC: 12502
##
## Number of Fisher Scoring iterations: 15
alias(ad_glm_full)
## Model :
## wage ~ age + workclass + fnlwgt + education + education_num +
## marial_status + occupation + relationship + race + sex +
## capital_gain + capital_loss + hours_per_week + natice_country
##
## Complete :
## (Intercept) age workclassFederal-gov
## education_num 6 0 0
## occupationTransport-moving 0 0 1
## workclassLocal-gov workclassNever-worked
## education_num 0 0
## occupationTransport-moving 1 0
## workclassPrivate workclassSelf-emp-inc
## education_num 0 0
## occupationTransport-moving 1 1
## workclassSelf-emp-not-inc workclassState-gov
## education_num 0 0
## occupationTransport-moving 1 1
## workclassWithout-pay fnlwgt education11th
## education_num 0 0 1
## occupationTransport-moving 1 0 0
## education12th education1st-4th education5th-6th
## education_num 2 -4 -3
## occupationTransport-moving 0 0 0
## education7th-8th education9th
## education_num -2 -1
## occupationTransport-moving 0 0
## educationAssoc-acdm educationAssoc-voc
## education_num 6 5
## occupationTransport-moving 0 0
## educationBachelors educationDoctorate
## education_num 7 10
## occupationTransport-moving 0 0
## educationHS-grad educationMasters
## education_num 3 8
## occupationTransport-moving 0 0
## educationPreschool educationProf-school
## education_num -5 9
## occupationTransport-moving 0 0
## educationSome-college
## education_num 4
## occupationTransport-moving 0
## marial_statusMarried-AF-spouse
## education_num 0
## occupationTransport-moving 0
## marial_statusMarried-civ-spouse
## education_num 0
## occupationTransport-moving 0
## marial_statusMarried-spouse-absent
## education_num 0
## occupationTransport-moving 0
## marial_statusNever-married
## education_num 0
## occupationTransport-moving 0
## marial_statusSeparated marial_statusWidowed
## education_num 0 0
## occupationTransport-moving 0 0
## occupationAdm-clerical occupationArmed-Forces
## education_num 0 0
## occupationTransport-moving -1 -1
## occupationCraft-repair
## education_num 0
## occupationTransport-moving -1
## occupationExec-managerial
## education_num 0
## occupationTransport-moving -1
## occupationFarming-fishing
## education_num 0
## occupationTransport-moving -1
## occupationHandlers-cleaners
## education_num 0
## occupationTransport-moving -1
## occupationMachine-op-inspct
## education_num 0
## occupationTransport-moving -1
## occupationOther-service
## education_num 0
## occupationTransport-moving -1
## occupationPriv-house-serv
## education_num 0
## occupationTransport-moving -1
## occupationProf-specialty
## education_num 0
## occupationTransport-moving -1
## occupationProtective-serv occupationSales
## education_num 0 0
## occupationTransport-moving -1 -1
## occupationTech-support
## education_num 0
## occupationTransport-moving -1
## relationshipNot-in-family
## education_num 0
## occupationTransport-moving 0
## relationshipOther-relative
## education_num 0
## occupationTransport-moving 0
## relationshipOwn-child relationshipUnmarried
## education_num 0 0
## occupationTransport-moving 0 0
## relationshipWife raceAsian-Pac-Islander
## education_num 0 0
## occupationTransport-moving 0 0
## raceBlack raceOther raceWhite sexMale
## education_num 0 0 0 0
## occupationTransport-moving 0 0 0 0
## capital_gain capital_loss hours_per_week
## education_num 0 0 0
## occupationTransport-moving 0 0 0
## natice_countryCambodia natice_countryCanada
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryChina natice_countryColumbia
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryCuba
## education_num 0
## occupationTransport-moving 0
## natice_countryDominican-Republic
## education_num 0
## occupationTransport-moving 0
## natice_countryEcuador natice_countryEl-Salvador
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryEngland natice_countryFrance
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryGermany natice_countryGreece
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryGuatemala natice_countryHaiti
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryHoland-Netherlands
## education_num 0
## occupationTransport-moving 0
## natice_countryHonduras natice_countryHong
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryHungary natice_countryIndia
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryIran natice_countryIreland
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryItaly natice_countryJamaica
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryJapan natice_countryLaos
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryMexico natice_countryNicaragua
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryOutlying-US(Guam-USVI-etc)
## education_num 0
## occupationTransport-moving 0
## natice_countryPeru natice_countryPhilippines
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryPoland natice_countryPortugal
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryPuerto-Rico
## education_num 0
## occupationTransport-moving 0
## natice_countryScotland natice_countrySouth
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryTaiwan natice_countryThailand
## education_num 0 0
## occupationTransport-moving 0 0
## natice_countryTrinadad&Tobago
## education_num 0
## occupationTransport-moving 0
## natice_countryUnited-States
## education_num 0
## occupationTransport-moving 0
## natice_countryVietnam natice_countryYugoslavia
## education_num 0 0
## occupationTransport-moving 0 0
summary.glm(ad_glm_full)
##
## Call:
## glm(formula = wage ~ ., family = binomial(), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2870 -0.5041 -0.1828 -0.0207 3.4502
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -8.734e+00 5.640e-01 -15.486
## age 2.575e-02 2.120e-03 12.143
## workclassFederal-gov 9.437e-01 1.954e-01 4.830
## workclassLocal-gov 2.282e-01 1.785e-01 1.278
## workclassNever-worked -1.015e+01 1.141e+03 -0.009
## workclassPrivate 5.057e-01 1.582e-01 3.197
## workclassSelf-emp-inc 6.243e-01 1.899e-01 3.287
## workclassSelf-emp-not-inc -2.928e-02 1.743e-01 -0.168
## workclassState-gov 6.758e-02 1.960e-01 0.345
## workclassWithout-pay -1.429e+01 7.135e+02 -0.020
## fnlwgt 6.439e-07 2.221e-07 2.899
## education11th 2.584e-01 2.619e-01 0.987
## education12th 4.079e-01 3.414e-01 1.195
## education1st-4th -1.778e-01 5.627e-01 -0.316
## education5th-6th 6.362e-02 4.099e-01 0.155
## education7th-8th -4.160e-01 3.048e-01 -1.365
## education9th -1.140e-02 3.352e-01 -0.034
## educationAssoc-acdm 1.349e+00 2.235e-01 6.036
## educationAssoc-voc 1.362e+00 2.142e-01 6.358
## educationBachelors 1.934e+00 1.992e-01 9.711
## educationDoctorate 2.984e+00 2.776e-01 10.748
## educationHS-grad 8.666e-01 1.940e-01 4.467
## educationMasters 2.292e+00 2.131e-01 10.753
## educationPreschool -1.266e+01 3.786e+02 -0.033
## educationProf-school 2.858e+00 2.576e-01 11.093
## educationSome-college 1.201e+00 1.971e-01 6.096
## education_num NA NA NA
## marial_statusMarried-AF-spouse 2.049e+00 7.307e-01 2.804
## marial_statusMarried-civ-spouse 1.905e+00 3.458e-01 5.508
## marial_statusMarried-spouse-absent 1.724e-01 2.731e-01 0.632
## marial_statusNever-married -4.719e-01 1.134e-01 -4.160
## marial_statusSeparated -3.848e-01 2.433e-01 -1.582
## marial_statusWidowed 2.715e-01 1.882e-01 1.443
## occupationAdm-clerical 2.391e-01 1.276e-01 1.874
## occupationArmed-Forces -1.389e+01 8.417e+02 -0.017
## occupationCraft-repair 2.444e-01 1.093e-01 2.236
## occupationExec-managerial 8.752e-01 1.125e-01 7.778
## occupationFarming-fishing -8.930e-01 1.840e-01 -4.854
## occupationHandlers-cleaners -4.670e-01 1.843e-01 -2.534
## occupationMachine-op-inspct -1.403e-01 1.368e-01 -1.025
## occupationOther-service -9.013e-01 1.677e-01 -5.376
## occupationPriv-house-serv -1.388e+01 2.226e+02 -0.062
## occupationProf-specialty 7.090e-01 1.212e-01 5.851
## occupationProtective-serv 6.134e-01 1.681e-01 3.648
## occupationSales 4.767e-01 1.161e-01 4.107
## occupationTech-support 6.970e-01 1.566e-01 4.452
## occupationTransport-moving NA NA NA
## relationshipNot-in-family 2.645e-01 3.417e-01 0.774
## relationshipOther-relative -6.659e-01 3.157e-01 -2.109
## relationshipOwn-child -9.208e-01 3.479e-01 -2.647
## relationshipUnmarried 5.198e-02 3.627e-01 0.143
## relationshipWife 1.416e+00 1.330e-01 10.640
## raceAsian-Pac-Islander 6.903e-01 3.392e-01 2.035
## raceBlack 1.943e-01 2.969e-01 0.654
## raceOther -9.615e-02 4.648e-01 -0.207
## raceWhite 4.758e-01 2.813e-01 1.692
## sexMale 8.071e-01 1.024e-01 7.885
## capital_gain 3.175e-04 1.325e-05 23.964
## capital_loss 6.921e-04 4.923e-05 14.059
## hours_per_week 3.173e-02 2.110e-03 15.040
## natice_countryCambodia 1.120e+00 7.861e-01 1.425
## natice_countryCanada 5.295e-01 3.867e-01 1.369
## natice_countryChina -5.297e-01 5.361e-01 -0.988
## natice_countryColumbia -1.409e+01 3.272e+02 -0.043
## natice_countryCuba 6.667e-01 4.097e-01 1.627
## natice_countryDominican-Republic -1.342e+01 3.095e+02 -0.043
## natice_countryEcuador 1.627e-01 1.091e+00 0.149
## natice_countryEl-Salvador -4.377e-01 6.882e-01 -0.636
## natice_countryEngland 8.166e-01 4.302e-01 1.898
## natice_countryFrance 9.328e-01 7.589e-01 1.229
## natice_countryGermany 3.355e-01 3.666e-01 0.915
## natice_countryGreece -7.823e-01 7.519e-01 -1.040
## natice_countryGuatemala 9.323e-01 7.937e-01 1.175
## natice_countryHaiti 9.005e-01 1.004e+00 0.897
## natice_countryHoland-Netherlands -1.237e+01 2.400e+03 -0.005
## natice_countryHonduras -1.233e+01 8.411e+02 -0.015
## natice_countryHong 3.550e-01 8.971e-01 0.396
## natice_countryHungary 4.433e-01 8.424e-01 0.526
## natice_countryIndia -2.182e-01 4.244e-01 -0.514
## natice_countryIran 3.411e-01 5.899e-01 0.578
## natice_countryIreland 5.652e-03 1.260e+00 0.004
## natice_countryItaly 7.555e-01 4.445e-01 1.700
## natice_countryJamaica -6.460e-01 8.046e-01 -0.803
## natice_countryJapan 1.006e+00 5.325e-01 1.889
## natice_countryLaos -3.027e-01 9.142e-01 -0.331
## natice_countryMexico -3.843e-01 3.239e-01 -1.187
## natice_countryNicaragua -3.550e-01 1.091e+00 -0.325
## natice_countryOutlying-US(Guam-USVI-etc) -1.344e+01 7.305e+02 -0.018
## natice_countryPeru -1.940e-01 1.146e+00 -0.169
## natice_countryPhilippines 8.714e-01 3.647e-01 2.389
## natice_countryPoland 4.134e-01 5.189e-01 0.797
## natice_countryPortugal -5.458e-01 8.782e-01 -0.622
## natice_countryPuerto-Rico 4.654e-01 4.680e-01 0.994
## natice_countryScotland 8.010e-01 1.113e+00 0.720
## natice_countrySouth 1.200e-01 5.241e-01 0.229
## natice_countryTaiwan 3.794e-01 6.336e-01 0.599
## natice_countryThailand -3.116e-01 1.125e+00 -0.277
## natice_countryTrinadad&Tobago 1.004e+00 1.083e+00 0.927
## natice_countryUnited-States 4.244e-01 1.766e-01 2.403
## natice_countryVietnam -1.389e+00 8.446e-01 -1.644
## natice_countryYugoslavia -7.384e-01 1.190e+00 -0.620
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## workclassFederal-gov 1.37e-06 ***
## workclassLocal-gov 0.201247
## workclassNever-worked 0.992904
## workclassPrivate 0.001388 **
## workclassSelf-emp-inc 0.001013 **
## workclassSelf-emp-not-inc 0.866614
## workclassState-gov 0.730183
## workclassWithout-pay 0.984026
## fnlwgt 0.003742 **
## education11th 0.323823
## education12th 0.232279
## education1st-4th 0.752045
## education5th-6th 0.876661
## education7th-8th 0.172277
## education9th 0.972876
## educationAssoc-acdm 1.58e-09 ***
## educationAssoc-voc 2.04e-10 ***
## educationBachelors < 2e-16 ***
## educationDoctorate < 2e-16 ***
## educationHS-grad 7.94e-06 ***
## educationMasters < 2e-16 ***
## educationPreschool 0.973319
## educationProf-school < 2e-16 ***
## educationSome-college 1.09e-09 ***
## education_num NA
## marial_statusMarried-AF-spouse 0.005046 **
## marial_statusMarried-civ-spouse 3.64e-08 ***
## marial_statusMarried-spouse-absent 0.527708
## marial_statusNever-married 3.18e-05 ***
## marial_statusSeparated 0.113731
## marial_statusWidowed 0.149064
## occupationAdm-clerical 0.060936 .
## occupationArmed-Forces 0.986830
## occupationCraft-repair 0.025374 *
## occupationExec-managerial 7.38e-15 ***
## occupationFarming-fishing 1.21e-06 ***
## occupationHandlers-cleaners 0.011282 *
## occupationMachine-op-inspct 0.305346
## occupationOther-service 7.64e-08 ***
## occupationPriv-house-serv 0.950290
## occupationProf-specialty 4.88e-09 ***
## occupationProtective-serv 0.000264 ***
## occupationSales 4.02e-05 ***
## occupationTech-support 8.52e-06 ***
## occupationTransport-moving NA
## relationshipNot-in-family 0.438956
## relationshipOther-relative 0.034952 *
## relationshipOwn-child 0.008133 **
## relationshipUnmarried 0.886041
## relationshipWife < 2e-16 ***
## raceAsian-Pac-Islander 0.041840 *
## raceBlack 0.512897
## raceOther 0.836119
## raceWhite 0.090714 .
## sexMale 3.15e-15 ***
## capital_gain < 2e-16 ***
## capital_loss < 2e-16 ***
## hours_per_week < 2e-16 ***
## natice_countryCambodia 0.154165
## natice_countryCanada 0.170891
## natice_countryChina 0.323111
## natice_countryColumbia 0.965642
## natice_countryCuba 0.103643
## natice_countryDominican-Republic 0.965422
## natice_countryEcuador 0.881497
## natice_countryEl-Salvador 0.524847
## natice_countryEngland 0.057706 .
## natice_countryFrance 0.219003
## natice_countryGermany 0.360173
## natice_countryGreece 0.298166
## natice_countryGuatemala 0.240151
## natice_countryHaiti 0.369685
## natice_countryHoland-Netherlands 0.995885
## natice_countryHonduras 0.988306
## natice_countryHong 0.692336
## natice_countryHungary 0.598716
## natice_countryIndia 0.607215
## natice_countryIran 0.563094
## natice_countryIreland 0.996421
## natice_countryItaly 0.089215 .
## natice_countryJamaica 0.422073
## natice_countryJapan 0.058938 .
## natice_countryLaos 0.740581
## natice_countryMexico 0.235330
## natice_countryNicaragua 0.744821
## natice_countryOutlying-US(Guam-USVI-etc) 0.985324
## natice_countryPeru 0.865554
## natice_countryPhilippines 0.016878 *
## natice_countryPoland 0.425700
## natice_countryPortugal 0.534235
## natice_countryPuerto-Rico 0.320034
## natice_countryScotland 0.471667
## natice_countrySouth 0.818812
## natice_countryTaiwan 0.549243
## natice_countryThailand 0.781861
## natice_countryTrinadad&Tobago 0.353853
## natice_countryUnited-States 0.016262 *
## natice_countryVietnam 0.100101
## natice_countryYugoslavia 0.534939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21431 on 19535 degrees of freedom
## Residual deviance: 12304 on 19437 degrees of freedom
## AIC: 12502
##
## Number of Fisher Scoring iterations: 15
predict(ad_glm_full, newdata = adult[1:5, ], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## 1 2 3 4 5
## 0.12137717 0.35708376 0.03017053 0.08937526 0.69091339
검증세트에서의 에러확률을 살펴보자. 반응변수와 예측변수를 추츨한다.
y_obs <- ifelse(validation$wage == ">50K", 1, 0)
yhat_lm <- predict(ad_glm_full, newdata = validation, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
간단한 시각화로, 예측값과 실제 간의 관계를 살펴볼 수 있다. 일단 다른 관측값 사이에 확률 에측값의 분포가 확실히 다른 것을 볼 수 있다.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
df <- data.frame(y_obs, yhat_lm)
p1 <- df %>%
ggplot(aes(y_obs, yhat_lm, group = y_obs, fill = factor(y_obs))) +
geom_boxplot()
p2 <- df %>%
ggplot(aes(yhat_lm, fill = factor(y_obs))) + geom_density(alpha=.5)
grid.arrange(p1, p2, ncol = 2)
예측의 정확도 지표인 이항편차는 다음과 같다
#devtools::install_github("DexGroves/hacktoolkit")
library(hacktoolkit)
## Warning: replacing previous import 'data.table::first' by 'dplyr::first'
## when loading 'hacktoolkit'
## Warning: replacing previous import 'data.table::between' by
## 'dplyr::between' when loading 'hacktoolkit'
## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when
## loading 'hacktoolkit'
##
## Attaching package: 'hacktoolkit'
## The following object is masked from 'package:glmnet':
##
## auc
binomial_deviance(y_obs, yhat_lm)
## [1] 4272.905
ROC Curve 얻기
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
pred_lm <- prediction(yhat_lm, y_obs)
perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
plot(perf_lm, col = "black", main = "ROC Curve for GLM")
abline(0, 1)
performance(pred_lm, "auc")@y.values[[1]]
## [1] 0.9054624