## model weight sugar acid color
## 로얄후지:5 Min. :152.0 Min. :12.10 Min. :0.2800 적색:10
## 미시마 :5 1st Qu.:251.0 1st Qu.:13.30 1st Qu.:0.3100 홍색:15
## 아오리 :5 Median :329.0 Median :13.60 Median :0.3800
## 홍로 :5 Mean :317.3 Mean :14.01 Mean :0.4192
## 홍옥 :5 3rd Qu.:391.0 3rd Qu.:14.40 3rd Qu.:0.4100
## Max. :409.0 Max. :16.80 Max. :0.7300
## 'data.frame': 25 obs. of 5 variables:
## $ model : Factor w/ 5 levels "로얄후지","미시마",..: 3 5 3 2 3 4 1 3 4 1 ...
## $ weight: int 286 256 251 396 282 342 407 238 295 392 ...
## $ sugar : num 12.9 13.4 12.1 16.3 15 13.4 13.6 13.5 15.1 15.6 ...
## $ acid : num 0.31 0.69 0.32 0.39 0.29 0.31 0.4 0.31 0.29 0.38 ...
## $ color : Factor w/ 2 levels "적색","홍색": 2 1 2 2 2 2 1 2 2 1 ...




Decision Tree
## n= 20
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 20 16 로얄후지 (0.2 0.2 0.2 0.2 0.2)
## 2) acid< 0.525 16 12 로얄후지 (0.25 0.25 0.25 0.25 0)
## 4) weight>=366 8 4 로얄후지 (0.5 0.5 0 0 0)
## 8) color=적색 4 0 로얄후지 (1 0 0 0 0) *
## 9) color=홍색 4 0 미시마 (0 1 0 0 0) *
## 5) weight< 366 8 4 아오리 (0 0 0.5 0.5 0)
## 10) weight< 307.5 5 1 아오리 (0 0 0.8 0.2 0)
## 20) weight>=195 4 0 아오리 (0 0 1 0 0) *
## 21) weight< 195 1 0 홍로 (0 0 0 1 0) *
## 11) weight>=307.5 3 0 홍로 (0 0 0 1 0) *
## 3) acid>=0.525 4 0 홍옥 (0 0 0 0 1) *

## Confusion Matrix and Statistics
##
## Reference
## Prediction 로얄후지 미시마 아오리 홍로 홍옥
## 로얄후지 1 0 0 0 0
## 미시마 0 1 0 0 0
## 아오리 0 0 1 0 0
## 홍로 0 0 1 0 0
## 홍옥 0 0 0 0 1
##
## Overall Statistics
##
## Accuracy : 0.8
## 95% CI : (0.2836, 0.9949)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 0.08704
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 로얄후지 Class: 미시마 Class: 아오리 Class: 홍로
## Sensitivity 1.0 1.0 0.5000 NA
## Specificity 1.0 1.0 1.0000 0.8
## Pos Pred Value 1.0 1.0 1.0000 NA
## Neg Pred Value 1.0 1.0 0.7500 NA
## Precision 1.0 1.0 1.0000 0.0
## Recall 1.0 1.0 0.5000 NA
## F1 1.0 1.0 0.6667 NA
## Prevalence 0.2 0.2 0.4000 0.0
## Detection Rate 0.2 0.2 0.2000 0.0
## Detection Prevalence 0.2 0.2 0.2000 0.2
## Balanced Accuracy 1.0 1.0 0.7500 NA
## Class: 홍옥
## Sensitivity 1.0
## Specificity 1.0
## Pos Pred Value 1.0
## Neg Pred Value 1.0
## Precision 1.0
## Recall 1.0
## F1 1.0
## Prevalence 0.2
## Detection Rate 0.2
## Detection Prevalence 0.2
## Balanced Accuracy 1.0
American Cummunity Survey - xgboost
df <- read.table(
"http://jaredlander.com/data/acs_ny.csv",
sep=",", header = TRUE, stringsAsFactor = FALSE
)
acs <-tibble::as_tibble(
read.table(
"http://jaredlander.com/data/acs_ny.csv",
sep=",", header = TRUE, stringsAsFactor = FALSE
)
)
#income 변수 만들기
acs %>%
mutate(income= factor(FamilyIncome >= 150000,
levels = c(FALSE, TRUE),
labels = c("Below","Above"))) -> acs
#xgboos - y: metrics, x= vectros
id <- createDataPartition(acs$income, p=0.7, list= FALSE)
train <- acs[id,]
test <- acs[-id,]
#train_contral 설정
#2번 반복 5-폴드 교차 타당성 검증
ctrl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 2,
summaryFunction = twoClassSummary,
classProbs = TRUE,
allowParallel = TRUE)
#Xgbbost contral parameters
#nrounds - 반복 횟수 결정
#max-depth = 나무의 최대 복잡고 지정
#eta - 학습속도 (shrinkage 의 양 조절)
#colsample_bytree, subsample - 열과 행에 대한 샘플링 비율
expand.grid(nrounds = 100,
max_depth = c(2,6,10),
eta = c(0.01, 0.1),
gamma = c(0),
colsample_bytree=1,
min_child_weight =1,
subsample= 0.7) -> boostGrid
acsFormula <- income ~ NumChildren + NumRooms + NumVehicles + NumWorkers +
OwnRent + ElectricBill + HeatingFuel
set.seed(73615)
train(acsFormula,
data= train,
method = "xgbTree",
metric = "ROC",
trControl = ctrl,
tuneGrid= boostGrid,
nthread =4) ->boostTuned
plot(boostTuned)

## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.01 2 0 1 1 0.7 100
## 2 0.10 10 0 1 1 0.7 100
## 3 0.01 10 0 1 1 0.7 100
## 4 0.10 6 0 1 1 0.7 100
## 5 0.01 6 0 1 1 0.7 100
## 6 0.10 2 0 1 1 0.7 100
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.7287933 1.0000000 0.0000000 0.007247575 0.000000000 0.00000000
## 2 0.7294412 0.9453480 0.1828878 0.009539525 0.003537195 0.02201601
## 3 0.7444798 0.9688433 0.1454952 0.006994547 0.003624643 0.01645273
## 4 0.7490780 0.9659751 0.1536297 0.008179881 0.001628097 0.01393626
## 5 0.7491769 0.9808659 0.1038789 0.006200881 0.003032860 0.01585249
## 6 0.7570401 0.9763870 0.1232771 0.006257553 0.003742215 0.01487743
## Confusion Matrix and Statistics
##
## Reference
## Prediction Below Above
## Below 5323 1198
## Above 131 171
##
## Accuracy : 0.8052
## 95% CI : (0.7956, 0.8146)
## No Information Rate : 0.7994
## P-Value [Acc > NIR] : 0.116
##
## Kappa : 0.1425
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9760
## Specificity : 0.1249
## Pos Pred Value : 0.8163
## Neg Pred Value : 0.5662
## Prevalence : 0.7994
## Detection Rate : 0.7802
## Detection Prevalence : 0.9557
## Balanced Accuracy : 0.5504
##
## 'Positive' Class : Below
##
Logistic Regression
df <- read.table(
"http://jaredlander.com/data/acs_ny.csv",
sep=",", header = TRUE, stringsAsFactor = FALSE
)
# Data_Setting <- 로지스틱 회귀 분석을 위해서 (1, 0) 파생 변수 만들기
df %>%
mutate(income= factor(FamilyIncome >= 150000,
levels = c(FALSE, TRUE),
labels = c("Below","Above"))) -> df
# chracter - factor 변환
df %>%
mutate_if(is.character, as.factor) -> df
str(df)
## 'data.frame': 22745 obs. of 19 variables:
## $ Acres : Factor w/ 3 levels "1-10","10+","Sub 1": 1 1 1 1 1 1 1 1 1 1 ...
## $ FamilyIncome: int 150 180 280 330 330 480 520 550 1400 1400 ...
## $ FamilyType : Factor w/ 3 levels "Female Head",..: 3 1 1 1 2 2 2 2 1 1 ...
## $ NumBedrooms : int 4 3 4 2 3 0 3 3 3 3 ...
## $ NumChildren : int 1 2 0 1 1 3 2 1 1 2 ...
## $ NumPeople : int 3 4 2 2 2 4 3 2 4 3 ...
## $ NumRooms : int 9 6 8 4 5 1 8 5 9 7 ...
## $ NumUnits : Factor w/ 3 levels "Mobile home",..: 3 3 3 3 2 3 1 3 3 3 ...
## $ NumVehicles : int 1 2 3 1 1 0 0 1 2 1 ...
## $ NumWorkers : int 0 0 1 0 0 0 0 1 1 1 ...
## $ OwnRent : Factor w/ 3 levels "Mortgage","Outright",..: 1 3 1 3 1 3 1 3 1 1 ...
## $ YearBuilt : Factor w/ 15 levels "15","1940-1949",..: 3 15 8 3 15 15 6 11 15 6 ...
## $ HouseCosts : int 1800 850 2600 1800 860 700 270 800 710 1500 ...
## $ ElectricBill: int 90 90 260 140 150 140 130 1 150 300 ...
## $ FoodStamp : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
## $ HeatingFuel : Factor w/ 8 levels "Coal","Electricity",..: 3 5 5 5 3 3 8 3 3 2 ...
## $ Insurance : int 2500 0 6600 0 660 0 100 0 1000 0 ...
## $ Language : Factor w/ 5 levels "Asian Pacific",..: 2 2 4 2 5 2 2 2 2 2 ...
## $ income : Factor w/ 2 levels "Below","Above": 1 1 1 1 1 1 1 1 1 1 ...
# 분할
index <- createDataPartition(df$income, p=0.7, list= FALSE)
train <- df[index,]
test <- df[-index,]
set.seed(10)
#Controlling setㄱ
trainControl(method = "repeatedcv",
number = 5,
repeats =5,
classProbs = TRUE,
allowParallel = TRUE,
summaryFunction = twoClassSummary) -> ctrl_one
formul <- income ~ NumBedrooms + NumChildren + NumRooms + NumVehicles +
NumWorkers + HouseCosts + ElectricBill
train(formul,
data= train,
method ="glm",
metric= "ROC",
trControl= ctrl_one) -> my_glm
summary(my_glm)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7958 -0.6109 -0.4220 -0.2479 2.7506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.413e+00 1.102e-01 -49.110 < 2e-16 ***
## NumBedrooms 1.614e-02 2.493e-02 0.647 0.517436
## NumChildren -7.668e-02 2.018e-02 -3.800 0.000145 ***
## NumRooms 1.400e-01 1.110e-02 12.619 < 2e-16 ***
## NumVehicles 2.229e-01 2.473e-02 9.015 < 2e-16 ***
## NumWorkers 5.205e-01 3.214e-02 16.194 < 2e-16 ***
## HouseCosts 7.326e-04 2.141e-05 34.212 < 2e-16 ***
## ElectricBill 1.418e-03 2.100e-04 6.752 1.46e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15967 on 15921 degrees of freedom
## Residual deviance: 12792 on 15914 degrees of freedom
## AIC: 12808
##
## Number of Fisher Scoring iterations: 5
#test 실행
predict(my_glm, test, type ="prob") %>% pull(Below)->pd
roc(test$income, pd) -> my_auc
sen_spec_df <- data_frame(TPR = my_auc$sensitivities,
FPR = 1 - my_auc$specificities,
total = my_auc$sensitivities + my_auc$specificities,
cutoff = my_auc$thresholds)
sen_spec_df %>%
ggplot(aes(x = FPR, ymin = 0, ymax = TPR))+
geom_polygon(aes(y = TPR), fill = "red", alpha = 0.3) +
geom_path(aes(y = TPR), col = "firebrick", size = 1.2) +
geom_abline(intercept = 0, slope = 1, color = "gray37", size = 1, linetype = "dashed") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
theme_bw() +
coord_equal() +
labs(x = "FPR (1 - Specificity)",
y = "TPR (Sensitivity)",
title = "Model Performance for Logistic Model based on Test Data",
subtitle = paste0("AUC Value: ", my_auc$auc %>% round(4)))

## Confusion Matrix and Statistics
##
## Reference
## Prediction Below Above
## Below 5242 1008
## Above 212 361
##
## Accuracy : 0.8212
## 95% CI : (0.8119, 0.8302)
## No Information Rate : 0.7994
## P-Value [Acc > NIR] : 2.681e-06
##
## Kappa : 0.2874
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9611
## Specificity : 0.2637
## Pos Pred Value : 0.8387
## Neg Pred Value : 0.6300
## Prevalence : 0.7994
## Detection Rate : 0.7683
## Detection Prevalence : 0.9160
## Balanced Accuracy : 0.6124
##
## 'Positive' Class : Below
##