##       model       weight          sugar            acid         color   
##  로얄후지:5   Min.   :152.0   Min.   :12.10   Min.   :0.2800   적색:10  
##  미시마  :5   1st Qu.:251.0   1st Qu.:13.30   1st Qu.:0.3100   홍색:15  
##  아오리  :5   Median :329.0   Median :13.60   Median :0.3800            
##  홍로    :5   Mean   :317.3   Mean   :14.01   Mean   :0.4192            
##  홍옥    :5   3rd Qu.:391.0   3rd Qu.:14.40   3rd Qu.:0.4100            
##               Max.   :409.0   Max.   :16.80   Max.   :0.7300

## 'data.frame':    25 obs. of  5 variables:
##  $ model : Factor w/ 5 levels "로얄후지","미시마",..: 3 5 3 2 3 4 1 3 4 1 ...
##  $ weight: int  286 256 251 396 282 342 407 238 295 392 ...
##  $ sugar : num  12.9 13.4 12.1 16.3 15 13.4 13.6 13.5 15.1 15.6 ...
##  $ acid  : num  0.31 0.69 0.32 0.39 0.29 0.31 0.4 0.31 0.29 0.38 ...
##  $ color : Factor w/ 2 levels "적색","홍색": 2 1 2 2 2 2 1 2 2 1 ...

#분포 확인 (무게)

ggplot(apple, aes(model, weight,fill=model))+
  geom_boxplot(alpha= 0.3)

#분포 확인 (당도)

ggplot(apple, aes(sugar, model, fill=model))+
  geom_boxplot(alpha= 0.3)+
  scale_fill_brewer(palette="Dark2")

#분포 확인 (당도)
ggplot(apple, aes(acid, model, fill=model))+
  geom_boxplot(alpha= 0.3)+
  scale_fill_brewer(palette="Dark2")

#색상별 사과 품종 개수 

ggplot(apple, aes(x= color, fill=model))+geom_bar()

Decision Tree

index <- createDataPartition(apple$model, p=0.8, list= FALSE) 


apple[index,] -> train
apple[-index, ] -> test

#2. 실행 

library(rpart) 

rpart(model~., data= train, control = rpart.control(minsplit = 2)) -> model
model

## n= 20 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 20 16 로얄후지 (0.2 0.2 0.2 0.2 0.2)  
##    2) acid< 0.525 16 12 로얄후지 (0.25 0.25 0.25 0.25 0)  
##      4) weight>=366 8  4 로얄후지 (0.5 0.5 0 0 0)  
##        8) color=적색 4  0 로얄후지 (1 0 0 0 0) *
##        9) color=홍색 4  0 미시마 (0 1 0 0 0) *
##      5) weight< 366 8  4 아오리 (0 0 0.5 0.5 0)  
##       10) weight< 307.5 5  1 아오리 (0 0 0.8 0.2 0)  
##         20) weight>=195 4  0 아오리 (0 0 1 0 0) *
##         21) weight< 195 1  0 홍로 (0 0 0 1 0) *
##       11) weight>=307.5 3  0 홍로 (0 0 0 1 0) *
##    3) acid>=0.525 4  0 홍옥 (0 0 0 0 1) *

#3. 시각화 
library(rpart.plot)


rpart.plot(model)

#예측하기 

pred<-predict(model, test, type="class")


#평가 - 실제 test 와 pred 값의 일치 차이 확인 

test$model -> actual

confusionMatrix(actual, pred, mode ="everything")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 로얄후지 미시마 아오리 홍로 홍옥
##   로얄후지        1      0      0    0    0
##   미시마          0      1      0    0    0
##   아오리          0      0      1    0    0
##   홍로            0      0      1    0    0
##   홍옥            0      0      0    0    1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.2836, 0.9949)
##     No Information Rate : 0.4             
##     P-Value [Acc > NIR] : 0.08704         
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 로얄후지 Class: 미시마 Class: 아오리 Class: 홍로
## Sensitivity                      1.0           1.0        0.5000          NA
## Specificity                      1.0           1.0        1.0000         0.8
## Pos Pred Value                   1.0           1.0        1.0000          NA
## Neg Pred Value                   1.0           1.0        0.7500          NA
## Precision                        1.0           1.0        1.0000         0.0
## Recall                           1.0           1.0        0.5000          NA
## F1                               1.0           1.0        0.6667          NA
## Prevalence                       0.2           0.2        0.4000         0.0
## Detection Rate                   0.2           0.2        0.2000         0.0
## Detection Prevalence             0.2           0.2        0.2000         0.2
## Balanced Accuracy                1.0           1.0        0.7500          NA
##                      Class: 홍옥
## Sensitivity                  1.0
## Specificity                  1.0
## Pos Pred Value               1.0
## Neg Pred Value               1.0
## Precision                    1.0
## Recall                       1.0
## F1                           1.0
## Prevalence                   0.2
## Detection Rate               0.2
## Detection Prevalence         0.2
## Balanced Accuracy            1.0

American Cummunity Survey - xgboost

df <- read.table(
  "http://jaredlander.com/data/acs_ny.csv",
  sep=",", header = TRUE, stringsAsFactor = FALSE
)

acs <-tibble::as_tibble(
  read.table(
    "http://jaredlander.com/data/acs_ny.csv",
  sep=",", header = TRUE, stringsAsFactor = FALSE
  )
)


#income 변수 만들기 

acs %>% 
  mutate(income= factor(FamilyIncome >= 150000,
                        levels = c(FALSE, TRUE),
                        labels = c("Below","Above"))) -> acs

#xgboos - y: metrics, x= vectros 

id <- createDataPartition(acs$income, p=0.7, list= FALSE)

train <- acs[id,]
test <- acs[-id,]

#train_contral 설정 
#2번 반복 5-폴드 교차 타당성 검증 

ctrl <- trainControl(method = "repeatedcv",
                     number = 5,
                     repeats = 2,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE, 
                     allowParallel = TRUE)

#Xgbbost contral parameters 
#nrounds - 반복 횟수 결정 
#max-depth = 나무의 최대 복잡고 지정 
#eta - 학습속도 (shrinkage 의 양 조절)
#colsample_bytree, subsample - 열과 행에 대한 샘플링 비율

expand.grid(nrounds = 100,
            max_depth = c(2,6,10),
            eta = c(0.01, 0.1),
            gamma = c(0),
            colsample_bytree=1, 
            min_child_weight =1, 
            subsample= 0.7) -> boostGrid


acsFormula <- income ~ NumChildren + NumRooms + NumVehicles + NumWorkers +
  OwnRent + ElectricBill + HeatingFuel


set.seed(73615)

train(acsFormula,
      data= train,
      method = "xgbTree",
      metric = "ROC", 
      trControl = ctrl, 
      tuneGrid= boostGrid,
      nthread =4) ->boostTuned

plot(boostTuned)

boostTuned$results %>%  arrange(ROC)

##    eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.01         2     0                1                1       0.7     100
## 2 0.10        10     0                1                1       0.7     100
## 3 0.01        10     0                1                1       0.7     100
## 4 0.10         6     0                1                1       0.7     100
## 5 0.01         6     0                1                1       0.7     100
## 6 0.10         2     0                1                1       0.7     100
##         ROC      Sens      Spec       ROCSD      SensSD     SpecSD
## 1 0.7287933 1.0000000 0.0000000 0.007247575 0.000000000 0.00000000
## 2 0.7294412 0.9453480 0.1828878 0.009539525 0.003537195 0.02201601
## 3 0.7444798 0.9688433 0.1454952 0.006994547 0.003624643 0.01645273
## 4 0.7490780 0.9659751 0.1536297 0.008179881 0.001628097 0.01393626
## 5 0.7491769 0.9808659 0.1038789 0.006200881 0.003032860 0.01585249
## 6 0.7570401 0.9763870 0.1232771 0.006257553 0.003742215 0.01487743

predict(boostTuned, test, type= "raw") %>% 
  as.factor()-> pred

as.factor(test$income) -> actual_one



confusionMatrix(pred, actual_one)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Below Above
##      Below  5323  1198
##      Above   131   171
##                                           
##                Accuracy : 0.8052          
##                  95% CI : (0.7956, 0.8146)
##     No Information Rate : 0.7994          
##     P-Value [Acc > NIR] : 0.116           
##                                           
##                   Kappa : 0.1425          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9760          
##             Specificity : 0.1249          
##          Pos Pred Value : 0.8163          
##          Neg Pred Value : 0.5662          
##              Prevalence : 0.7994          
##          Detection Rate : 0.7802          
##    Detection Prevalence : 0.9557          
##       Balanced Accuracy : 0.5504          
##                                           
##        'Positive' Class : Below           
##

library(DiagrammeR)
library(xgboost)

xgb.plot.multi.trees(boostTuned$finalModel,
                     feature_names = boostTuned$coefnames)

Logistic Regression

df <- read.table(
  "http://jaredlander.com/data/acs_ny.csv",
  sep=",", header = TRUE, stringsAsFactor = FALSE
)

# Data_Setting <- 로지스틱 회귀 분석을 위해서 (1, 0) 파생 변수 만들기

df %>% 
  mutate(income= factor(FamilyIncome >= 150000,
                        levels = c(FALSE, TRUE),
                        labels = c("Below","Above"))) -> df
     
# chracter - factor 변환 

df %>% 
  mutate_if(is.character, as.factor) -> df

str(df)

## 'data.frame':    22745 obs. of  19 variables:
##  $ Acres       : Factor w/ 3 levels "1-10","10+","Sub 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ FamilyIncome: int  150 180 280 330 330 480 520 550 1400 1400 ...
##  $ FamilyType  : Factor w/ 3 levels "Female Head",..: 3 1 1 1 2 2 2 2 1 1 ...
##  $ NumBedrooms : int  4 3 4 2 3 0 3 3 3 3 ...
##  $ NumChildren : int  1 2 0 1 1 3 2 1 1 2 ...
##  $ NumPeople   : int  3 4 2 2 2 4 3 2 4 3 ...
##  $ NumRooms    : int  9 6 8 4 5 1 8 5 9 7 ...
##  $ NumUnits    : Factor w/ 3 levels "Mobile home",..: 3 3 3 3 2 3 1 3 3 3 ...
##  $ NumVehicles : int  1 2 3 1 1 0 0 1 2 1 ...
##  $ NumWorkers  : int  0 0 1 0 0 0 0 1 1 1 ...
##  $ OwnRent     : Factor w/ 3 levels "Mortgage","Outright",..: 1 3 1 3 1 3 1 3 1 1 ...
##  $ YearBuilt   : Factor w/ 15 levels "15","1940-1949",..: 3 15 8 3 15 15 6 11 15 6 ...
##  $ HouseCosts  : int  1800 850 2600 1800 860 700 270 800 710 1500 ...
##  $ ElectricBill: int  90 90 260 140 150 140 130 1 150 300 ...
##  $ FoodStamp   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
##  $ HeatingFuel : Factor w/ 8 levels "Coal","Electricity",..: 3 5 5 5 3 3 8 3 3 2 ...
##  $ Insurance   : int  2500 0 6600 0 660 0 100 0 1000 0 ...
##  $ Language    : Factor w/ 5 levels "Asian Pacific",..: 2 2 4 2 5 2 2 2 2 2 ...
##  $ income      : Factor w/ 2 levels "Below","Above": 1 1 1 1 1 1 1 1 1 1 ...

# 분할 

index <- createDataPartition(df$income, p=0.7, list= FALSE)

train <- df[index,]
test <- df[-index,]



set.seed(10)
#Controlling setㄱ
trainControl(method = "repeatedcv",
             number = 5, 
             repeats =5, 
             classProbs = TRUE, 
             allowParallel = TRUE, 
             summaryFunction = twoClassSummary) -> ctrl_one



formul <- income ~ NumBedrooms + NumChildren + NumRooms + NumVehicles +
  NumWorkers + HouseCosts + ElectricBill

train(formul, 
      data= train,
      method ="glm",
      metric= "ROC",
      trControl= ctrl_one) -> my_glm

summary(my_glm)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7958  -0.6109  -0.4220  -0.2479   2.7506  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.413e+00  1.102e-01 -49.110  < 2e-16 ***
## NumBedrooms   1.614e-02  2.493e-02   0.647 0.517436    
## NumChildren  -7.668e-02  2.018e-02  -3.800 0.000145 ***
## NumRooms      1.400e-01  1.110e-02  12.619  < 2e-16 ***
## NumVehicles   2.229e-01  2.473e-02   9.015  < 2e-16 ***
## NumWorkers    5.205e-01  3.214e-02  16.194  < 2e-16 ***
## HouseCosts    7.326e-04  2.141e-05  34.212  < 2e-16 ***
## ElectricBill  1.418e-03  2.100e-04   6.752 1.46e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15967  on 15921  degrees of freedom
## Residual deviance: 12792  on 15914  degrees of freedom
## AIC: 12808
## 
## Number of Fisher Scoring iterations: 5

#test 실행 
predict(my_glm, test, type ="prob") %>%  pull(Below)->pd 
  

roc(test$income, pd) -> my_auc




sen_spec_df <- data_frame(TPR = my_auc$sensitivities, 
                          FPR = 1 - my_auc$specificities, 
                          total = my_auc$sensitivities + my_auc$specificities, 
                          cutoff = my_auc$thresholds)


sen_spec_df %>% 
  ggplot(aes(x = FPR, ymin = 0, ymax = TPR))+
  geom_polygon(aes(y = TPR), fill = "red", alpha = 0.3) +
  geom_path(aes(y = TPR), col = "firebrick", size = 1.2) +
  geom_abline(intercept = 0, slope = 1, color = "gray37", size = 1, linetype = "dashed") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous(labels = scales::percent) + 
  theme_bw() +
  coord_equal() +
  labs(x = "FPR (1 - Specificity)", 
       y = "TPR (Sensitivity)", 
       title = "Model Performance for Logistic Model based on Test Data", 
       subtitle = paste0("AUC Value: ", my_auc$auc %>% round(4)))

predict(my_glm, test, type ="raw") %>% 
  as.factor()-> pred_glm


confusionMatrix(pred_glm, test$income)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Below Above
##      Below  5242  1008
##      Above   212   361
##                                           
##                Accuracy : 0.8212          
##                  95% CI : (0.8119, 0.8302)
##     No Information Rate : 0.7994          
##     P-Value [Acc > NIR] : 2.681e-06       
##                                           
##                   Kappa : 0.2874          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9611          
##             Specificity : 0.2637          
##          Pos Pred Value : 0.8387          
##          Neg Pred Value : 0.6300          
##              Prevalence : 0.7994          
##          Detection Rate : 0.7683          
##    Detection Prevalence : 0.9160          
##       Balanced Accuracy : 0.6124          
##                                           
##        'Positive' Class : Below           
##

Classification Analysis

DOEUN

2020 4 23

Decision Tree

American Cummunity Survey - xgboost

Logistic Regression