library(readr)
url <- "https://hmkang98.github.io/data/spss/csc_train.csv"
csc_data <- read_csv(url)
## Rows: 8068 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Ever_Married, Graduated, Profession, Spending_Score, Var_1,...
## dbl (4): ID, Age, Work_Experience, Family_Size
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(csc_data)
## # A tibble: 6 × 11
##       ID Gender Ever_Married   Age Graduated Profession    Work_Experience
##    <dbl> <chr>  <chr>        <dbl> <chr>     <chr>                   <dbl>
## 1 462809 Male   No              22 No        Healthcare                  1
## 2 462643 Female Yes             38 Yes       Engineer                   NA
## 3 466315 Female Yes             67 Yes       Engineer                    1
## 4 461735 Male   Yes             67 Yes       Lawyer                      0
## 5 462669 Female Yes             40 Yes       Entertainment              NA
## 6 461319 Male   Yes             56 No        Artist                      0
## # ℹ 4 more variables: Spending_Score <chr>, Family_Size <dbl>, Var_1 <chr>,
## #   Segmentation <chr>
str(csc_data)
## spc_tbl_ [8,068 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID             : num [1:8068] 462809 462643 466315 461735 462669 ...
##  $ Gender         : chr [1:8068] "Male" "Female" "Female" "Male" ...
##  $ Ever_Married   : chr [1:8068] "No" "Yes" "Yes" "Yes" ...
##  $ Age            : num [1:8068] 22 38 67 67 40 56 32 33 61 55 ...
##  $ Graduated      : chr [1:8068] "No" "Yes" "Yes" "Yes" ...
##  $ Profession     : chr [1:8068] "Healthcare" "Engineer" "Engineer" "Lawyer" ...
##  $ Work_Experience: num [1:8068] 1 NA 1 0 NA 0 1 1 0 1 ...
##  $ Spending_Score : chr [1:8068] "Low" "Average" "Low" "High" ...
##  $ Family_Size    : num [1:8068] 4 3 1 2 6 2 3 3 3 4 ...
##  $ Var_1          : chr [1:8068] "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
##  $ Segmentation   : chr [1:8068] "D" "A" "B" "B" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   Gender = col_character(),
##   ..   Ever_Married = col_character(),
##   ..   Age = col_double(),
##   ..   Graduated = col_character(),
##   ..   Profession = col_character(),
##   ..   Work_Experience = col_double(),
##   ..   Spending_Score = col_character(),
##   ..   Family_Size = col_double(),
##   ..   Var_1 = col_character(),
##   ..   Segmentation = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
sapply(csc_data, function(x) length(unique(x)))
##              ID          Gender    Ever_Married             Age       Graduated 
##            8068               2               3              67               3 
##      Profession Work_Experience  Spending_Score     Family_Size           Var_1 
##              10              16               3              10               8 
##    Segmentation 
##               4
missing_values <- sapply(csc_data, function(x) sum(is.na(x)))
missing_values
##              ID          Gender    Ever_Married             Age       Graduated 
##               0               0             140               0              78 
##      Profession Work_Experience  Spending_Score     Family_Size           Var_1 
##             124             829               0             335              76 
##    Segmentation 
##               0
table(csc_data$Ever_Married, useNA="ifany")
## 
##   No  Yes <NA> 
## 3285 4643  140
library(zoo)
## 
## 다음의 패키지를 부착합니다: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
csc_data$Ever_Married <- na.locf(csc_data$Ever_Married, na.rm = TRUE)
table(csc_data$Graduated, useNA="ifany")
## 
##   No  Yes <NA> 
## 3022 4968   78
csc_data$Graduated <- na.locf(csc_data$Graduated, na.rm = TRUE)
table(csc_data$Profession, useNA="ifany")
## 
##        Artist        Doctor      Engineer Entertainment     Executive 
##          2516           688           699           949           599 
##    Healthcare     Homemaker        Lawyer     Marketing          <NA> 
##          1332           246           623           292           124
csc_data$Profession <- na.locf(csc_data$Profession, na.rm = TRUE)
table(csc_data$Work_Experience, useNA="ifany")
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 <NA> 
## 2318 2354  286  255  253  194  204  196  463  474   53   50   48   46   45  829
csc_data$Work_Experience <- na.locf(csc_data$Work_Experience, na.rm = TRUE)
table(csc_data$Family_Size, useNA="ifany")
## 
##    1    2    3    4    5    6    7    8    9 <NA> 
## 1453 2390 1497 1379  612  212   96   50   44  335
csc_data$Family_Size <- na.locf(csc_data$Family_Size, na.rm = TRUE)
table(csc_data$Var_1, useNA="ifany")
## 
## Cat_1 Cat_2 Cat_3 Cat_4 Cat_5 Cat_6 Cat_7  <NA> 
##   133   422   822  1089    85  5238   203    76
csc_data$Var_1 <- na.locf(csc_data$Var_1, na.rm = TRUE)
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
csc_data <- csc_data %>% select(-ID)
csc_data$Segmentation <- factor(csc_data$Segmentation, labels=c("A", "B", "C", "D"))
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
set.seed(90)
samples <- createDataPartition(csc_data$Segmentation, p=0.7, list=FALSE)
train <- csc_data[samples, ]
test <- csc_data[-samples, ]
model <- train(Segmentation ~ ., data=train, method="glmnet")
final_model <- train(Segmentation ~ ., data=train, method="glmnet", tuneGrid=data.frame(alpha=0.1, lambda=0.003998122), metric="Accuracy")
coefficients <- coef(final_model$finalModel, s=final_model$bestTune$lambda)
test_predictor <- predict(final_model, test)
confusionMatrix(data=test_predictor, reference=test$Segmentation, positive="alive")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D
##          A 302 180  72 147
##          B  69 125 100  31
##          C 114 190 345  32
##          D 106  62  74 470
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5134          
##                  95% CI : (0.4933, 0.5335)
##     No Information Rate : 0.2811          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3484          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D
## Sensitivity            0.5110  0.22442   0.5838   0.6912
## Specificity            0.7817  0.89259   0.8162   0.8608
## Pos Pred Value         0.4308  0.38462   0.5066   0.6601
## Neg Pred Value         0.8318  0.79370   0.8585   0.8770
## Prevalence             0.2443  0.23026   0.2443   0.2811
## Detection Rate         0.1248  0.05167   0.1426   0.1943
## Detection Prevalence   0.2898  0.13435   0.2815   0.2943
## Balanced Accuracy      0.6464  0.55850   0.7000   0.7760
new_row <- tibble(
  Gender = "Male",
  Ever_Married = "Yes",
  Age = 50,
  Graduated = "Yes",
  Profession = "Engineer",
  Work_Experience = 5,
  Spending_Score = "Average",
  Family_Size = 5,
  Var_1 = "Cat_4",
  Segmentation = factor("A", levels = c("A", "B", "C", "D"))
)
predict(final_model, newdata=new_row)
## [1] B
## Levels: A B C D