library(readr)
url <- "https://hmkang98.github.io/data/spss/csc_train.csv"
csc_data <- read_csv(url)
## Rows: 8068 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Ever_Married, Graduated, Profession, Spending_Score, Var_1,...
## dbl (4): ID, Age, Work_Experience, Family_Size
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(csc_data)
## # A tibble: 6 × 11
## ID Gender Ever_Married Age Graduated Profession Work_Experience
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 462809 Male No 22 No Healthcare 1
## 2 462643 Female Yes 38 Yes Engineer NA
## 3 466315 Female Yes 67 Yes Engineer 1
## 4 461735 Male Yes 67 Yes Lawyer 0
## 5 462669 Female Yes 40 Yes Entertainment NA
## 6 461319 Male Yes 56 No Artist 0
## # ℹ 4 more variables: Spending_Score <chr>, Family_Size <dbl>, Var_1 <chr>,
## # Segmentation <chr>
str(csc_data)
## spc_tbl_ [8,068 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:8068] 462809 462643 466315 461735 462669 ...
## $ Gender : chr [1:8068] "Male" "Female" "Female" "Male" ...
## $ Ever_Married : chr [1:8068] "No" "Yes" "Yes" "Yes" ...
## $ Age : num [1:8068] 22 38 67 67 40 56 32 33 61 55 ...
## $ Graduated : chr [1:8068] "No" "Yes" "Yes" "Yes" ...
## $ Profession : chr [1:8068] "Healthcare" "Engineer" "Engineer" "Lawyer" ...
## $ Work_Experience: num [1:8068] 1 NA 1 0 NA 0 1 1 0 1 ...
## $ Spending_Score : chr [1:8068] "Low" "Average" "Low" "High" ...
## $ Family_Size : num [1:8068] 4 3 1 2 6 2 3 3 3 4 ...
## $ Var_1 : chr [1:8068] "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
## $ Segmentation : chr [1:8068] "D" "A" "B" "B" ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. Gender = col_character(),
## .. Ever_Married = col_character(),
## .. Age = col_double(),
## .. Graduated = col_character(),
## .. Profession = col_character(),
## .. Work_Experience = col_double(),
## .. Spending_Score = col_character(),
## .. Family_Size = col_double(),
## .. Var_1 = col_character(),
## .. Segmentation = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
sapply(csc_data, function(x) length(unique(x)))
## ID Gender Ever_Married Age Graduated
## 8068 2 3 67 3
## Profession Work_Experience Spending_Score Family_Size Var_1
## 10 16 3 10 8
## Segmentation
## 4
missing_values <- sapply(csc_data, function(x) sum(is.na(x)))
missing_values
## ID Gender Ever_Married Age Graduated
## 0 0 140 0 78
## Profession Work_Experience Spending_Score Family_Size Var_1
## 124 829 0 335 76
## Segmentation
## 0
table(csc_data$Ever_Married, useNA="ifany")
##
## No Yes <NA>
## 3285 4643 140
library(zoo)
##
## 다음의 패키지를 부착합니다: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
csc_data$Ever_Married <- na.locf(csc_data$Ever_Married, na.rm = TRUE)
table(csc_data$Graduated, useNA="ifany")
##
## No Yes <NA>
## 3022 4968 78
csc_data$Graduated <- na.locf(csc_data$Graduated, na.rm = TRUE)
table(csc_data$Profession, useNA="ifany")
##
## Artist Doctor Engineer Entertainment Executive
## 2516 688 699 949 599
## Healthcare Homemaker Lawyer Marketing <NA>
## 1332 246 623 292 124
csc_data$Profession <- na.locf(csc_data$Profession, na.rm = TRUE)
table(csc_data$Work_Experience, useNA="ifany")
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 <NA>
## 2318 2354 286 255 253 194 204 196 463 474 53 50 48 46 45 829
csc_data$Work_Experience <- na.locf(csc_data$Work_Experience, na.rm = TRUE)
table(csc_data$Family_Size, useNA="ifany")
##
## 1 2 3 4 5 6 7 8 9 <NA>
## 1453 2390 1497 1379 612 212 96 50 44 335
csc_data$Family_Size <- na.locf(csc_data$Family_Size, na.rm = TRUE)
table(csc_data$Var_1, useNA="ifany")
##
## Cat_1 Cat_2 Cat_3 Cat_4 Cat_5 Cat_6 Cat_7 <NA>
## 133 422 822 1089 85 5238 203 76
csc_data$Var_1 <- na.locf(csc_data$Var_1, na.rm = TRUE)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
csc_data <- csc_data %>% select(-ID)
csc_data$Segmentation <- factor(csc_data$Segmentation, labels=c("A", "B", "C", "D"))
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
set.seed(90)
samples <- createDataPartition(csc_data$Segmentation, p=0.7, list=FALSE)
train <- csc_data[samples, ]
test <- csc_data[-samples, ]
model <- train(Segmentation ~ ., data=train, method="glmnet")
final_model <- train(Segmentation ~ ., data=train, method="glmnet", tuneGrid=data.frame(alpha=0.1, lambda=0.003998122), metric="Accuracy")
coefficients <- coef(final_model$finalModel, s=final_model$bestTune$lambda)
test_predictor <- predict(final_model, test)
confusionMatrix(data=test_predictor, reference=test$Segmentation, positive="alive")
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D
## A 302 180 72 147
## B 69 125 100 31
## C 114 190 345 32
## D 106 62 74 470
##
## Overall Statistics
##
## Accuracy : 0.5134
## 95% CI : (0.4933, 0.5335)
## No Information Rate : 0.2811
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3484
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D
## Sensitivity 0.5110 0.22442 0.5838 0.6912
## Specificity 0.7817 0.89259 0.8162 0.8608
## Pos Pred Value 0.4308 0.38462 0.5066 0.6601
## Neg Pred Value 0.8318 0.79370 0.8585 0.8770
## Prevalence 0.2443 0.23026 0.2443 0.2811
## Detection Rate 0.1248 0.05167 0.1426 0.1943
## Detection Prevalence 0.2898 0.13435 0.2815 0.2943
## Balanced Accuracy 0.6464 0.55850 0.7000 0.7760
new_row <- tibble(
Gender = "Male",
Ever_Married = "Yes",
Age = 50,
Graduated = "Yes",
Profession = "Engineer",
Work_Experience = 5,
Spending_Score = "Average",
Family_Size = 5,
Var_1 = "Cat_4",
Segmentation = factor("A", levels = c("A", "B", "C", "D"))
)
predict(final_model, newdata=new_row)
## [1] B
## Levels: A B C D