K- MEANS
WINE
'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' -> url
read.table(url, header = FALSE,
sep = ',', stringsAsFactors = FALSE,
col.names = c("Cultivar", "Alcohol", "Malic.acid", "Ash",
"Alcalinity" ,"Magnesium",
"Total.phenols", "Flavanoid",
"nonflavanoid.phenols", "Proanthocyanin",
"Color.intensityy", "Hue",
"diulted.wines", "Proline")) -> wine
glimpse(wine)## Observations: 178
## Variables: 14
## $ Cultivar <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ Alcohol <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.3...
## $ Malic.acid <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15...
## $ Ash <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61...
## $ Alcalinity <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6...
## $ Magnesium <int> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98,...
## $ Total.phenols <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60...
## $ Flavanoid <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51...
## $ nonflavanoid.phenols <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31...
## $ Proanthocyanin <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25...
## $ Color.intensityy <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05...
## $ Hue <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06...
## $ diulted.wines <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58...
## $ Proline <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295,...
# 첫번째 cultivar 제외
wine[,-1] -> wineTrain
# K-MEAN 주의 사항
# 1. 데이터는 numeric 으로 scales 필요 (Categorical variable 은 사용 할 수 없다)
# 2. Cluster 객수 확인 - 난수가 적용해야함으로 시드를 결정한다.
set.seed(1004)
# 임의로 K를 3개로 할당
wineK3 <- kmeans(x=wineTrain, centers = 3) # K를 3으로 주고
wineK3## K-means clustering with 3 clusters of sizes 47, 69, 62
##
## Cluster means:
## Alcohol Malic.acid Ash Alcalinity Magnesium Total.phenols Flavanoid
## 1 13.80447 1.883404 2.426170 17.02340 105.51064 2.867234 3.014255
## 2 12.51667 2.494203 2.288551 20.82319 92.34783 2.070725 1.758406
## 3 12.92984 2.504032 2.408065 19.89032 103.59677 2.111129 1.584032
## nonflavanoid.phenols Proanthocyanin Color.intensityy Hue diulted.wines
## 1 0.2853191 1.910426 5.702553 1.0782979 3.114043
## 2 0.3901449 1.451884 4.086957 0.9411594 2.490725
## 3 0.3883871 1.503387 5.650323 0.8839677 2.365484
## Proline
## 1 1195.1489
## 2 458.2319
## 3 728.3387
##
## Clustering vector:
## [1] 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 1 1 3 3 1 1 3 1 1 1 1 1 1 3 3
## [38] 1 1 3 3 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 2 2 3 2 2 3 3 3 2 2 1
## [75] 3 2 2 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 2 2 3 3 2 3 2 3 2 2 2 3 2 2 2 2 3 2
## [112] 2 3 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 3 2 2 3 3 3 3 2 2 2 3 3 2 2 3 3 2 3
## [149] 3 2 2 2 2 3 3 3 2 3 3 3 2 3 2 3 3 2 3 3 3 3 2 2 3 3 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 1360950.5 443166.7 566572.5
## (between_SS / total_SS = 86.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# 최초의 랜덤 조건에 따라 영향을 받을 수 있는 알고리즘. nstart 인자
set.seed(1234)
kmeans(x=wineTrain, centers = 3, nstart = 25) -> wine3N25## [1] 47 69 62
## [1] 47 69 62
적절한 군집 갯수 찾아보기 방법: FitMenas (하티건) & Gap 통계량
FitKMeans(wineTrain, max.clusters = 20, nstart = 25, seed = 278613) -> wineBest
PlotHartigan(wineBest)실제 그룹의 cluster 와 3개를 할당해준 wine3n25개를 비교 Cross - table
##
## 1 2 3
## 1 46 0 13
## 2 1 50 20
## 3 0 19 29
# 실제 데이터와 K-mean를 사용한 데이터의 혼동 메트릭스 시각화
plot(table(wine$Cultivar, wine3N25$cluster),
main = "Confusion Matrix for Wine Clustering",
xlab = "Cultivar", ylab ="Cluster")Gap 통계량 - 군집 내 상이도를 데이터의 것과 붓스트랩 샘플로 얻은 데이터의 것을 서로 비교한다.
실제와 기대의 차이를 확인 - 우리는 이미 3개로 나눠진것을 알고 있기 때문에 가능
library(cluster)
clusGap(wineTrain, FUNcluster = pam, K.max = 20) -> theGAP
as.data.frame(theGAP$Tab) -> gapDF
gapDF # 갭 통계량이 급격하게 낮아지는 군집의 갯수 -> 시각화 필요 ## logW E.logW gap SE.sim
## 1 9.655294 9.939941 0.2846472 0.03514345
## 2 8.987942 9.255181 0.2672389 0.03329494
## 3 8.617563 8.870678 0.2531152 0.02869671
## 4 8.370194 8.587259 0.2170650 0.03116535
## 5 8.193144 8.387801 0.1946568 0.02820239
## 6 7.979259 8.236368 0.2571090 0.02988772
## 7 7.819287 8.098896 0.2796087 0.03104365
## 8 7.685612 7.991956 0.3063439 0.02629217
## 9 7.591487 7.899753 0.3082661 0.02329127
## 10 7.496676 7.820809 0.3241326 0.02353644
## 11 7.398811 7.752443 0.3536316 0.02285021
## 12 7.340516 7.692275 0.3517596 0.02365140
## 13 7.269456 7.640784 0.3713283 0.02412040
## 14 7.224292 7.593502 0.3692103 0.02428222
## 15 7.157981 7.551405 0.3934239 0.02679342
## 16 7.104300 7.511463 0.4071627 0.02767741
## 17 7.054116 7.475877 0.4217612 0.02774118
## 18 7.006179 7.439694 0.4335148 0.02724484
## 19 6.971455 7.406621 0.4351667 0.02809914
## 20 6.932463 7.374723 0.4422595 0.02842890
ggplot(gapDF, aes(x=1:nrow(gapDF)))+
geom_line(aes(y=logW), color= "Blue")+
geom_point(aes(y=logW), color= "Blue")+
geom_line(aes(y=E.logW), color = "red")+
geom_point(aes(y=E.logW), color ="red")+
labs(x= "Number of Cluster")+
theme_fivethirtyeight()+
geom_text(label = "관찰되는 군집 내 상이도", x=5, y=7, color="blue")+
geom_text(label = "기대되는 군집 내 상이도", x= 12, y=9, color ="red")# Gap 차이
ggplot(gapDF, aes(x=1:nrow(gapDF)))+
geom_line(aes(y= gap), color="red")+
geom_point(aes(y= gap), color= "red")+
geom_errorbar(aes(ymin = gap-SE.sim, ymax = gap + SE.sim), color="Red")갭 통계량을 확인해본 결과 5개의 군집이 괜찮다.
KNN
winH1 <- hclust(dist(wineTrain), method = "single")
winH2 <- hclust(dist(wineTrain), method = "complete")
winH3 <- hclust(dist(wineTrain), method = "average")
winH4 <- hclust(dist(wineTrain), method = "centroid")KNN - Linkage method 단일, 완전, 평균, 중게중심
Market Segmentation - KMEANS
setwd("C:/Users/Administrator/Desktop/BIG DATA")
read.csv("commerce.csv") -> df
df %>%
sample_n(10) %>%
select(1:4) %>%
kable()| InvoiceNo | StockCode | Description | Quantity |
|---|---|---|---|
| 541811 | 85099B | JUMBO BAG RED RETROSPOT | 9 |
| 536415 | 22632 | HAND WARMER RED RETROSPOT | 3 |
| 578314 | 22697 | GREEN REGENCY TEACUP AND SAUCER | 24 |
| 545162 | 85014A | BLACK/BLUE POLKADOT UMBRELLA | 6 |
| 575325 | 84580 | MOUSE TOY WITH PINK T-SHIRT | 1 |
| 567853 | 23290 | SPACEBOY CHILDRENS BOWL | 4 |
| 546542 | 22077 | 6 RIBBONS RUSTIC CHARM | 2 |
| 541871 | 37342 | POLKADOT COFFEE CUP & SAUCER PINK | 2 |
| 566399 | 22561 | WOODEN SCHOOL COLOURING SET | 12 |
| 547365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 12 |
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 0 0 0 0
## CustomerID Country
## 135080 0
df_one %>%
mutate(time_ymd_hm = mdy_hm(InvoiceDate),
time_hour = hour(time_ymd_hm),
time_min = minute(time_ymd_hm),
w_day = wday(time_ymd_hm, label = TRUE, abbr = TRUE),
time_mon = month(time_ymd_hm, label = TRUE, abbr = TRUE),
time_ymd = InvoiceDate %>% str_split(" ", simplify = TRUE)
%>% data.frame() %>% pull(X1) %>% mdy) -> my_dfmy_df %>%
group_by(time_ymd_hm) %>%
summarise(sales = sum(Quantity)) %>%
ungroup() -> sales_figure
sales_figure %>%
ggplot(aes(x= time_ymd_hm, y= sales))+
geom_line(color = "#073763" )+
theme_fivethirtyeight()+
labs(title = "Sales Figure : Unit Sales by Mintues and Day",
subtitle = "Sales flow chart",
caption = "Data Source : Kaggle")+
theme(axis.text.y = element_text(color ="grey30", size=10))+
theme(axis.text.x = element_text())+
theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) +
theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial",
face = "bold")) +
theme(axis.text.y = element_text(color = "grey30", size = 10)) +
theme(axis.text.x = element_text(color = "grey30", size = 10)) +
theme(plot.title = element_text(size= 15))+
theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) +
theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))my_df %>%
group_by(time_ymd) %>%
summarise(sale = sum(Quantity)) %>%
ungroup() -> sale_figure2
DT:: datatable(sale_figure2)highchart() %>%
hc_add_series(sale_figure2, hcaes(x= time_ymd, y = sale), type= "line") %>%
hc_tooltip(crosshairs = TRUE) %>%
hc_title(text = "Sales figures by YMD") %>%
hc_add_theme(hc_theme_economist())sale_figure2 %>%
ggplot(aes(x= time_ymd, y= sale))+
geom_line(color = "#073763", size= 0.3)+
geom_point(color = "indianred2")+
theme_fivethirtyeight()+
labs(title = "Sales Figure : Unit Sales by Year and Month",
subtitle = "Sales flow chart 2",
caption = "Data Source : Kaggle")+
theme(axis.text.y = element_text(color ="grey30", size=10))+
theme(axis.text.x = element_text())+
theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) +
theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial",
face = "bold")) +
theme(axis.text.y = element_text(color = "grey30", size = 10)) +
theme(axis.text.x = element_text(color = "grey30", size = 10)) +
theme(plot.title = element_text(size= 15))+
theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) +
theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm")) my_df %>%
mutate(profit = Quantity * UnitPrice) -> my_df
my_df %>%
group_by(time_ymd_hm) %>%
summarise(profit = round(sum(profit),0)) -> sales_figure3
sales_figure3 %>%
ggplot(aes(x= time_ymd_hm, y= profit))+
geom_line(color = "#073763" )+
theme_fivethirtyeight()+
labs(title = "Sales Figure : Sales Profit by min",
subtitle = "Sales flow chart-3",
caption = "Data Source : Kaggle")+
theme(axis.text.y = element_text(color ="grey30", size=10))+
theme(axis.text.x = element_text())+
theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) +
theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial",
face = "bold")) +
theme(axis.text.y = element_text(color = "grey30", size = 10)) +
theme(axis.text.x = element_text(color = "grey30", size = 10)) +
theme(plot.title = element_text(size= 15))+
theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) +
theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))my_df %>%
group_by(time_mon) %>%
summarise(profit_mon = sum(profit)) %>%
mutate(profit100 = round(profit_mon/ 1000,0)) %>%
ggplot(aes(time_mon, profit100))+
geom_col(fill = "#eca221", col= "white")+
theme(panel.grid.major.x = element_blank())+
theme_fivethirtyeight()+
labs(title = "Sales Figure : Sales Profit by Month",
subtitle = "Sales flow chart-3",
caption = "Data Source : Kaggle")+
annotate("text", label= "The Highest Profit", x= 9.5, y=1700, size =3.5, color = "grey30")+
geom_curve(x= 10, xend =11, y = 1600, yend = 1500, arrow = arrow(length = unit(0.5,"cm")), curvature =0.1)+
theme(axis.text.y = element_text(color ="grey30", size=10))+
theme(axis.text.x = element_text())+
theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) +
theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial",
face = "bold")) +
theme(axis.text.y = element_text(color = "grey30", size = 10)) +
theme(axis.text.x = element_text(color = "grey30", size = 10)) +
theme(plot.title = element_text(size= 15))+
theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) +
theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))my_df %>%
group_by(time_mon) %>%
summarise(profit_mon = sum(profit)) %>%
mutate(profit100 = round(profit_mon/ 1000,0)) -> figure4
figure4## # A tibble: 12 x 3
## time_mon profit_mon profit100
## <ord> <dbl> <dbl>
## 1 1 691365. 691
## 2 2 523632. 524
## 3 3 717639. 718
## 4 4 537809. 538
## 5 5 770536. 771
## 6 6 761740. 762
## 7 7 719221. 719
## 8 8 759138. 759
## 9 9 1058590. 1059
## 10 10 1154979. 1155
## 11 11 1509496. 1509
## 12 12 1462539. 1463
highchart() %>%
hc_add_series(figure4, hcaes(time_mon, profit100), type= "column") %>%
hc_add_series(figure4, hcaes(time_mon, profit100), type= "line") %>%
hc_xAxis(categories = c("JAN", "FEB", "MAR", "APR", "MAY",
"JUNE", "JULY", "AUG","SEP","OCT", "NOV", "DEC")) %>%
hc_yAxis(labels = list(style = list(fontSize = 10)),
title = list(text = "Sales / 1000",
style = list(fontSize = 10))) %>%
hc_tooltip(pointFormat = paste("Month: <b> {point.time_mon} </b> <br>
Sales: <b>{point.y}</b>")) %>%
hc_title(text = "Profits per Month and",
style = (list(fontSize = '14px'))) %>%
hc_add_theme(hc_theme_flat())my_df %>%
group_by(Description) %>%
summarise(sales= sum(profit)) %>%
ungroup() %>%
arrange(-sales) %>%
mutate(Description = factor(Description), levels = Description) %>%
slice(1:20) -> top_items
top_items## # A tibble: 20 x 3
## Description sales levels
## <fct> <dbl> <fct>
## 1 "DOTCOM POSTAGE" 206249. "DOTCOM POSTAGE"
## 2 "REGENCY CAKESTAND 3 TIER" 174485. "REGENCY CAKESTAND 3 TIER"
## 3 "PAPER CRAFT , LITTLE BIRDIE" 168470. "PAPER CRAFT , LITTLE BIRDIE"
## 4 "WHITE HANGING HEART T-LIGHT HOLDE~ 106293. "WHITE HANGING HEART T-LIGHT HOL~
## 5 "PARTY BUNTING" 99504. "PARTY BUNTING"
## 6 "JUMBO BAG RED RETROSPOT" 94340. "JUMBO BAG RED RETROSPOT"
## 7 "MEDIUM CERAMIC TOP STORAGE JAR" 81701. "MEDIUM CERAMIC TOP STORAGE JAR"
## 8 "Manual" 78113. "Manual"
## 9 "POSTAGE" 78102. "POSTAGE"
## 10 "RABBIT NIGHT LIGHT" 66965. "RABBIT NIGHT LIGHT"
## 11 "PAPER CHAIN KIT 50'S CHRISTMAS " 64952. "PAPER CHAIN KIT 50'S CHRISTMAS "
## 12 "ASSORTED COLOUR BIRD ORNAMENT" 59095. "ASSORTED COLOUR BIRD ORNAMENT"
## 13 "CHILLI LIGHTS" 54118. "CHILLI LIGHTS"
## 14 "SPOTTY BUNTING" 42548. "SPOTTY BUNTING"
## 15 "JUMBO BAG PINK POLKADOT" 42436. "JUMBO BAG PINK POLKADOT"
## 16 "BLACK RECORD COVER FRAME" 40652. "BLACK RECORD COVER FRAME"
## 17 "PICNIC BASKET WICKER 60 PIECES" 39620. "PICNIC BASKET WICKER 60 PIECES"
## 18 "DOORMAT KEEP CALM AND COME IN" 38167. "DOORMAT KEEP CALM AND COME IN"
## 19 "SET OF 3 CAKE TINS PANTRY DESIGN " 38158. "SET OF 3 CAKE TINS PANTRY DESIG~
## 20 "JAM MAKING SET WITH JARS" 37129. "JAM MAKING SET WITH JARS"
ggplot(top_items, aes(x= reorder(Description,sales)))+
geom_col(aes(y= sales), fill= "#eca221")+
geom_line(aes(y= sales), size= 0.8, group=1, color = "Black")+
labs(x= "Items", y= "Sales $",title = "Sales Figure : Sales Profit by Month",
subtitle = "Sales flow chart-3",
caption = "Data Source : Kaggle")+
theme(text = element_text(family = "Arial Black", size = 8, color = "gray30")) +
theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial",
face = "bold")) +
theme(axis.text.y = element_text(color = "grey30", size = 8)) +
theme(axis.text.x = element_blank())+
theme(plot.title = element_text(size= 13))+
theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) +
theme(plot.background = element_rect(fill="#F0F0F0"))+
theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm"))+
annotate("text", label = "DOTCOM POSTAGE", x= 18, y=200000, size= 3.0)highchart() %>%
hc_add_series(top_items, hcaes(Description, sales), type= "column") %>%
hc_add_series(top_items, hcaes(Description, sales), type= "line") %>%
hc_yAxis(labels = list(style = list(fontSize = 10)),
title = list(text = "sales",
style = list(fontSize = 10))) %>%
hc_xAxis(title= list(text= NULL)) %>%
hc_tooltip(pointFormat = paste("Description: <b> {point.Description} </b> <br>
Sales: <b>{point.y}</b>")) %>%
hc_title(text = "Top 20 Items by sales figure",
style = (list(fontSize = '14px'))) %>%
hc_add_theme(hc_theme_flatdark()) -> p1
htmltools::tagList(p1)Customer Segments
y <- as.duration(ymd_hm("2011-12-31 24:59") - my_df$time_ymd_hm) %>% as.numeric()
y <- round(y / (3600*24), 0)
my_df %>% mutate(receny = y) ->my_df
my_df$CustomerID <- as.character(my_df$CustomerID)
my_df %>%
group_by(CustomerID) %>%
summarise_each(funs(sum), profit) %>%
ungroup() %>%
na.omit() -> profit_df #고객 ID 별 Profit
my_df %>%
group_by(CustomerID) %>%
summarise_each(funs(min), receny) %>%
ungroup() %>%
na.omit() -> receny_df #Reency 모음
my_df %>%
group_by(CustomerID) %>%
count() %>%
ungroup() %>%
na.omit()-> count_df ## 고객 Count
# Coustemer ID 기준으로
profit_df %>%
mutate(CustomerID = as.character(CustomerID)) %>%
full_join(receny_df, by ="CustomerID") %>%
full_join(count_df, bu = "CustomerID") -> final_df
DT::datatable(final_df)이상치 제거
outlier_label <- function(x) {
a <- mean(x)
b <- sd(x)
th1 <- a - 3*b
th2 <- a + 3*b
y <- case_when(x >= th1 & x <= th2 ~ "Normal", TRUE ~ "Outlier")
return(y)
}
final_df %>%
mutate(nor_money = outlier_label(profit), nor_freq = outlier_label(n)) %>%
filter(nor_money == "Normal", nor_freq == "Normal") %>%
select(1:4)-> df_allnormal -> df_allnormalOptimal- K 찾기
하티건 그래프
#Numeric Data 만 선별
df_allnormal[, sapply(final_df, is.numeric)] -> K_data
#정규화
as.data.frame(scale(K_data)) -> scale_data
# Optimal K
FitKMeans(K_data, max.clusters = 15, nstart = 25, seed= 125423) -> bestk
PlotHartigan(bestk) Elbow Method - SS/TOtal_SS 비율이 적게 움직이는 Point 를 찾아보기.
5 개로 클러스터링 하는 것이 적합해 보임
set.seed(123)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15
wss <- sapply(1:k.max,
function(k){kmeans(scale_data, k, nstart=50,
iter.max = 15)$tot.withinss})
wss## [1] 12861.000 8044.794 4787.072 3550.839 2815.099 2411.752 2114.481
## [8] 1858.031 1704.857 1525.015 1417.678 1294.715 1227.693 1150.527
## [15] 1082.664
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")set.seed(1004)
km_one <- kmeans(scale_data, 4, nstart = 30)
df_allnormal %>%
mutate(K_group = km_one$cluster) %>%
mutate(K_group = paste("Group", K_group)) -> Kmean_df
# Groups 별 EDA
Kmean_df %>%
group_by(K_group) %>%
summarise_each(funs(mean), profit, receny, n) %>%
mutate_if(is.numeric, function(x){round(x,0)}) %>%
ungroup() %>%
arrange(-profit) -> Kgroup_df
DT::datatable(Kgroup_df)Group 1 은 평균 14100 정도로 들어가있는 고객들은 2628 명 (VIP로 분류)
Group 3 은 평균 3786 정도 지불하지만 (VIP 로 올라갈 수 있는 가망 고객들) - 타켓 설정
#그룹평 평균 sales/ receny/ n
Kgroup_df %>%
group_by(K_group) %>%
summarise_each(funs(mean), profit) %>%
mutate(sale_per = profit/sum(profit)*100) %>%
mutate(sale_per = round(sale_per,0))-> sales_bygroup
ggplot(sales_bygroup, aes(reorder(K_group,sale_per), sale_per), fill=K_group, color= K_group)+
geom_col(fill = "#eca221", col= "white")+
theme_fivethirtyeight()+
coord_flip()+
geom_text(aes(label = paste0(sale_per,"%"), hjust=-0.05))+
scale_fill_tableau() +
scale_color_tableau() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
theme(panel.grid.major = element_blank()) +
theme(panel.grid.minor = element_blank())+
labs(title = "Sales by Customer Groups (K-MEAN)",
caption = "Data Source : Kaggle")+
theme(plot.title = element_text(size=13))## Classes 'tbl_df', 'tbl' and 'data.frame': 4 obs. of 4 variables:
## $ K_group: chr "Group 4" "Group 2" "Group 1" "Group 3"
## $ profit : num 14100 3786 965 439
## $ receny : num 38 47 70 273
## $ n : num 323 268 53 26
as.factor(Kgroup_df$K_group) -> Kgroup_df$K_group
Kgroup_df %>%
mutate(pct= round(n/sum(n) *100,0)) %>%
mutate(pct = paste(pct, "%")) %>%
ggplot(aes(K_group, pct, fill = K_group, color= K_group),)+
geom_col()+
theme_fivethirtyeight()+
coord_flip()+
geom_text(aes(label = paste0(pct,"%"), hjust=-0.05))+
scale_fill_tableau() +
scale_color_tableau() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
theme(panel.grid.major = element_blank()) +
theme(panel.grid.minor = element_blank())+
labs(title = "The Number of Customer Groups (K-MEAN)",
caption = "Data Source : Kaggle")+
theme(plot.title = element_text(size=13))+
theme(legend.position = "none") Random_Forest
library(caret)
Kmean_df%>%
mutate(Group = as.factor(K_group)) %>%
select(-CustomerID, -K_group) -> new_df
set.seed(1004)
flag <- createDataPartition(new_df$Group, p = 0.8, list = FALSE)
train <- new_df[flag,]
test <- new_df[-flag,]
train(Group ~ ., method= "rf", data=train) -> rf## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Confusion Matrix and Statistics
##
## Reference
## Prediction Group 1 Group 2 Group 3 Group 4
## Group 1 525 2 1 0
## Group 2 0 103 1 0
## Group 3 1 0 205 0
## Group 4 0 1 0 17
##
## Overall Statistics
##
## Accuracy : 0.993
## 95% CI : (0.9848, 0.9974)
## No Information Rate : 0.6145
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9872
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Group 1 Class: Group 2 Class: Group 3
## Sensitivity 0.9981 0.9717 0.9903
## Specificity 0.9909 0.9987 0.9985
## Pos Pred Value 0.9943 0.9904 0.9951
## Neg Pred Value 0.9970 0.9960 0.9969
## Prevalence 0.6145 0.1238 0.2418
## Detection Rate 0.6133 0.1203 0.2395
## Detection Prevalence 0.6168 0.1215 0.2407
## Balanced Accuracy 0.9945 0.9852 0.9944
## Class: Group 4
## Sensitivity 1.00000
## Specificity 0.99881
## Pos Pred Value 0.94444
## Neg Pred Value 1.00000
## Prevalence 0.01986
## Detection Rate 0.01986
## Detection Prevalence 0.02103
## Balanced Accuracy 0.99940