K- MEANS

WINE

'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' -> url

read.table(url, header = FALSE, 
           sep = ',', stringsAsFactors = FALSE,
           col.names = c("Cultivar", "Alcohol", "Malic.acid", "Ash",
                         "Alcalinity" ,"Magnesium",
                         "Total.phenols", "Flavanoid",
                         "nonflavanoid.phenols", "Proanthocyanin",
                         "Color.intensityy", "Hue",
                         "diulted.wines", "Proline")) -> wine

glimpse(wine)

## Observations: 178
## Variables: 14
## $ Cultivar             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ Alcohol              <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.3...
## $ Malic.acid           <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15...
## $ Ash                  <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61...
## $ Alcalinity           <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6...
## $ Magnesium            <int> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98,...
## $ Total.phenols        <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60...
## $ Flavanoid            <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51...
## $ nonflavanoid.phenols <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31...
## $ Proanthocyanin       <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25...
## $ Color.intensityy     <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05...
## $ Hue                  <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06...
## $ diulted.wines        <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58...
## $ Proline              <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295,...

# 첫번째 cultivar 제외 

wine[,-1] -> wineTrain


# K-MEAN 주의 사항 
# 1. 데이터는 numeric 으로 scales 필요 (Categorical variable 은 사용 할 수 없다)
# 2. Cluster 객수 확인 - 난수가 적용해야함으로 시드를 결정한다. 

set.seed(1004)

# 임의로 K를 3개로 할당 

wineK3 <- kmeans(x=wineTrain, centers = 3) # K를 3으로 주고 

wineK3

## K-means clustering with 3 clusters of sizes 47, 69, 62
## 
## Cluster means:
##    Alcohol Malic.acid      Ash Alcalinity Magnesium Total.phenols Flavanoid
## 1 13.80447   1.883404 2.426170   17.02340 105.51064      2.867234  3.014255
## 2 12.51667   2.494203 2.288551   20.82319  92.34783      2.070725  1.758406
## 3 12.92984   2.504032 2.408065   19.89032 103.59677      2.111129  1.584032
##   nonflavanoid.phenols Proanthocyanin Color.intensityy       Hue diulted.wines
## 1            0.2853191       1.910426         5.702553 1.0782979      3.114043
## 2            0.3901449       1.451884         4.086957 0.9411594      2.490725
## 3            0.3883871       1.503387         5.650323 0.8839677      2.365484
##     Proline
## 1 1195.1489
## 2  458.2319
## 3  728.3387
## 
## Clustering vector:
##   [1] 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 1 1 3 3 1 1 3 1 1 1 1 1 1 3 3
##  [38] 1 1 3 3 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 2 2 3 2 2 3 3 3 2 2 1
##  [75] 3 2 2 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 2 2 3 3 2 3 2 3 2 2 2 3 2 2 2 2 3 2
## [112] 2 3 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 3 2 2 3 3 3 3 2 2 2 3 3 2 2 3 3 2 3
## [149] 3 2 2 2 2 3 3 3 2 3 3 3 2 3 2 3 3 2 3 3 3 3 2 2 3 3 3 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 1360950.5  443166.7  566572.5
##  (between_SS / total_SS =  86.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# K를 3으로 준 내용을 시각화처리 
library(useful)

plot(wineK3, data= wineTrain)

# 원래의 cultivar 를 shape으로 코딩해서, 군집화된 결과와 원래의 데이터를 비교 

plot(wineK3, data = wine, class = "Cultivar")

# 최초의 랜덤 조건에 따라 영향을 받을 수 있는 알고리즘. nstart 인자 

set.seed(1234)

kmeans(x=wineTrain, centers = 3, nstart = 25) -> wine3N25

wineK3$size

## [1] 47 69 62

wine3N25$size

## [1] 47 69 62

적절한 군집 갯수 찾아보기 방법: FitMenas (하티건) & Gap 통계량

FitKMeans(wineTrain, max.clusters = 20, nstart = 25, seed = 278613) -> wineBest

PlotHartigan(wineBest)

#대략 14-15개의 군집을 사용해야하지만, 데이터에 3개의 품종인것을 알고 있기 때문에 그닥 중요하진 않다.

실제 그룹의 cluster 와 3개를 할당해준 wine3n25개를 비교 Cross - table

table(wine$Cultivar, wine3N25$cluster)

##    
##      1  2  3
##   1 46  0 13
##   2  1 50 20
##   3  0 19 29

# 실제 데이터와 K-mean를 사용한 데이터의 혼동 메트릭스 시각화 

plot(table(wine$Cultivar, wine3N25$cluster),
           main = "Confusion Matrix for Wine Clustering",
           xlab = "Cultivar", ylab ="Cluster")

Gap 통계량 - 군집 내 상이도를 데이터의 것과 붓스트랩 샘플로 얻은 데이터의 것을 서로 비교한다.

실제와 기대의 차이를 확인 - 우리는 이미 3개로 나눠진것을 알고 있기 때문에 가능

library(cluster)

clusGap(wineTrain, FUNcluster = pam, K.max = 20) -> theGAP

as.data.frame(theGAP$Tab) -> gapDF

gapDF  # 갭 통계량이 급격하게 낮아지는 군집의 갯수 -> 시각화 필요

##        logW   E.logW       gap     SE.sim
## 1  9.655294 9.939941 0.2846472 0.03514345
## 2  8.987942 9.255181 0.2672389 0.03329494
## 3  8.617563 8.870678 0.2531152 0.02869671
## 4  8.370194 8.587259 0.2170650 0.03116535
## 5  8.193144 8.387801 0.1946568 0.02820239
## 6  7.979259 8.236368 0.2571090 0.02988772
## 7  7.819287 8.098896 0.2796087 0.03104365
## 8  7.685612 7.991956 0.3063439 0.02629217
## 9  7.591487 7.899753 0.3082661 0.02329127
## 10 7.496676 7.820809 0.3241326 0.02353644
## 11 7.398811 7.752443 0.3536316 0.02285021
## 12 7.340516 7.692275 0.3517596 0.02365140
## 13 7.269456 7.640784 0.3713283 0.02412040
## 14 7.224292 7.593502 0.3692103 0.02428222
## 15 7.157981 7.551405 0.3934239 0.02679342
## 16 7.104300 7.511463 0.4071627 0.02767741
## 17 7.054116 7.475877 0.4217612 0.02774118
## 18 7.006179 7.439694 0.4335148 0.02724484
## 19 6.971455 7.406621 0.4351667 0.02809914
## 20 6.932463 7.374723 0.4422595 0.02842890

ggplot(gapDF, aes(x=1:nrow(gapDF)))+
  geom_line(aes(y=logW), color= "Blue")+
  geom_point(aes(y=logW), color= "Blue")+
  geom_line(aes(y=E.logW), color = "red")+
  geom_point(aes(y=E.logW), color ="red")+
  labs(x= "Number of Cluster")+
  theme_fivethirtyeight()+
  geom_text(label = "관찰되는 군집 내 상이도", x=5, y=7, color="blue")+
  geom_text(label = "기대되는 군집 내 상이도", x= 12, y=9, color ="red")

# Gap 차이 

ggplot(gapDF, aes(x=1:nrow(gapDF)))+
        geom_line(aes(y= gap), color="red")+
         geom_point(aes(y= gap), color= "red")+
         geom_errorbar(aes(ymin = gap-SE.sim, ymax = gap + SE.sim), color="Red")

갭 통계량을 확인해본 결과 5개의 군집이 괜찮다.

KNN

winH <- hclust(d= dist(wineTrain))

plot(winH)

winH1 <- hclust(dist(wineTrain), method = "single")
winH2 <- hclust(dist(wineTrain), method = "complete")
winH3 <- hclust(dist(wineTrain), method = "average")
winH4 <- hclust(dist(wineTrain), method = "centroid")

KNN - Linkage method 단일, 완전, 평균, 중게중심

plot(winH1, labels = FALSE, main ="Single")

plot(winH2, labels = FALSE, main ="complete")

plot(winH3, labels = FALSE, main ="average")

plot(winH4, labels = FALSE, main ="centroid")

plot(winH)

rect.hclust(winH, k=3,  border = "red")
rect.hclust(winH, k=13, border = "blue")

Market Segmentation - KMEANS

setwd("C:/Users/Administrator/Desktop/BIG DATA")

read.csv("commerce.csv") -> df

df %>% 
  sample_n(10) %>% 
  select(1:4) %>% 
  kable()

InvoiceNo	StockCode	Description	Quantity
541811	85099B	JUMBO BAG RED RETROSPOT	9
536415	22632	HAND WARMER RED RETROSPOT	3
578314	22697	GREEN REGENCY TEACUP AND SAUCER	24
545162	85014A	BLACK/BLUE POLKADOT UMBRELLA	6
575325	84580	MOUSE TOY WITH PINK T-SHIRT	1
567853	23290	SPACEBOY CHILDRENS BOWL	4
546542	22077	6 RIBBONS RUSTIC CHARM	2
541871	37342	POLKADOT COFFEE CUP & SAUCER PINK	2
566399	22561	WOODEN SCHOOL COLOURING SET	12
547365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	12

colSums(is.na(df))

##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0           0           0           0           0 
##  CustomerID     Country 
##      135080           0

df %>% 
  filter(Quantity > 0, UnitPrice > 0) -> df_one

df_one %>% 
  mutate(time_ymd_hm = mdy_hm(InvoiceDate), 
         time_hour = hour(time_ymd_hm), 
         time_min = minute(time_ymd_hm), 
         w_day = wday(time_ymd_hm, label = TRUE, abbr = TRUE), 
         time_mon = month(time_ymd_hm, label = TRUE, abbr = TRUE), 
         time_ymd = InvoiceDate %>% str_split(" ", simplify = TRUE) 
         %>% data.frame() %>% pull(X1) %>% mdy) -> my_df

my_df %>% 
  group_by(time_ymd_hm) %>% 
  summarise(sales = sum(Quantity)) %>% 
  ungroup() -> sales_figure

sales_figure %>% 
  ggplot(aes(x= time_ymd_hm, y= sales))+
  geom_line(color = "#073763" )+
  theme_fivethirtyeight()+
  labs(title = "Sales Figure : Unit Sales by Mintues and Day",
       subtitle = "Sales flow chart",
       caption = "Data Source : Kaggle")+
  theme(axis.text.y =  element_text(color ="grey30", size=10))+ 
  theme(axis.text.x = element_text())+
  theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) + 
  theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial", 
                                    face = "bold")) + 
  theme(axis.text.y = element_text(color = "grey30", size = 10)) + 
  theme(axis.text.x = element_text(color = "grey30", size = 10)) +
  theme(plot.title = element_text(size= 15))+
  theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) + 
  theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))

my_df %>% 
  group_by(time_ymd) %>% 
  summarise(sale = sum(Quantity)) %>% 
  ungroup() -> sale_figure2

DT:: datatable(sale_figure2)

highchart() %>% 
  hc_add_series(sale_figure2, hcaes(x= time_ymd, y = sale), type= "line") %>% 
  hc_tooltip(crosshairs = TRUE) %>% 
  hc_title(text = "Sales figures by YMD") %>% 
  hc_add_theme(hc_theme_economist())

sale_figure2 %>% 
  ggplot(aes(x= time_ymd, y= sale))+
  geom_line(color = "#073763", size= 0.3)+
  geom_point(color = "indianred2")+
  theme_fivethirtyeight()+
  labs(title = "Sales Figure : Unit Sales by Year and Month",
       subtitle = "Sales flow chart 2",
       caption = "Data Source : Kaggle")+
  theme(axis.text.y =  element_text(color ="grey30", size=10))+ 
  theme(axis.text.x = element_text())+
  theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) + 
  theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial", 
                                    face = "bold")) + 
  theme(axis.text.y = element_text(color = "grey30", size = 10)) + 
  theme(axis.text.x = element_text(color = "grey30", size = 10)) +
  theme(plot.title = element_text(size= 15))+
  theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) + 
  theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))

my_df %>% 
  mutate(profit  = Quantity * UnitPrice) -> my_df

my_df %>% 
  group_by(time_ymd_hm) %>% 
  summarise(profit = round(sum(profit),0)) -> sales_figure3

sales_figure3 %>% 
  ggplot(aes(x= time_ymd_hm, y= profit))+
  geom_line(color = "#073763" )+
  theme_fivethirtyeight()+
  labs(title = "Sales Figure : Sales Profit by min",
       subtitle = "Sales flow chart-3",
       caption = "Data Source : Kaggle")+
  theme(axis.text.y =  element_text(color ="grey30", size=10))+ 
  theme(axis.text.x = element_text())+
  theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) + 
  theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial", 
                                    face = "bold")) + 
  theme(axis.text.y = element_text(color = "grey30", size = 10)) + 
  theme(axis.text.x = element_text(color = "grey30", size = 10)) +
  theme(plot.title = element_text(size= 15))+
  theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) + 
  theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))

my_df %>% 
  group_by(time_mon) %>% 
  summarise(profit_mon = sum(profit)) %>% 
  mutate(profit100 = round(profit_mon/ 1000,0)) %>% 
  ggplot(aes(time_mon, profit100))+
  geom_col(fill = "#eca221", col= "white")+
  theme(panel.grid.major.x = element_blank())+ 
  theme_fivethirtyeight()+
    labs(title = "Sales Figure : Sales Profit by Month",
       subtitle = "Sales flow chart-3",
       caption = "Data Source : Kaggle")+
  annotate("text", label= "The Highest Profit", x= 9.5, y=1700, size =3.5, color = "grey30")+
    geom_curve(x= 10, xend =11, y = 1600, yend = 1500, arrow = arrow(length = unit(0.5,"cm")), curvature =0.1)+
  theme(axis.text.y =  element_text(color ="grey30", size=10))+ 
  theme(axis.text.x = element_text())+
  theme(text = element_text(family = "Arial Black", size = 13, color = "gray30")) + 
  theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial", 
                                    face = "bold")) + 
  theme(axis.text.y = element_text(color = "grey30", size = 10)) + 
  theme(axis.text.x = element_text(color = "grey30", size = 10)) +
  theme(plot.title = element_text(size= 15))+
  theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) + 
  theme(plot.margin = unit(c(1.4, 1.4, 1.4, 1.4), "cm"))

my_df %>% 
  group_by(time_mon) %>% 
  summarise(profit_mon = sum(profit)) %>% 
  mutate(profit100 = round(profit_mon/ 1000,0)) -> figure4

figure4

## # A tibble: 12 x 3
##    time_mon profit_mon profit100
##    <ord>         <dbl>     <dbl>
##  1 1           691365.       691
##  2 2           523632.       524
##  3 3           717639.       718
##  4 4           537809.       538
##  5 5           770536.       771
##  6 6           761740.       762
##  7 7           719221.       719
##  8 8           759138.       759
##  9 9          1058590.      1059
## 10 10         1154979.      1155
## 11 11         1509496.      1509
## 12 12         1462539.      1463

highchart() %>% 
  hc_add_series(figure4, hcaes(time_mon, profit100), type= "column") %>%  
  hc_add_series(figure4, hcaes(time_mon, profit100), type= "line") %>% 
  hc_xAxis(categories = c("JAN", "FEB", "MAR", "APR", "MAY",
                          "JUNE", "JULY", "AUG","SEP","OCT", "NOV", "DEC")) %>% 
  hc_yAxis(labels = list(style = list(fontSize = 10)),
             title = list(text = "Sales / 1000",
             style = list(fontSize = 10))) %>% 
  hc_tooltip(pointFormat = paste("Month: <b> {point.time_mon} </b> <br> 
                                 Sales: <b>{point.y}</b>")) %>%
  hc_title(text = "Profits per Month and",
           style = (list(fontSize = '14px'))) %>% 
  hc_add_theme(hc_theme_flat())

my_df %>% 
  group_by(Description) %>% 
  summarise(sales= sum(profit)) %>% 
  ungroup() %>% 
  arrange(-sales) %>% 
  mutate(Description = factor(Description), levels = Description) %>% 
  slice(1:20) -> top_items

top_items

## # A tibble: 20 x 3
##    Description                           sales levels                           
##    <fct>                                 <dbl> <fct>                            
##  1 "DOTCOM POSTAGE"                    206249. "DOTCOM POSTAGE"                 
##  2 "REGENCY CAKESTAND 3 TIER"          174485. "REGENCY CAKESTAND 3 TIER"       
##  3 "PAPER CRAFT , LITTLE BIRDIE"       168470. "PAPER CRAFT , LITTLE BIRDIE"    
##  4 "WHITE HANGING HEART T-LIGHT HOLDE~ 106293. "WHITE HANGING HEART T-LIGHT HOL~
##  5 "PARTY BUNTING"                      99504. "PARTY BUNTING"                  
##  6 "JUMBO BAG RED RETROSPOT"            94340. "JUMBO BAG RED RETROSPOT"        
##  7 "MEDIUM CERAMIC TOP STORAGE JAR"     81701. "MEDIUM CERAMIC TOP STORAGE JAR" 
##  8 "Manual"                             78113. "Manual"                         
##  9 "POSTAGE"                            78102. "POSTAGE"                        
## 10 "RABBIT NIGHT LIGHT"                 66965. "RABBIT NIGHT LIGHT"             
## 11 "PAPER CHAIN KIT 50'S CHRISTMAS "    64952. "PAPER CHAIN KIT 50'S CHRISTMAS "
## 12 "ASSORTED COLOUR BIRD ORNAMENT"      59095. "ASSORTED COLOUR BIRD ORNAMENT"  
## 13 "CHILLI LIGHTS"                      54118. "CHILLI LIGHTS"                  
## 14 "SPOTTY BUNTING"                     42548. "SPOTTY BUNTING"                 
## 15 "JUMBO BAG PINK POLKADOT"            42436. "JUMBO BAG PINK POLKADOT"        
## 16 "BLACK RECORD COVER FRAME"           40652. "BLACK RECORD COVER FRAME"       
## 17 "PICNIC BASKET WICKER 60 PIECES"     39620. "PICNIC BASKET WICKER 60 PIECES" 
## 18 "DOORMAT KEEP CALM AND COME IN"      38167. "DOORMAT KEEP CALM AND COME IN"  
## 19 "SET OF 3 CAKE TINS PANTRY DESIGN "  38158. "SET OF 3 CAKE TINS PANTRY DESIG~
## 20 "JAM MAKING SET WITH JARS"           37129. "JAM MAKING SET WITH JARS"

DT::datatable(top_items)

ggplot(top_items, aes(x= reorder(Description,sales)))+
  geom_col(aes(y= sales), fill= "#eca221")+
  geom_line(aes(y= sales), size= 0.8, group=1, color = "Black")+
  labs(x= "Items", y= "Sales $",title = "Sales Figure : Sales Profit by Month",
       subtitle = "Sales flow chart-3",
       caption = "Data Source : Kaggle")+
  theme(text = element_text(family = "Arial Black", size = 8, color = "gray30")) + 
  theme(plot.caption = element_text(size = 10, color = "grey40", family = "Arial", 
                                    face = "bold")) + 
  theme(axis.text.y = element_text(color = "grey30", size = 8)) + 
  theme(axis.text.x = element_blank())+ 
  theme(plot.title = element_text(size= 13))+
  theme(plot.subtitle = element_text(color = "gray30", size = 10, family = "Arial")) + 
  theme(plot.background = element_rect(fill="#F0F0F0"))+
  theme(plot.margin = unit(c(1.2, 1.2, 1.2, 1.2), "cm"))+
  annotate("text", label = "DOTCOM POSTAGE", x= 18, y=200000, size= 3.0)

highchart() %>% 
  hc_add_series(top_items, hcaes(Description, sales), type= "column") %>%  
  hc_add_series(top_items, hcaes(Description, sales), type= "line") %>% 
  hc_yAxis(labels = list(style = list(fontSize = 10)),
             title = list(text = "sales",
             style = list(fontSize = 10))) %>% 
  hc_xAxis(title= list(text= NULL)) %>% 
  hc_tooltip(pointFormat = paste("Description: <b> {point.Description} </b> <br> 
                                  Sales: <b>{point.y}</b>")) %>%
  hc_title(text = "Top 20 Items by sales figure",
           style = (list(fontSize = '14px'))) %>% 
  hc_add_theme(hc_theme_flatdark()) -> p1


htmltools::tagList(p1)

Customer Segments

y <- as.duration(ymd_hm("2011-12-31 24:59") - my_df$time_ymd_hm) %>% as.numeric()
y <- round(y / (3600*24), 0)

my_df %>%  mutate(receny = y) ->my_df

my_df$CustomerID <- as.character(my_df$CustomerID)

my_df %>% 
  group_by(CustomerID) %>% 
  summarise_each(funs(sum), profit) %>% 
  ungroup() %>% 
  na.omit() -> profit_df  #고객 ID 별 Profit 


my_df %>% 
  group_by(CustomerID) %>% 
  summarise_each(funs(min), receny)  %>% 
  ungroup() %>% 
  na.omit() -> receny_df    #Reency 모음

my_df %>% 
  group_by(CustomerID) %>% 
  count() %>%  
  ungroup() %>% 
  na.omit()-> count_df  ## 고객 Count


# Coustemer ID 기준으로 

profit_df %>% 
  mutate(CustomerID = as.character(CustomerID)) %>% 
  full_join(receny_df, by ="CustomerID") %>% 
  full_join(count_df, bu = "CustomerID") -> final_df



DT::datatable(final_df)

## OUtliers 확인 


boxplot(final_df$profit)

boxplot(final_df$receny)

boxplot(final_df$n)

이상치 제거

outlier_label <- function(x) {
  a <- mean(x)
  b <- sd(x)
  th1 <- a - 3*b
  th2 <- a + 3*b
  y <- case_when(x >= th1 & x <= th2 ~ "Normal", TRUE ~ "Outlier")
  return(y)
  
}


final_df %>% 
  mutate(nor_money = outlier_label(profit), nor_freq = outlier_label(n)) %>%  
  filter(nor_money == "Normal", nor_freq == "Normal") %>% 
  select(1:4)-> df_allnormal -> df_allnormal

boxplot(df_allnormal$profit)

Optimal- K 찾기

하티건 그래프

#Numeric Data 만 선별 

df_allnormal[, sapply(final_df, is.numeric)] -> K_data



#정규화 
as.data.frame(scale(K_data)) -> scale_data



# Optimal K 

FitKMeans(K_data, max.clusters = 15, nstart = 25, seed= 125423) -> bestk

PlotHartigan(bestk)

Elbow Method - SS/TOtal_SS 비율이 적게 움직이는 Point 를 찾아보기.

5 개로 클러스터링 하는 것이 적합해 보임

set.seed(123)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15

wss <- sapply(1:k.max, 
              function(k){kmeans(scale_data, k, nstart=50,
                                 iter.max = 15)$tot.withinss})
wss

##  [1] 12861.000  8044.794  4787.072  3550.839  2815.099  2411.752  2114.481
##  [8]  1858.031  1704.857  1525.015  1417.678  1294.715  1227.693  1150.527
## [15]  1082.664

plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

set.seed(1004)

km_one <- kmeans(scale_data, 4, nstart = 30)

df_allnormal %>% 
  mutate(K_group = km_one$cluster) %>% 
  mutate(K_group = paste("Group", K_group))  -> Kmean_df


# Groups 별 EDA 

Kmean_df %>% 
  group_by(K_group) %>% 
  summarise_each(funs(mean), profit, receny, n) %>% 
  mutate_if(is.numeric, function(x){round(x,0)}) %>% 
  ungroup() %>% 
  arrange(-profit) -> Kgroup_df


DT::datatable(Kgroup_df)

Group 1 은 평균 14100 정도로 들어가있는 고객들은 2628 명 (VIP로 분류)

Group 3 은 평균 3786 정도 지불하지만 (VIP 로 올라갈 수 있는 가망 고객들) - 타켓 설정

#그룹평 평균 sales/ receny/ n 

Kgroup_df %>% 
  group_by(K_group) %>% 
  summarise_each(funs(mean), profit) %>% 
  mutate(sale_per = profit/sum(profit)*100) %>% 
  mutate(sale_per = round(sale_per,0))-> sales_bygroup



ggplot(sales_bygroup, aes(reorder(K_group,sale_per), sale_per), fill=K_group, color= K_group)+
  geom_col(fill = "#eca221", col= "white")+
  theme_fivethirtyeight()+
  coord_flip()+
  geom_text(aes(label = paste0(sale_per,"%"), hjust=-0.05))+
  scale_fill_tableau() + 
  scale_color_tableau() + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  theme(panel.grid.major = element_blank()) + 
  theme(panel.grid.minor = element_blank())+ 
  labs(title = "Sales by Customer Groups (K-MEAN)",
        caption = "Data Source : Kaggle")+
  theme(plot.title = element_text(size=13))

str(Kgroup_df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    4 obs. of  4 variables:
##  $ K_group: chr  "Group 4" "Group 2" "Group 1" "Group 3"
##  $ profit : num  14100 3786 965 439
##  $ receny : num  38 47 70 273
##  $ n      : num  323 268 53 26

as.factor(Kgroup_df$K_group) -> Kgroup_df$K_group

Kgroup_df %>% 
  mutate(pct= round(n/sum(n) *100,0)) %>% 
  mutate(pct = paste(pct, "%")) %>% 
  ggplot(aes(K_group, pct, fill = K_group, color= K_group),)+
  geom_col()+
  theme_fivethirtyeight()+
  coord_flip()+
  geom_text(aes(label = paste0(pct,"%"), hjust=-0.05))+
  scale_fill_tableau() + 
  scale_color_tableau() + 
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) + 
  theme(panel.grid.major = element_blank()) + 
  theme(panel.grid.minor = element_blank())+ 
  labs(title = "The Number of Customer Groups (K-MEAN)",
        caption = "Data Source : Kaggle")+
  theme(plot.title = element_text(size=13))+
  theme(legend.position = "none")

Random_Forest

library(caret)

Kmean_df%>% 
  mutate(Group = as.factor(K_group)) %>% 
  select(-CustomerID, -K_group)  -> new_df



set.seed(1004)

flag <- createDataPartition(new_df$Group, p = 0.8, list = FALSE)

train <- new_df[flag,]
test <- new_df[-flag,]


train(Group ~ ., method= "rf", data=train) -> rf

## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

predict(rf, test) -> pred

confusionMatrix(pred, test$Group)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Group 1 Group 2 Group 3 Group 4
##    Group 1     525       2       1       0
##    Group 2       0     103       1       0
##    Group 3       1       0     205       0
##    Group 4       0       1       0      17
## 
## Overall Statistics
##                                           
##                Accuracy : 0.993           
##                  95% CI : (0.9848, 0.9974)
##     No Information Rate : 0.6145          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9872          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Group 1 Class: Group 2 Class: Group 3
## Sensitivity                  0.9981         0.9717         0.9903
## Specificity                  0.9909         0.9987         0.9985
## Pos Pred Value               0.9943         0.9904         0.9951
## Neg Pred Value               0.9970         0.9960         0.9969
## Prevalence                   0.6145         0.1238         0.2418
## Detection Rate               0.6133         0.1203         0.2395
## Detection Prevalence         0.6168         0.1215         0.2407
## Balanced Accuracy            0.9945         0.9852         0.9944
##                      Class: Group 4
## Sensitivity                 1.00000
## Specificity                 0.99881
## Pos Pred Value              0.94444
## Neg Pred Value              1.00000
## Prevalence                  0.01986
## Detection Rate              0.01986
## Detection Prevalence        0.02103
## Balanced Accuracy           0.99940

test %>% 
  mutate(Predicted = pred) %>% 
  group_by(Predicted) %>% 
  summarise_each(funs(mean), profit, receny, n) %>% 
  mutate_if(is.numeric, function(x){round(x,0)}) -> test_df


DT::datatable(test_df)

train %>% 
  group_by(Group) %>% 
  summarise_each(funs(mean), profit, receny, n) %>% 
  mutate_if(is.numeric, function(x){round(x,0)}) -> train_df

DT::datatable(train_df)

K-MEANS Clustering

DOEUN

2020-04-07

K- MEANS

KNN

Market Segmentation - KMEANS

Customer Segments

Random_Forest