k means 实操练习2：

Data preperation:

数据下载：https://archive.ics.uci.edu/ml/datasets/Online+Retail

整体观察，有多少消费者，有多少缺失值
我们在消费者层面做研究，所以剔除那些缺失CustomerID的样本
将日期这一变量的格式规范化
选取一整年的数据（强迫症）：
消费者行为特点具有地域性，我们选取一个国家作为研究对象吧：
看来数据主要来自英国。
这些收据里面，有的是购物的，有的是退货的，有必要做一下区分。C表示cancel退货。

onlineRetail <- read.csv("https://www.dropbox.com/s/zemwk29vgvgb87t/Online%20Retail.csv?dl=1")

str(onlineRetail)

## 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
##  $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
##  $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: Factor w/ 23260 levels "1/10/11 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...

用dplyr package 的pipe operator 结合select()等函数，检查多少样本缺了CustomerID

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

onlineRetail %>%
  select(CustomerID) %>%
  is.na() %>%
  sum()

## [1] 135080

使用filter()排除缺失了CustomerID的样本，新得到的数据命名为onlineRetail_new：

onlineRetail_new <- onlineRetail %>%
  filter(!is.na(CustomerID))

将变量InvoiceDate的格式修改为日期格式：

onlineRetail_new<-onlineRetail_new %>%
  mutate(InvoiceDate=as.Date(InvoiceDate, format = "%m/%d/%y"))

查看InvoiceDate的日期范围

range(onlineRetail_new$InvoiceDate)

## [1] "2010-12-01" "2011-12-09"

取一整年数据作为研究对象：

onlineRetail_new <- subset(onlineRetail_new, InvoiceDate >= "2010-12-09")

确认我们的数据的时间范围：

range(onlineRetail_new$InvoiceDate)

## [1] "2010-12-09" "2011-12-09"

查看商品销售的国家分布：

table(onlineRetail_new$Country)

## 
##            Australia              Austria              Bahrain 
##                 1237                  401                   17 
##              Belgium               Brazil               Canada 
##                 2057                   32                  151 
##      Channel Islands               Cyprus       Czech Republic 
##                  758                  622                   30 
##              Denmark                 EIRE   European Community 
##                  389                 7351                   61 
##              Finland               France              Germany 
##                  695                 8313                 9274 
##               Greece            Hong Kong              Iceland 
##                  146                    0                  151 
##               Israel                Italy                Japan 
##                  250                  778                  342 
##              Lebanon            Lithuania                Malta 
##                   45                    0                  127 
##          Netherlands               Norway               Poland 
##                 2369                  939                  333 
##             Portugal                  RSA         Saudi Arabia 
##                 1413                   58                   10 
##            Singapore                Spain               Sweden 
##                  229                 2528                  462 
##          Switzerland United Arab Emirates       United Kingdom 
##                 1871                   68               349806 
##          Unspecified                  USA 
##                  244                  291

选取英国作为研究对象：

onlineRetail_new <- subset(onlineRetail_new, Country == "United Kingdom")

查看销售来自多少次交易，和多少个消费者：

length(unique(onlineRetail_new$InvoiceNo))

## [1] 19140

length(unique(onlineRetail_new$CustomerID))

## [1] 3891

识别并标记哪些是属于退货，哪些是属于新购买：

# Identify returns
onlineRetail_new$item.return <- grepl("C", onlineRetail_new$InvoiceNo, fixed=TRUE) 
onlineRetail_new$purchase.invoice <- ifelse(onlineRetail_new$item.return=="TRUE", 0, 1)

做Customer segmentation，我们最关注的是each customer’s recency of last purchase, frequency of purchase, and monetary value. These three variables, collectively known as RFM, are often used in customer segmentation for marketing purposes。具体的参见维基百科：https://en.wikipedia.org/wiki/RFM_(customer_value)

重构数据：以消费者为单位 Create customer-level data

我们现在的数据是收据-商品水平的，一个消费者可能多次消费有多张收据，一张收据上有多个商品。我们要做的是消费者层面的分析，所以要重构数据，以消费者为单位。

具体地，

recency，每个消费者上一次消费距离现在有多少天？
frequency，消费的次数是多少，即是每一个人收据的个数？
Monetary value，每个消费者消费总额多少？
有些人的Monetary value是负的，这可能是因为今年退货了去年买的东西，我们把负的统一设置成0。
recency

除掉退货的，我们只考虑最新的购物。
我们把分析时间定在最近的一张收据的后一天，所以Recency是最近一次消费距离“2011-12-10”的天数。

library(dplyr)
recency <- onlineRetail_new %>%
  filter(purchase.invoice == 1) %>%
  mutate(recency = as.numeric(difftime("2011-12-10", as.Date(InvoiceDate)), units="days")
         )

recency <- aggregate(recency ~ CustomerID, data=recency, FUN=min, na.rm=TRUE)

Frequency

library(dplyr)
frequency<- onlineRetail_new %>%
  filter(purchase.invoice == 1) %>%
  select("CustomerID","InvoiceNo", "purchase.invoice") %>%
  arrange(CustomerID)

frequency <- aggregate(purchase.invoice ~ CustomerID, data=frequency, FUN=sum, na.rm=TRUE)

colnames(frequency)[colnames(frequency)=="purchase.invoice"] <- "frequency"

Monetary Value of Customers

# Total spent on each item on an invoice
annual.sales <- onlineRetail_new %>%
  filter(purchase.invoice == 1) %>%
  mutate(Amount = Quantity*UnitPrice)

# Aggregated total sales to customer
annual.sales <- aggregate(Amount ~ CustomerID, data=annual.sales, FUN=sum, na.rm=TRUE)
names(annual.sales)[names(annual.sales)=="Amount"] <- "monetary"

以CustomerID为纽带，合并三个数据框，得到以消费者为单位的新数据：

# merge all three variables
customers <- left_join(recency, frequency, by="CustomerID") %>%
  left_join(.,annual.sales, by="CustomerID")

将为负数的年消费额修改为0:

summary(customers$monetary)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0    296.7    642.4   1827.2   1554.5 233736.9

customers$monetary <- ifelse(customers$monetary < 0, 0, customers$monetary)

我们用kmeans的方法来做集聚分析，可以把这个数据里的消费者分成几个组？
从1到10都试一下，根据肘子法则，方差下降最大的k为最合适的。(Total within-cluster sum of squares, i.e. sum(withinss).)

library(purrr)
library(ggplot2)
library(dplyr)
  
# Use map_dbl to run many models with varying value of k (centers)
tot_withinss <- map_dbl(1:10,  function(k){
  model <- kmeans(x = customers[,2:4], centers = k)
  model$tot.withinss
})

# Generate a data frame containing both k and tot_withinss
elbow_df <- data.frame(
  k = 1:10,
  tot_withinss = tot_withinss
)

# Plot the elbow plot
ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() +
  scale_x_continuous(breaks = 1:10)

根据上面的结果，运用kmeans() 来聚类成2组，将结果合并到原数据，检查分组特征：

# Build a kmeans model
model_km <- kmeans(customers[,2:4], centers = 2)

# Extract the cluster assignment vector from the kmeans model
clust_km <- model_km$cluster

# Create a new dataframe appending the cluster assignment
customers_segment <- mutate(customers, cluster = clust_km)

customers_segment %>% 
  group_by(cluster) %>% 
  summarise_all(funs(mean(.)))

## # A tibble: 2 x 5
##   cluster CustomerID recency frequency monetary
##     <int>      <dbl>   <dbl>     <dbl>    <dbl>
## 1       1     15556.    89.0      85.3    1480.
## 2       2     15524.    28.7     956.    90844.

炫酷的3D图：

customers<-read.csv("https://www.dropbox.com/s/3bs78srrnjuwcmu/customers.csv?dl=1")
library(purrr)
library(car)
library(rgl)
library(dplyr)
library(mgcv)

# Plot clusters in 3D

colors <- c('red','orange','green3','deepskyblue','blue','darkorchid4','violet','pink1','tan3','black')

# Build a kmeans model
model_km <- kmeans(customers[,2:4], centers = 4)

# Extract the cluster assignment vector from the kmeans model
clust_km <- model_km$cluster

# Create a new dataframe appending the cluster assignment
customers_segment <- mutate(customers, cluster = clust_km)

scatter3d(x = log(customers_segment$frequency), 
          y = log(customers_segment$monetary),
          z = log(customers_segment$recency), 
          groups = factor(customers_segment$cluster),
          xlab = "Frequency (Log-transformed)", 
          ylab = "Monetary Value (log-transformed)",
          zlab = "Recency (Log-transformed)",
          surface.col = colors,
          axis.scales = FALSE,
          surface = TRUE, # produces the horizonal planes through the graph at each level of monetary value
          fit = "smooth",
          #     ellipsoid = TRUE, # to graph ellipses uses this command and set "surface = " to FALSE
          grid = TRUE,
          axis.col = c("black", "black", "black"))

remove(colors)

完结撒花！