Project Overview

自动售货机以线上经营的理念，提供线下的便利服务，以小巧、自助的经营模式节省人工成本，让实惠、高品质的商品触手可及，成为当下零售经营的又一主流模式。自动售货机内商品的供给频率、种类选择、供给量、站点选择等是自动售货机运者需要重点关注的问题。因此，科学的商业数据分析能够帮助经营者了解用户需求，掌握商品需求量，为用户提供精准贴心的服务，是掌握经营方向的重要手段，对自动售货机这一营销模式的发展有着非常重要的意义。某商场在不同地点安放了5台自动售货机,，编号分别为 A 、 B 、 C 、 D 、 E 。提供了从 201 7 年 1 月 1 日至 201 7 年 12 月 3 1 日每台自动售货机的商品销售数据。

Access to the Dataset

library(tidyverse)
# csv中包含中文，所以encoding = 'gb2312'
file1 <- read_csv("file1.csv", locale = locale(encoding = "gb2312"))
file2 <- read_csv("file2.csv", locale = locale(encoding = "gb2312"))
names(file1) <- c("order_ID", "machine_ID", "sum_owing", "sum_pay", "product", "date", 
    "location", "state", "withdraw")
names(file2) <- c("product", "General_Category", "Sub_Category")

library(lubridate)  # 处理时间数据
file1$date <- ymd_hm(file1$date)
file1 <- file1 %>% mutate(month = month(date), day = day(date), hour = hour(date))
# 查看数据结构 str(file1) str(file2)

Discover and Visualize the Data to Gain Insights

A. Extract machine sales data, respectively

为了进行针对性分析，我们首先从汇总的数据集中分别提取出A、B、C、D、E机器的数据，并将它们一并储存在列表当中，方便后续调用。

# 改变策略，将五个机器的数据分别储存在list里面
locations <- c("A", "B", "C", "D", "E")
M_data <- vector("list", 5)
names(M_data) <- c("Machine_A", "Machine_B", "Machine_C", "Machine_D", "Machine_E")
for (i in seq_along(locations)) {
    M_data[[i]] <- filter(file1, location == locations[[i]])
}
# 编写函数进行储存
csv_names <- c("task1-1A.csv", "task1-1B.csv", "task1-1C.csv", "task1-1D.csv", "task1-1E.csv")
for (i in seq_along(names(M_data))) {
    write.csv(M_data[[i]], csv_names[[i]])
}

B. Explore the data in April

我们首先通过提取四月份的数据，对整体数据集进行初探，通过对机器的销售额，销售量，以及整体销售情况的分析，得以对整体数据集有所了解。

data_201704 <- file1 %>% filter(year(date) == 2017, month(date) == 4)

GMV of each machine

## 每台售货机的交易额
data_201704 %>% group_by(location) %>% summarise(Total_trading_value = sum(sum_pay)) %>% 
    arrange(desc(Total_trading_value)) %>% DT::datatable()

在四月份，E售货机与C售货机都有着较高的销售额，B机器的销售额中规中矩，而反观A与D却分别只有1804元与1679元的低水平销售额度。

Order quantity of each machine

## 每台售货机的订单量
data_201704 %>% group_by(location) %>% summarise(Total_trading_amount = n()) %>% 
    arrange(desc(Total_trading_amount)) %>% DT::datatable()

同样的，从销售量角度来看，E机器与C机器同样处于领先位置，E作为销售冠军在四月份的销售量高达895件，高于D机器销售量的两倍。

Total GMV and quantity in April

## 所有交易额
NA

[1] NA

## 所有订单量
NA

[1] NA

所有机器在四月份所达成了3122次交易，为投资者创造了12896.4元的收入。通过对四月份数据的探索我们对无人售货机的销售情况有了初步的了解，在接下来的部分我们将扩大分析层面，帮助投资者进行商业分析，获取更大收益。

C. Visualization

数据可视化能让我们将商业规律直观的呈现于图表之上，帮助我们从复杂繁琐的数据中提取出可视的信息以便于我们后续建模的决策。在此部分我们将通过可视化的方式展现：

年度畅销商品
月度销售额的变化趋势
交易额月环比增长率
售货机交易额与订单量关系
畅销类别
三种时间层面上的销售趋势

TOP 5 Order Amount

首先，我们想知道2017年的畅销商品是什么，于是通过条形图的方法进行了汇总展示。如下图所示，怡宝纯净水在销售量上遥遥领先，年销量接近5000件；第二梯队有脉动，东鹏特饮，阿萨姆奶茶与营养快线，它们的年销售量都处于2200到2700内；统一冰红茶处于第三梯队，不过也有着1891件售出的好成绩。

bar_data_top5 <- as.data.frame(table(file1$product)) %>% arrange(desc(Freq))

names(bar_data_top5) <- c("product", "Freq")

ggplot(data = head(bar_data_top5), aes(x = product, y = Freq)) + geom_bar(aes(fill = product), 
    stat = "identity") + geom_text(aes(label = Freq), hjust = 1.5, colour = "black") + 
    theme_bw() + labs(title = "2017年销售前五商品") + coord_flip() + theme(legend.position = "none", 
    axis.title.x = element_text(face = "italic", colour = "black", size = 12), axis.title.y = element_text(face = "italic", 
        colour = "black", size = 12))

Plot monthly GMV

接下来我们首先对销售数据进行了按月汇总，如此一来我们便能通过绘制折线图得出销售额的月度变化趋势，在此之上，我们还对数据按机器进行了分类，从而能够帮助分析者精准的得到每台售货机独自的销售趋势。

整体趋势：在整体上看，从一月份到六月份，所有无人售货机都有着稳步增长的趋势，并在六月份达到顶峰，但却在七月份有着明显的大幅度下降，随后销售量又缓缓回升，最终达到与六月份峰值上下的位置。
个别趋势：当我们把视线转移到每台售货机上，可以发现A、B、C三台机器的月度销售额变化趋势是相近的。D机器的销售状态一直比较疲软，在六月份到七月份它的销售额没有大幅度下降，但是在17年其余时刻它的销售额也并没有非常好的表现。反观E机器，在经历过七月份销售额走跌后，它迅速回升，上涨幅度很大，并在11月份达到了20000元左右的销售量，由此我们应该对E机器的属性进行研究，争取扩展到其他机器上，为我们带来更大的收益。

## 折线图
value_per_month <- file1 %>% group_by(location, month) %>% summarise(value = sum(sum_pay))
value_per_month <- na.omit(value_per_month)

p_monthly_GMV <- ggplot(data = value_per_month) + geom_line(aes(month, value), color = "black", 
    size = 1) + geom_point(aes(month, value), color = "red", size = 3, alpha = 2/3) + 
    facet_wrap(~location, nrow = 5) + scale_x_continuous(breaks = c(1, 2, 3, 4, 5, 
    6, 7, 8, 9, 10, 11, 12), labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", 
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) + theme_bw()

p_monthly_GMV

Quarter-on-quarter growth rate

通过上面的折线图分析，在此部分我们想要更直观的得到各个机器的交易月环比增长率，从而得出哪些月份交易额有着较大的上涨，哪些月份交易额又有着大幅度的下跌。从下图中我们可以发现，在折线图中不易观测到的，二月份普遍有着中等程度的销售额下跌。与在折线图中一致的是，所有机器的交易额都在七月份有着大幅度的骤跌，在其余月份总体上都在小幅度增长。

growth_rate <- function(data) {
    rating <- data %>% group_by(month) %>% summarise(value = sum(sum_pay))
    rating <- rating %>% mutate(rate = c(0, diff(value)/rating$value[-1]))
    rating <- rating[-1, ]
    ggplot(rating) + geom_bar(aes(month, rate), stat = "identity", fill = "black", 
        color = "gray") + theme_bw() + scale_x_continuous(breaks = c(2, 3, 4, 5, 
        6, 7, 8, 9, 10, 11, 12), labels = c("Feb", "Mar", "Apr", "May", "Jun", "Jul", 
        "Aug", "Sep", "Oct", "Nov", "Dec"))
}

plotlist <- vector("list", 5)
names(plotlist) <- c("p_A", "p_B", "p_C", "p_D", "p_E")
library(stringr)
title <- str_c(LETTERS[1:5], "机器交易月环比增长率")

for (i in c(1:5)) {
    plotlist[[i]] <- growth_rate(M_data[[i]]) + labs(title = title[[i]])
}
Rmisc::multiplot(plotlist = plotlist, cols = 2)

Bubble Plot

bubble_value <- file1 %>% group_by(location) %>% summarise(value = sum(sum_pay)) %>% 
    ggplot(aes(location, value)) + geom_point(aes(size = value, color = location), 
    alpha = 2/3) + ggrepel::geom_label_repel(aes(label = value), data = file1 %>% 
    group_by(location) %>% summarise(value = sum(sum_pay))) + theme_bw()
# bubble_value

bubble_amount <- file1 %>% group_by(location) %>% summarise(n = n()) %>% ggplot(aes(location, 
    n)) + geom_point(aes(size = n, color = location), alpha = 2/3) + ggrepel::geom_label_repel(aes(label = n), 
    data = file1 %>% group_by(location) %>% summarise(n = n())) + theme_bw()
# bubble_amount

Rmisc::multiplot(bubble_amount, bubble_value, cols = 1)

Order Amount Analysis

在可视化的第一部分，我们已经分析出了一些畅销产品，而在此部分中，我们想知道，在不同的机器中，分别有哪些类别的商品比较畅销，然后通过分析结果来优化我们的产品投放。

bar_data <- as.data.frame(table(file1$product)) %>% arrange(desc(Freq))
names(bar_data) <- c("product", "Freq")

join_data <- bar_data %>% left_join(file2, by = "product")
breaks <- c(-Inf, 20, 100, Inf)
labels <- c("滞销", "正常", "热销")
join_data <- join_data %>% mutate(tag = cut(Freq, breaks, labels))

product_bar_data <- join_data %>% group_by(Sub_Category) %>% summarise(Freq = sum(Freq)) %>% 
    arrange(desc(Freq))

plot_All <- ggplot(product_bar_data, aes(reorder(Sub_Category, Freq), Freq)) + geom_bar(stat = "identity", 
    aes(fill = Sub_Category)) + geom_text(aes(label = Freq), hjust = 1.5, colour = "black") + 
    coord_flip() + theme_minimal() + labs(y = "Freq", x = "Sub_Category", title = "销售量汇总图") + 
    theme(legend.position = "none")

fun_all <- function(x, title) {
    as.data.frame(table(x$product)) %>% arrange(desc(Freq)) %>% mutate(product = Var1) %>% 
        select(product, Freq) %>% left_join(file2, by = "product") %>% mutate(tag = cut(Freq, 
        breaks, labels)) %>% group_by(Sub_Category) %>% summarise(Freq = sum(Freq)) %>% 
        arrange(desc(Freq)) %>% ggplot(aes(reorder(Sub_Category, Freq), Freq)) + 
        geom_bar(stat = "identity", aes(fill = Sub_Category)) + geom_text(aes(label = Freq), 
        hjust = 1.5, colour = "black") + coord_flip() + theme_minimal() + labs(y = "Freq", 
        x = "Sub_Category", title = title) + theme(legend.position = "none")
}
## Plot
plotlist2 <- vector("list", 5)
names(plotlist2) <- c("p_A", "p_B", "p_C", "p_D", "p_E")
title2 <- str_c(LETTERS[1:5], "机器销售量分析")

for (i in c(1:5)) {
    plotlist2[[i]] <- fun_all(M_data[[i]], title2[[i]])
}
plotlist2$plot_All <- plot_All

p <- Rmisc::multiplot(plotlist = plotlist2, cols = 3)

NULL

A机器销量分析

热销产品	滞销产品
肉干/豆制品/蛋 (1942 pcs)	果冻/龟苓膏 (4 pcs)
茶饮料 (1430 pcs)	坚果炒货 (17 pcs)
乳制品 (1269 pcs)	香烟 (25 pcs)
功能饮料 (1109 pcs)	纸巾 (25 pcs)
饼干高点 (789 pcs)	其他 (37 pcs)

B机器销量分析

热销产品	滞销产品
茶饮料 (2144 pcs)	其他 (37 pcs)
肉干/豆制品/蛋 (1942 pcs)	果冻/龟苓膏 (13 pcs)
乳制品 (1660 pcs)	纸巾 (23 pcs)
水 (1408 pcs)	糖果/巧克力 (25 pcs)
功能饮料 (1337 pcs)	坚果炒货 (28 pcs)

C机器销量分析

热销产品	滞销产品
茶饮料 (2530 pcs)	果冻/龟苓膏 (4 pcs)
肉干/豆制品/蛋 (1848 pcs)	其他 (11 pcs)
功能饮料 (1783 pcs)	坚果炒货 (21 pcs)
乳制品 (1661 pcs)	香烟 (24 pcs)
碳酸饮料 (1276 pcs)	糖果/巧克力 (26 pcs)

D机器销量分析

热销产品	滞销产品
肉干/豆制品/蛋 (1540 pcs)	纸巾 (6 pcs)
茶饮料 (1501 pcs)	坚果炒货 (11 pcs)
功能饮料 (1028 pcs)	糖果巧克力 (22 pcs)
乳制品 (969 pcs)	香烟 (22 pcs)
碳酸饮料 (939 pcs)	果冻/龟苓膏 (23 pcs)

E机器销量分析

热销产品	滞销产品
茶饮料 (3959 pcs)	香烟 (10 pcs)
肉干/豆制品/蛋 (3162 pcs)	坚果炒货 (14 pcs)
乳制品 (3030 pcs)	果冻/龟苓膏 (21 pcs)
功能饮料 (2159 pcs)	纸巾 (29 pcs)
碳酸饮料 (2173 pcs)	糖果/巧克力 (37 pcs)

汇总销量分析

热销产品	滞销产品
茶饮料 (11564 pcs)	果冻/龟苓膏 (10 pcs)
肉干/豆制品/蛋 (10434 pcs)	坚果炒货 (14 pcs)
乳制品 (8589 pcs)	纸巾 (21 pcs)
功能饮料 (7776 pcs)	其他 (29 pcs)
碳酸饮料 (6333 pcs)	香烟 (37 pcs)

由上面呈现的各个无人售货机的年度销售状况可以看到，无人售货机的热销产品与滞销产品的类别基本一致，热销类主要有茶饮料、肉干/豆制品/蛋、乳制品、功能饮料、碳酸饮料、饼干糕点、水等等，而滞销类别主要有果冻/龟苓膏、坚果炒货、纸巾、香烟、糖果巧克力等，无人售货机管理员应当依据分析结果对商品投放进行优化。

Time Analysis

以上的数据可视化都是针对无人售货机的年度销量与成交额或者是月度销量趋势进行分析，而在这一部分我们将从小时、周内时间、月份，这三个时间维度，对无人售货机的销售数据进行挖掘，为我们后面机器学习的特征提取做出铺垫。

Analysis in a day

我们从一天内的销售情况来开始入手，首先我们将原始数据按小时进行分组汇总，得到了无人售货机2017年，一天内各个小时的销售额数据，我们用柱状图的方式将其展现，并使用LOESS回归方法加入一条拟合曲线，来呈现一天内销售额的变化趋势。

day_analysis <- file1 %>% group_by(hour) %>% summarise(Value = sum(sum_pay))

day_analysis <- na.omit(day_analysis)

day_plot <- ggplot(day_analysis) + geom_bar(stat = "identity", aes(x = factor(hour), 
    y = Value, fill = factor(hour))) + geom_point(aes(x = hour, y = Value, color = hour, 
    size = Value)) + geom_smooth(aes(x = hour, y = Value), method = "auto") + theme_minimal() + 
    labs(x = "hour", y = "Value", title = "Daily Trend") + scale_y_continuous(breaks = c(5000, 
    10000, 15000, 20000, 25000)) + theme(legend.position = "none", axis.title.x = element_text(face = "italic", 
    colour = "black", size = 18), axis.title.y = element_text(face = "italic", colour = "black", 
    size = 18)) + scale_x_discrete(breaks = c(0:23), labels = c("00:00", "01:00", 
    "02:00", "03:00", "04:00", "05:00", "06:00", "07:00", "08:00", "09:00", "10:00", 
    "11:00", "12:00", "13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", 
    "20:00", "21:00", "22:00", "23:00")) + theme(axis.text.x = element_text(angle = 45, 
    size = 12))

plotly::ggplotly(day_plot)

我们对上图展开分析，由于一天当中销售额的最低值出现在上午06:00，所以我们从上午06:00开始展开分析。

上午（06:00 – 12:00）： 在06:00销售额达到最低值后，销售额开始显著攀升；在09:00达到第一个峰值后，销售额在接下来的两个小时内开始下降，直到12:00点后，销售量开始骤增到了20951元，超过了11:00销售量的两倍；
下午（13:00 – 19:00）： 在下午13:00时销售额又跌倒与上午09:00销售额上下的13718元，而随后销售额又开始了它新一轮的增长，从13:00到16:00这3个小时内，无人售货机的总销售额开始有指数增长的趋势，在下午16:00时，其销售额已经达到了一天的峰值——26869.4元，在后面的两小时中其销售额逐渐下跌，在19:00时无人售货机的销售额为16624.6元。
晚上（20:00 – 23:00）： 在晚上，除了20:00到21:00有小幅度上升，销售额的总体趋势是下降的，在23:00时，销售额下降至8056.5元。
黎明（00:00 – 05:00）： 黎明中销售额的最高值出现在00:00，最低值出现在05:00，而在00:00到05:00间并没有明显的涨幅趋势，相反的，从01:00到04:00，销售额都稳定在2800元左右，比较平稳。

Analysis in a week

接下来，我们在时间维度的分析更进一步，我们展开对无人售货机一周内销售额的分析。如下图，我们不难发现，周一与周二处于销售额最高的层次，分别有44000元与45759元的销售额；一周内销售额最低的一天出现在周三，销售额仅有36983元；而在剩余的几天中，销售额的相差不大，均在40000元上下波动。

week_analysis <- file1 %>% group_by(weekday = wday(date)) %>% summarise(Value = sum(sum_pay))

week_analysis <- na.omit(week_analysis)

week_plot <- ggplot(week_analysis) + geom_bar(stat = "identity", aes(x = factor(weekday), 
    y = Value, fill = factor(weekday))) + geom_point(aes(x = weekday, y = Value, 
    color = weekday, size = Value)) + theme_minimal() + labs(x = "weekday", y = "Value", 
    title = "Weekly Trend") + scale_y_continuous(breaks = c(10000, 20000, 30000, 
    40000, 50000)) + theme(legend.position = "none", axis.title.x = element_text(face = "italic", 
    colour = "black", size = 18), axis.title.y = element_text(face = "italic", colour = "black", 
    size = 18)) + scale_x_discrete(breaks = c(1:7), labels = c("MON", "TUE", "WES", 
    "THU", "FRI", "SAT", "SUN")) + theme(axis.text.x = element_text(size = 12))
plotly::ggplotly(week_plot)

Analysis in a month

经过前面两个时间维度的分析后，我们来到了时间维度分析的最后一步——月度趋势分析。同样的，我们选择用柱状图来绘制Monthly Plot。首先我们可以看到二月份有着最少的销售额，这与中国传统的春节应当有一定的联系，在十一月份有着一年的销售额峰值，到达了48488.88元。接下来我们可以看到，6月份、9月份、10月份、11月份、12月份都有着较高的销售额，整体来看17年下半年的销售额显著大于上半年。

month_analysis <- file1 %>% group_by(month = month) %>% summarise(Value = sum(sum_pay))

month_analysis <- na.omit(month_analysis)

month_plot <- ggplot(month_analysis) + geom_bar(stat = "identity", aes(x = factor(month), 
    y = Value, fill = factor(month))) + theme_minimal() + # geom_smooth(aes(x = month, y = Value), method = 'auto') +
labs(x = "month", y = "Value", title = "Monthly Trend") + scale_y_continuous(breaks = c(10000, 
    20000, 30000, 40000, 50000)) + theme(legend.position = "none", axis.title.x = element_text(face = "italic", 
    colour = "black", size = 18), axis.title.y = element_text(face = "italic", colour = "black", 
    size = 18)) + scale_x_discrete(breaks = c(1:12), labels = month.abb)
theme(axis.text.x = element_text(size = 12))

List of 1
 $ axis.text.x:List of 11
  ..$ family       : NULL
  ..$ face         : NULL
  ..$ colour       : NULL
  ..$ size         : num 12
  ..$ hjust        : NULL
  ..$ vjust        : NULL
  ..$ angle        : NULL
  ..$ lineheight   : NULL
  ..$ margin       : NULL
  ..$ debug        : NULL
  ..$ inherit.blank: logi FALSE
  ..- attr(*, "class")= chr [1:2] "element_text" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE

plotly::ggplotly(month_plot)

Time Series Analysis on Sales

在进行过描述性统计分析与可视化后，我们已经熟悉了无人售货机的销售数据，并且初步发现了一些重要特征与规律，现在我们开始进行建模工作。我们首先想到的是通过时间序列分析的方法，使用ARIMA模型对无人售货机的销售额进行建模。

# Data preperation
pred_data <- file1 %>% group_by(month, day) %>% summarise(Sale = sum(sum_pay))
ts <- ts(pred_data$Sale)

# Plot Series
plot(ts)

# Unit Roots Validation
library(fUnitRoots)
print(adfTest(ts))  # p stationary


Title:
 Augmented Dickey-Fuller Test

Test Results:
  PARAMETER:
    Lag Order: 1
  STATISTIC:
    Dickey-Fuller: -3.5287
  P VALUE:
    0.01 

Description:
 Thu Jan 09 16:03:37 2020 by user: Administrator

# ACF & PACF
acf(ts)

pacf(ts)

# diff ACF & PACF
acf(diff(ts))

pacf(diff(ts))

绘制销售额时间序列曲线：

通过绘制销售额时间序列曲线，我们可以初步得到销售额的整体上升趋势，为了检验该序列的平稳性，我们将对其进行单位根检验。我们通过ADF检验得到D-F统计量为-3.5287，p值为0.01，所以存在单位根，结果表明该序列平稳。

绘制ACF图与PACF图

ACF图表现为acf拖尾，PACF表现为pacf1阶截尾。

差分之后的ACF与PACF

序列差分后，acf拖尾，pacf拖尾。

通过以上分析，我们决定使用ARIMA(1,1,1)来对销售额序列进行建模。 \(Sale = 0.5633X(t) + eplison(t) - 0.9354eplison(t-1)\) 通过残差检验，残差为白噪声。

# Modeling
library(forecast)
A.fit <- auto.arima(ts)
summary(A.fit)

Series: ts 
ARIMA(1,1,1) 

Coefficients:
         ar1      ma1
      0.5633  -0.9354
s.e.  0.0629   0.0319

sigma^2 estimated as 237262:  log likelihood=-2585.99
AIC=5177.99   AICc=5178.06   BIC=5189.48

Training set error measures:
                  ME     RMSE      MAE       MPE     MAPE     MASE        ACF1
Training set 15.2709 484.9477 308.5769 -188.0387 211.1738 1.030166 -0.02602274

# 残差检验
qqnorm(A.fit$residuals)
qqline(A.fit$residuals)

for (i in 1:2) print(Box.test(A.fit$residuals, lag = 6 * i))


    Box-Pierce test

data:  A.fit$residuals
X-squared = 1.7397, df = 6, p-value = 0.942


    Box-Pierce test

data:  A.fit$residuals
X-squared = 24.457, df = 12, p-value = 0.01762

Fore <- forecast(A.fit, h = 30)
plot(Fore)

Machine Learning Models

Feature Engineering

file_join <- file1 %>% left_join(file2, by = "product") %>% select(sum_pay, location, 
    date, month, day, hour, General_Category, Sub_Category)



cut_hour = c(-1, 5, 11, 16, 21, 23)
cut_labels = c("dawn", "morning", "afternoon", "evening", "Night")
file_join <- file_join %>% mutate(hour_cut = cut(hour, breaks = cut_hour, labels = cut_labels))
file_join$hour_cut = factor(file_join$hour_cut, levels = cut_labels, labels = c(1, 
    2, 3, 4, 5))


file_join$wday <- wday(file_join$date)

file_join <- na.omit(file_join)

# One Hot Encoding

file_join <- cbind(file_join, as.data.frame(model.matrix(~General_Category - 1, file_join)), 
    as.data.frame(model.matrix(~Sub_Category - 1, file_join)), as.data.frame(model.matrix(~location - 
        1, file_join)), as.data.frame(model.matrix(~hour_cut - 1, file_join))) %>% 
    select(-General_Category, -Sub_Category, -location, -hour_cut)

# Create ML Datasets
ML_dataset <- file_join %>% group_by(month, day) %>% summarise(sum_pay = sum(sum_pay), 
    beverage = sum(General_Category饮料), non_beverage = sum(General_Category非饮料), 
    cookie = sum(Sub_Category饼干糕点), tea = sum(Sub_Category茶饮料), fast_food = sum(Sub_Category方便速食), 
    functional_drink = sum(Sub_Category功能饮料), jelly = sum(`Sub_Category果冻/龟苓膏`), 
    juice = sum(Sub_Category果蔬饮料), sea_snacks = sum(Sub_Category海味零食), nuts = sum(Sub_Category坚果炒货), 
    coffe = sum(Sub_Category咖啡), succade = sum(`Sub_Category蜜饯/果干`), puffed_food = sum(Sub_Category膨化食品), 
    others = sum(Sub_Category其他), meat_tofu_eggs = sum(`Sub_Category肉干/豆制品/蛋`), 
    dairy_product = sum(Sub_Category乳制品), water = sum(Sub_Category水), soda = sum(Sub_Category碳酸饮料), 
    candy = sum(`Sub_Category糖果/巧克力`), cigarette = sum(Sub_Category香烟), plant_protein = sum(Sub_Category植物蛋白), 
    napkin = sum(Sub_Category纸巾), locationA = sum(locationA), locationB = sum(locationB), 
    locationC = sum(locationC), locationD = sum(locationD), locationE = sum(locationE), 
    dawn = sum(hour_cut1), morning = sum(hour_cut2), afternoon = sum(hour_cut3), 
    evening = sum(hour_cut4), night = sum(hour_cut5))


ML_dataset <- ML_dataset %>% mutate(date = make_date(2017, month, day)) %>% mutate(wday = wday(date))


ML_dataset$festival <- 0

ML_dataset$festival[(ML_dataset$month == 1) & (ML_dataset$day <= 3)] <- 1

ML_dataset$festival[(ML_dataset$month == 2) & (ML_dataset$day <= 10) & (ML_dataset$day >= 
    4)] <- 1

ML_dataset$festival[(ML_dataset$month == 5) & (ML_dataset$day <= 4)] <- 1

ML_dataset$festival[(ML_dataset$month == 6) & (ML_dataset$day <= 9) & (ML_dataset$day >= 
    7)] <- 1

ML_dataset$festival[(ML_dataset$month == 9) & (ML_dataset$day <= 15) & (ML_dataset$day >= 
    13)] <- 1

ML_dataset$festival[(ML_dataset$month == 10) & (ML_dataset$day <= 7)] <- 1

ML_dataset$festival[(ML_dataset$month == 11) & (ML_dataset$day == 11)] <- 1

ML_dataset$festival[(ML_dataset$month == 12) & (ML_dataset$day <= 25) & (ML_dataset$day >= 
    24)] <- 1


dataset <- ML_dataset %>% select(-date) %>% select(sum_pay, everything())

Split Training set and Test set

"
library(caTools)
set.seed(123)
split = sample.split(dataset$sum_pay, SplitRatio = 0.8) #Ture for Trainning Set; False for Test Set
training_set = subset(dataset, split == T)
test_set = subset(dataset, split == F)
"

[1] "\nlibrary(caTools)\nset.seed(123)\nsplit = sample.split(dataset$sum_pay, SplitRatio = 0.8) #Ture for Trainning Set; False for Test Set\ntraining_set = subset(dataset, split == T)\ntest_set = subset(dataset, split == F)\n"

set.seed(123)
index <- sample.int(dim(dataset)[1], 0.8 * dim(dataset)[1])
training_set = dataset[index, ]
test_set = dataset[-index, ]

RandomForest

library(caret)
regressor_rf <- train(sum_pay ~., data = training_set, method = 'rf',#使用随机森林建模
                  trControl = trainControl(method = 'cv', 
                                           number = 5, #选择五折交叉验证
                                           selectionFunction = 'oneSE'#OneSE选择最好最简单的
                  ),
                  importance=T)

y_pred <- predict(regressor_rf, test_set[,-1])

errors = abs(y_pred - test_set$sum_pay)

print(mean(errors)) # 70.68472

[1] 51.36162

plot(varImp(regressor_rf))

Feature Selection

rfeControls <- rfeControl(functions = rfFuncs, method = "cv", number = 5)
results <- rfe(dataset[, -1], dataset$sum_pay, sizes = c(1:36), rfeControl = rfeControls)
predictors(results)

 [1] "beverage"         "dairy_product"    "non_beverage"     "locationE"       
 [5] "meat_tofu_eggs"   "locationB"        "evening"          "tea"             
 [9] "cookie"           "afternoon"        "puffed_food"      "fast_food"       
[13] "morning"          "locationD"        "locationC"        "locationA"       
[17] "juice"            "soda"             "water"            "functional_drink"

plot(results, type = c("g", "o"))

as.data.frame(results$optVariables)

   results$optVariables
1              beverage
2         dairy_product
3          non_beverage
4             locationE
5        meat_tofu_eggs
6             locationB
7               evening
8                   tea
9                cookie
10            afternoon
11          puffed_food
12            fast_food
13              morning
14            locationD
15            locationC
16            locationA
17                juice
18                 soda
19                water
20     functional_drink

vars <- c('sum_pay', results$optVariables) # 构建新数据的变量
regressor_2 <- train(x = training_set[vars][-1], 
                     y = training_set$sum_pay, method = "rf",
                                  trControl = trainControl(method = 'cv', 
                                                           number = 5, #选择五折交叉验证
                                                           selectionFunction = 'oneSE'),
                     importance=T)
y_pred_2 <- predict(regressor_2, test_set[,-1])
errors_2 <- abs(y_pred_2 - test_set$sum_pay)
print(mean(errors_2)) # 63.57884 有提高！

[1] 46.63185

plot(varImp(regressor_2))

Reference

[1] 使用R语言进行机器学习特征选择 https://www.jianshu.com/p/3ce79b6f371a

[2] R语言中的one-hot编码实战 https://blog.csdn.net/Ron_Lee_sdj/article/details/80772009

[3] 【机器学习】时序数据处理 https://blog.csdn.net/qq_34105362/article/details/85257254

[4] 机器学习与时间序列预测https://www.jianshu.com/p/e81ab6846214

Vending Machine