KNN Analysis on 5000 and 500000 Sales Records

In this homework, I use the 5000-record sales file as my small data set and the 500000-record sales file as my large data set. First, let me import the data and do some data exploration.

Importing the Data

# Show the current working directory, switch to the homework folder, and
# confirm the change.
print(getwd())
## [1] "C:/Users/linwe/Desktop/Data 622/HW1"
# NOTE(review): this is a machine-specific absolute path; the script runs
# unchanged only where this directory exists.
setwd("C:/Users/linwe/Desktop/Data 622/HW1")
print(getwd())
## [1] "C:/Users/linwe/Desktop/Data 622/HW1"
# Read both CSV files by relative name, so they are resolved against the
# working directory set above. data1 is the 5000-record file and data2 is the
# 500000-record file.
data1 <- read.csv("5000 Sales Records.csv")
data2 <- read.csv("500000 Sales Records.csv")

Data Exploration

head(data1)
##                              Region              Country     Item.Type
## 1 Central America and the Caribbean Antigua and Barbuda      Baby Food
## 2 Central America and the Caribbean               Panama        Snacks
## 3                            Europe       Czech Republic     Beverages
## 4                              Asia          North Korea        Cereal
## 5                              Asia            Sri Lanka        Snacks
## 6      Middle East and North Africa              Morocco Personal Care
##   Sales.Channel Order.Priority Order.Date  Order.ID  Ship.Date Units.Sold
## 1        Online              M 12/20/2013 957081544  1/11/2014        552
## 2       Offline              C   7/5/2010 301644504  7/26/2010       2167
## 3       Offline              C  9/12/2011 478051030  9/29/2011       4778
## 4       Offline              L  5/13/2010 892599952  6/15/2010       9016
## 5       Offline              C  7/20/2015 571902596  7/27/2015       7542
## 6       Offline              L  11/8/2010 412882792 11/22/2010         48
##   Unit.Price Unit.Cost Total.Revenue Total.Cost Total.Profit
## 1     255.28    159.42     140914.56   87999.84     52914.72
## 2     152.58     97.44     330640.86  211152.48    119488.38
## 3      47.45     31.79     226716.10  151892.62     74823.48
## 4     205.70    117.11    1854591.20 1055863.76    798727.44
## 5     152.58     97.44    1150758.36  734892.48    415865.88
## 6      81.73     56.67       3923.04    2720.16      1202.88
print(ncol(data1))
## [1] 14
print(nrow(data1))
## [1] 5000
names(data1)
##  [1] "Region"         "Country"        "Item.Type"      "Sales.Channel" 
##  [5] "Order.Priority" "Order.Date"     "Order.ID"       "Ship.Date"     
##  [9] "Units.Sold"     "Unit.Price"     "Unit.Cost"      "Total.Revenue" 
## [13] "Total.Cost"     "Total.Profit"
str(data1)
## 'data.frame':    5000 obs. of  14 variables:
##  $ Region        : chr  "Central America and the Caribbean" "Central America and the Caribbean" "Europe" "Asia" ...
##  $ Country       : chr  "Antigua and Barbuda " "Panama" "Czech Republic" "North Korea" ...
##  $ Item.Type     : chr  "Baby Food" "Snacks" "Beverages" "Cereal" ...
##  $ Sales.Channel : chr  "Online" "Offline" "Offline" "Offline" ...
##  $ Order.Priority: chr  "M" "C" "C" "L" ...
##  $ Order.Date    : chr  "12/20/2013" "7/5/2010" "9/12/2011" "5/13/2010" ...
##  $ Order.ID      : int  957081544 301644504 478051030 892599952 571902596 412882792 932776868 919133651 579814469 192993152 ...
##  $ Ship.Date     : chr  "1/11/2014" "7/26/2010" "9/29/2011" "6/15/2010" ...
##  $ Units.Sold    : int  552 2167 4778 9016 7542 48 8258 927 8841 9817 ...
##  $ Unit.Price    : num  255.3 152.6 47.5 205.7 152.6 ...
##  $ Unit.Cost     : num  159.4 97.4 31.8 117.1 97.4 ...
##  $ Total.Revenue : num  140915 330641 226716 1854591 1150758 ...
##  $ Total.Cost    : num  88000 211152 151893 1055864 734892 ...
##  $ Total.Profit  : num  52915 119488 74823 798727 415866 ...
# Convert the categorical text columns to factors and parse the two date
# columns (stored as month/day/four-digit-year strings) into Date objects.
data1 <- within(data1, {
  Region <- as.factor(Region)
  Country <- as.factor(Country)
  Item.Type <- as.factor(Item.Type)
  Sales.Channel <- as.factor(Sales.Channel)
  Order.Priority <- as.factor(Order.Priority)
  Order.Date <- as.Date(Order.Date, "%m/%d/%Y")
  Ship.Date <- as.Date(Ship.Date, "%m/%d/%Y")
})

summary(data1)
##                                Region                         Country    
##  Asia                             : 719   Andorra                 :  40  
##  Australia and Oceania            : 416   San Marino              :  40  
##  Central America and the Caribbean: 534   Ghana                   :  38  
##  Europe                           :1330   Mauritius               :  38  
##  Middle East and North Africa     : 610   United States of America:  38  
##  North America                    : 106   Tonga                   :  37  
##  Sub-Saharan Africa               :1285   (Other)                 :4769  
##            Item.Type    Sales.Channel  Order.Priority   Order.Date        
##  Beverages      : 447   Offline:2504   C:1174         Min.   :2010-01-01  
##  Fruits         : 447   Online :2496   H:1278         1st Qu.:2011-12-08  
##  Baby Food      : 445                  L:1227         Median :2013-10-23  
##  Cosmetics      : 424                  M:1321         Mean   :2013-10-19  
##  Household      : 424                                 3rd Qu.:2015-09-08  
##  Office Supplies: 420                                 Max.   :2017-07-28  
##  (Other)        :2393                                                     
##     Order.ID           Ship.Date            Units.Sold     Unit.Price    
##  Min.   :100090873   Min.   :2010-01-06   Min.   :   2   Min.   :  9.33  
##  1st Qu.:320104217   1st Qu.:2012-01-06   1st Qu.:2453   1st Qu.: 81.73  
##  Median :552314960   Median :2013-11-14   Median :5123   Median :154.06  
##  Mean   :548644737   Mean   :2013-11-13   Mean   :5031   Mean   :265.75  
##  3rd Qu.:768770944   3rd Qu.:2015-10-03   3rd Qu.:7576   3rd Qu.:437.20  
##  Max.   :999879729   Max.   :2017-08-31   Max.   :9999   Max.   :668.27  
##                                                                          
##    Unit.Cost      Total.Revenue       Total.Cost       Total.Profit      
##  Min.   :  6.92   Min.   :     65   Min.   :     48   Min.   :     16.9  
##  1st Qu.: 35.84   1st Qu.: 257417   1st Qu.: 154748   1st Qu.:  85339.3  
##  Median : 97.44   Median : 779409   Median : 468181   Median : 279095.2  
##  Mean   :187.49   Mean   :1325738   Mean   : 933093   Mean   : 392644.6  
##  3rd Qu.:263.33   3rd Qu.:1839975   3rd Qu.:1189578   3rd Qu.: 565106.4  
##  Max.   :524.96   Max.   :6672676   Max.   :5248025   Max.   :1726007.5  
## 

After some basic data exploration, we can see that the sales records are randomly generated. The data set contains the variables Region, Country, Item Type, Sales Channel, Order Priority, Order Date, Order ID, Ship Date, Units Sold, Unit Price, Unit Cost, Total Revenue, Total Cost and Total Profit. Some variables are character and some are numeric. One good thing is that the data set has no missing data, so there is no need to handle missing values.

Selecting Algorithms

I’ve decided to take Sales.Channel as the response variable and try to predict whether an order will be placed online or offline using the other variables in the data set. The two algorithms I wanted to practice on are K-Nearest Neighbors (KNN) and Naive Bayes. KNN is a relatively simple, easy-to-implement supervised machine learning algorithm. KNN assumes that similar things occur in close proximity, and it determines the outcome from the values of the nearest neighbors. Naive Bayes is another classifier, which uses a table of probabilities to estimate the likelihood that an instance belongs to a particular class. This algorithm uses the probability of prior events to estimate the probability of future events.

For this homework, I would like to use KNN to analyze the sales records. The data set is well balanced and contains no missing data. With KNN, I also don’t need to make assumptions about the underlying data distribution. KNN is straightforward and effective.

Install Packages

install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/linwe/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\linwe\AppData\Local\Temp\RtmpYlRA6t\downloaded_packages
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/linwe/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\linwe\Documents\R\win-library\4.1\00LOCK\dplyr\libs\x64\dplyr.dll to C:
## \Users\linwe\Documents\R\win-library\4.1\dplyr\libs\x64\dplyr.dll: Permission
## denied
## Warning: restored 'dplyr'
## 
## The downloaded binary packages are in
##  C:\Users\linwe\AppData\Local\Temp\RtmpYlRA6t\downloaded_packages
install.packages("class", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/linwe/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'class' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'class'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying C:
## \Users\linwe\Documents\R\win-library\4.1\00LOCK\class\libs\x64\class.dll to C:
## \Users\linwe\Documents\R\win-library\4.1\class\libs\x64\class.dll: Permission
## denied
## Warning: restored 'class'
## 
## The downloaded binary packages are in
##  C:\Users\linwe\AppData\Local\Temp\RtmpYlRA6t\downloaded_packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(class)

KNN on 5000 sales records

Now that I have my data, I would like to use the numeric variables in the data set as predictors of the response.

# Keep only the six numeric measures and the Sales.Channel response;
# every other column is dropped.
data1 <- select(
  data1,
  Units.Sold, Unit.Price, Unit.Cost,
  Total.Revenue, Total.Cost, Total.Profit,
  Sales.Channel
)
summary(data1)
##    Units.Sold     Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :   2   Min.   :  9.33   Min.   :  6.92   Min.   :     65  
##  1st Qu.:2453   1st Qu.: 81.73   1st Qu.: 35.84   1st Qu.: 257417  
##  Median :5123   Median :154.06   Median : 97.44   Median : 779409  
##  Mean   :5031   Mean   :265.75   Mean   :187.49   Mean   :1325738  
##  3rd Qu.:7576   3rd Qu.:437.20   3rd Qu.:263.33   3rd Qu.:1839975  
##  Max.   :9999   Max.   :668.27   Max.   :524.96   Max.   :6672676  
##    Total.Cost       Total.Profit       Sales.Channel 
##  Min.   :     48   Min.   :     16.9   Offline:2504  
##  1st Qu.: 154748   1st Qu.:  85339.3   Online :2496  
##  Median : 468181   Median : 279095.2                 
##  Mean   : 933093   Mean   : 392644.6                 
##  3rd Qu.:1189578   3rd Qu.: 565106.4                 
##  Max.   :5248025   Max.   :1726007.5

Normalizing the Data

Variables with larger values or wider ranges will disproportionately affect the distance calculation, so it is important to normalize the variables, rescaling each one to the range 0 to 1, before using KNN.

#' Min-max rescale a numeric vector to the interval [0, 1].
#'
#' Each value is mapped to (x - min) / (max - min).
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, missing values are ignored when locating the minimum
#'   and maximum; they stay NA in the result. The default, FALSE, keeps the
#'   original behaviour, where any NA makes the whole result NA.
#' @return A double vector of the same length as `x`. Zero-length input gives
#'   `numeric(0)`. Input whose minimum equals its maximum gives all zeros
#'   instead of the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  # min()/max() warn on empty input, so answer directly.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant column: subtracting the minimum gives 0 for every observed
    # value and keeps NA where the input was NA.
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}
# Rescale each numeric measure with normalize(); Sales.Channel is not touched.
data1 <- mutate(
  data1,
  across(
    c(
      Units.Sold, Unit.Price, Unit.Cost,
      Total.Revenue, Total.Cost, Total.Profit
    ),
    normalize
  )
)
summary(data1)
##    Units.Sold       Unit.Price       Unit.Cost       Total.Revenue    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.2452   1st Qu.:0.1099   1st Qu.:0.05583   1st Qu.:0.03857  
##  Median :0.5123   Median :0.2196   Median :0.17474   Median :0.11680  
##  Mean   :0.5030   Mean   :0.3891   Mean   :0.34857   Mean   :0.19867  
##  3rd Qu.:0.7577   3rd Qu.:0.6493   3rd Qu.:0.49496   3rd Qu.:0.27574  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##    Total.Cost       Total.Profit     Sales.Channel 
##  Min.   :0.00000   Min.   :0.00000   Offline:2504  
##  1st Qu.:0.02948   1st Qu.:0.04943   Online :2496  
##  Median :0.08920   Median :0.16169                 
##  Mean   :0.17779   Mean   :0.22748                 
##  3rd Qu.:0.22666   3rd Qu.:0.32740                 
##  Max.   :1.00000   Max.   :1.00000

The statistical summary shows that all of our features now have values between 0 and 1.

Splitting and Balancing the Data

I split the data set into training and test sets, using 75% of the data as training examples and 25% as test data.

data1 <- data.frame(data1)

# Fix the seed so the random 75/25 partition is reproducible.
set.seed(1234)
sample_index <- sample(
  nrow(data1),
  round(nrow(data1) * .75),
  replace = FALSE
)

# Sampled rows form the training set; the remaining rows form the test set.
data1_train <- data1[sample_index, ]
data1_test <- data1[-sample_index, ]

# Pull the response out as the class labels ...
data1_train_lables <- as.factor(data1_train$Sales.Channel)
data1_test_lables <- as.factor(data1_test$Sales.Channel)

# ... and leave only the predictor columns in the train/test frames.
data1_train <- data1_train[setdiff(names(data1_train), "Sales.Channel")]
data1_test <- data1_test[setdiff(names(data1_test), "Sales.Channel")]

Building the Model

# Classify every test row from its 5 nearest training rows.
data1_pred <- knn(data1_train, data1_test, cl = data1_train_lables, k = 5)
head(data1_pred)
## [1] Offline Offline Online  Online  Offline Offline
## Levels: Offline Online

The output shows the predicted response for the first six instances in the test data set.

Evaluating the Model

data1_pred_table <- table(data1_test_lables, data1_pred)
data1_pred_table
##                  data1_pred
## data1_test_lables Offline Online
##           Offline     310    310
##           Online      300    330
sum(diag(data1_pred_table))/nrow(data1_test)
## [1] 0.512

The results show that the predictive accuracy is 51.2%. I will vary the value of K to see if I can improve the performance of my model, trying one K smaller than 5 and one K larger than 5.

# Same classification with a single nearest neighbour (k = 1).
data1_pred2 <- knn(data1_train, data1_test, cl = data1_train_lables, k = 1)
head(data1_pred2)
## [1] Online  Offline Online  Offline Online  Offline
## Levels: Offline Online
data1_pred2_table <- table(data1_test_lables, data1_pred2)
data1_pred2_table
##                  data1_pred2
## data1_test_lables Offline Online
##           Offline     282    338
##           Online      314    316
sum(diag(data1_pred2_table))/nrow(data1_test)
## [1] 0.4784
# Same classification with a larger neighbourhood (k = 10).
data1_pred3 <- knn(data1_train, data1_test, cl = data1_train_lables, k = 10)
head(data1_pred3)
## [1] Offline Offline Offline Online  Online  Offline
## Levels: Offline Online
data1_pred3_table <- table(data1_test_lables, data1_pred3)
data1_pred3_table
##                  data1_pred3
## data1_test_lables Offline Online
##           Offline     308    312
##           Online      301    329
sum(diag(data1_pred3_table))/nrow(data1_test)
## [1] 0.5096

After trying different K values, the accuracy does not improve. Overall, the predictive accuracy is 51.2% at K=5, 47.8% at K=1 and 51.0% at K=10. Now let me try KNN on the 500000 Sales Records data set to see if the algorithm can do better.

head(data2)
##                         Region          Country Item.Type Sales.Channel
## 1           Sub-Saharan Africa     South Africa    Fruits       Offline
## 2 Middle East and North Africa          Morocco   Clothes        Online
## 3        Australia and Oceania Papua New Guinea      Meat       Offline
## 4           Sub-Saharan Africa         Djibouti   Clothes       Offline
## 5                       Europe         Slovakia Beverages       Offline
## 6                         Asia        Sri Lanka    Fruits        Online
##   Order.Priority Order.Date  Order.ID  Ship.Date Units.Sold Unit.Price
## 1              M  7/27/2012 443368995  7/28/2012       1593       9.33
## 2              M  9/14/2013 667593514 10/19/2013       4611     109.28
## 3              M  5/15/2015 940995585   6/4/2015        360     421.89
## 4              H  5/17/2017 880811536   7/2/2017        562     109.28
## 5              L 10/26/2016 174590194  12/4/2016       3973      47.45
## 6              L  11/7/2011 830192887 12/18/2011       1379       9.33
##   Unit.Cost Total.Revenue Total.Cost Total.Profit
## 1      6.92      14862.69   11023.56      3839.13
## 2     35.84     503890.08  165258.24    338631.84
## 3    364.69     151880.40  131288.40     20592.00
## 4     35.84      61415.36   20142.08     41273.28
## 5     31.79     188518.85  126301.67     62217.18
## 6      6.92      12866.07    9542.68      3323.39
# Convert the categorical text columns to factors and parse the two date
# columns (stored as month/day/four-digit-year strings) into Date objects.
data2 <- within(data2, {
  Region <- as.factor(Region)
  Country <- as.factor(Country)
  Item.Type <- as.factor(Item.Type)
  Sales.Channel <- as.factor(Sales.Channel)
  Order.Priority <- as.factor(Order.Priority)
  Order.Date <- as.Date(Order.Date, "%m/%d/%Y")
  Ship.Date <- as.Date(Ship.Date, "%m/%d/%Y")
})

summary(data2)
##                                Region              Country      
##  Asia                             : 72958   Cape Verde :  2840  
##  Australia and Oceania            : 40508   Guinea     :  2805  
##  Central America and the Caribbean: 53964   Liberia    :  2805  
##  Europe                           :129286   Singapore  :  2804  
##  Middle East and North Africa     : 62020   New Zealand:  2797  
##  North America                    : 10842   Malta      :  2791  
##  Sub-Saharan Africa               :130422   (Other)    :483158  
##          Item.Type      Sales.Channel    Order.Priority   Order.Date        
##  Personal Care: 41789   Offline:250161   C:125042       Min.   :2010-01-01  
##  Cosmetics    : 41717   Online :249839   H:124987       1st Qu.:2011-11-23  
##  Snacks       : 41706                    L:125138       Median :2013-10-15  
##  Clothes      : 41689                    M:124833       Mean   :2013-10-15  
##  Fruits       : 41684                                   3rd Qu.:2015-09-08  
##  Meat         : 41673                                   Max.   :2017-07-28  
##  (Other)      :249742                                                       
##     Order.ID           Ship.Date            Units.Sold      Unit.Price    
##  Min.   :100002896   Min.   :2010-01-01   Min.   :    1   Min.   :  9.33  
##  1st Qu.:325181424   1st Qu.:2011-12-18   1st Qu.: 2502   1st Qu.: 81.73  
##  Median :549184286   Median :2013-11-09   Median : 4999   Median :154.06  
##  Mean   :550131907   Mean   :2013-11-09   Mean   : 4999   Mean   :266.04  
##  3rd Qu.:775629138   3rd Qu.:2015-10-03   3rd Qu.: 7497   3rd Qu.:421.89  
##  Max.   :999999463   Max.   :2017-09-16   Max.   :10000   Max.   :668.27  
##                                                                           
##    Unit.Cost      Total.Revenue       Total.Cost       Total.Profit      
##  Min.   :  6.92   Min.   :      9   Min.   :      7   Min.   :      2.4  
##  1st Qu.: 56.67   1st Qu.: 278306   1st Qu.: 162060   1st Qu.:  95385.1  
##  Median : 97.44   Median : 786243   Median : 467712   Median : 281749.2  
##  Mean   :187.53   Mean   :1330096   Mean   : 937616   Mean   : 392480.0  
##  3rd Qu.:263.33   3rd Qu.:1824236   3rd Qu.:1198736   3rd Qu.: 565392.3  
##  Max.   :524.96   Max.   :6682700   Max.   :5249600   Max.   :1738700.0  
## 
# Keep only the six numeric measures and the Sales.Channel response;
# every other column is dropped.
data2 <- select(
  data2,
  Units.Sold, Unit.Price, Unit.Cost,
  Total.Revenue, Total.Cost, Total.Profit,
  Sales.Channel
)
summary(data2)
##    Units.Sold      Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :    1   Min.   :  9.33   Min.   :  6.92   Min.   :      9  
##  1st Qu.: 2502   1st Qu.: 81.73   1st Qu.: 56.67   1st Qu.: 278306  
##  Median : 4999   Median :154.06   Median : 97.44   Median : 786243  
##  Mean   : 4999   Mean   :266.04   Mean   :187.53   Mean   :1330096  
##  3rd Qu.: 7497   3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1824236  
##  Max.   :10000   Max.   :668.27   Max.   :524.96   Max.   :6682700  
##    Total.Cost       Total.Profit       Sales.Channel   
##  Min.   :      7   Min.   :      2.4   Offline:250161  
##  1st Qu.: 162060   1st Qu.:  95385.1   Online :249839  
##  Median : 467712   Median : 281749.2                   
##  Mean   : 937616   Mean   : 392480.0                   
##  3rd Qu.:1198736   3rd Qu.: 565392.3                   
##  Max.   :5249600   Max.   :1738700.0
# Normalizing the data
#' Min-max rescale a numeric vector to the interval [0, 1].
#'
#' Each value is mapped to (x - min) / (max - min).
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, missing values are ignored when locating the minimum
#'   and maximum; they stay NA in the result. The default, FALSE, keeps the
#'   original behaviour, where any NA makes the whole result NA.
#' @return A double vector of the same length as `x`. Zero-length input gives
#'   `numeric(0)`. Input whose minimum equals its maximum gives all zeros
#'   instead of the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  # min()/max() warn on empty input, so answer directly.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant column: subtracting the minimum gives 0 for every observed
    # value and keeps NA where the input was NA.
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}

# Rescale each numeric measure with normalize() so that its values lie
# between 0 and 1; Sales.Channel is not touched.
data2 <- mutate(
  data2,
  across(
    c(
      Units.Sold, Unit.Price, Unit.Cost,
      Total.Revenue, Total.Cost, Total.Profit
    ),
    normalize
  )
)

summary(data2)
##    Units.Sold       Unit.Price       Unit.Cost       Total.Revenue    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.2501   1st Qu.:0.1099   1st Qu.:0.09604   1st Qu.:0.04164  
##  Median :0.4999   Median :0.2196   Median :0.17474   Median :0.11765  
##  Mean   :0.4999   Mean   :0.3896   Mean   :0.34864   Mean   :0.19903  
##  3rd Qu.:0.7497   3rd Qu.:0.6261   3rd Qu.:0.49496   3rd Qu.:0.27298  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##    Total.Cost       Total.Profit     Sales.Channel   
##  Min.   :0.00000   Min.   :0.00000   Offline:250161  
##  1st Qu.:0.03087   1st Qu.:0.05486   Online :249839  
##  Median :0.08909   Median :0.16204                   
##  Mean   :0.17861   Mean   :0.22573                   
##  3rd Qu.:0.22835   3rd Qu.:0.32518                   
##  Max.   :1.00000   Max.   :1.00000
# The statistical summary shows that our features all now have values within the range of 0 and 1

# Splitting and Balancing the Data: to split the data using 75:25 ratio. Let me convert the data into a data frame
data2 <- data.frame(data2)

# Fix the seed so the random 75/25 partition is reproducible.
set.seed(1234)
sample_index2 <- sample(
  nrow(data2),
  round(nrow(data2) * .75),
  replace = FALSE
)

# Sampled rows form the training set; the remaining rows form the test set.
data2_train <- data2[sample_index2, ]
data2_test <- data2[-sample_index2, ]

# Pull the response out as the class labels ...
data2_train_lables <- as.factor(data2_train$Sales.Channel)
data2_test_lables <- as.factor(data2_test$Sales.Channel)

# ... and leave only the predictor columns in the train/test frames.
data2_train <- data2_train[setdiff(names(data2_train), "Sales.Channel")]
data2_test <- data2_test[setdiff(names(data2_test), "Sales.Channel")]


# Building the Model

# Classify every test row from its 5 nearest training rows.
data2_pred <- knn(data2_train, data2_test, cl = data2_train_lables, k = 5)
head(data2_pred)
## [1] Online  Online  Online  Online  Offline Online 
## Levels: Offline Online
# Evaluating the Model
data2_pred_table <- table(data2_test_lables, data2_pred)
data2_pred_table
##                  data2_pred
## data2_test_lables Offline Online
##           Offline   29908  32456
##           Online    32682  29954
sum(diag(data2_pred_table))/nrow(data2_test)
## [1] 0.478896
# Improve the Model
# Same classification with a single nearest neighbour (k = 1).
data2_pred2 <- knn(data2_train, data2_test, cl = data2_train_lables, k = 1)
head(data2_pred2)
## [1] Online  Online  Online  Online  Offline Online 
## Levels: Offline Online
data2_pred2_table <- table(data2_test_lables, data2_pred2)
data2_pred2_table
##                  data2_pred2
## data2_test_lables Offline Online
##           Offline   29171  33193
##           Online    33522  29114
sum(diag(data2_pred2_table))/nrow(data2_test)
## [1] 0.46628
# make k bigger
# Same classification with a larger neighbourhood (k = 10).
data2_pred3 <- knn(data2_train, data2_test, cl = data2_train_lables, k = 10)
head(data2_pred3)
## [1] Online  Online  Online  Online  Offline Online 
## Levels: Offline Online
data2_pred3_table <- table(data2_test_lables, data2_pred3)
data2_pred3_table
##                  data2_pred3
## data2_test_lables Offline Online
##           Offline   30497  31867
##           Online    32537  30099
sum(diag(data2_pred3_table))/nrow(data2_test)
## [1] 0.484768

For the 500000 Sales Records data set, KNN achieves 47.9% accuracy at K=5, 46.6% at K=1 and 48.5% at K=10.

Conclusion

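For the KNN algorithm, the size of the data set does not seem to affect the accuracy of the model. In every run the accuracy stays close to 50%, which is what guessing would achieve on a balanced two-class response; this is consistent with the records being randomly generated, so the numeric predictors carry little or no information about the sales channel. I probably won’t use the KNN algorithm for business decisions based on this sales records data set. But I think it was good practice, and I will probably select a different algorithm for the analysis next time.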