Amazon Orders 2021-01-01 through 2022-08-14

#load libraries

library(float)
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.1
library(arules)
## Warning: package 'arules' was built under R version 4.2.1
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.1
library(janitor)
## Warning: package 'janitor' was built under R version 4.2.1
#read data
amazonorders <- read_csv("D:/Al/DataMining Final Project/amazonorders.csv")

#summarize data
head(amazonorders)
## # A tibble: 6 × 36
##   Order_Date Order_ID Title           Category `ASIN/ISBN` `UNSPSC Code` Website
##   <date>        <dbl> <chr>           <chr>    <chr>               <dbl> <chr>  
## 1 2021-01-01  1.13e16 "PawMedica Glu… ANIMAL_… B085W81GTG       50500000 Amazon…
## 2 2021-01-01  1.13e16 "ULTRAIDEAS Wo… SLIPPER  B07RSPVR8H       53110000 Amazon…
## 3 2021-01-01  1.13e16 "ULTRAIDEAS Wo… SLIPPER  B07RSPTGGW       53110000 Amazon…
## 4 2021-01-01  1.13e16 "Leather Desk … DESK_PAD B084FFRPLX       44000000 Amazon…
## 5 2021-01-01  1.13e16 "Car Cup Holde… CUP_HOL… B082VB76QX       43191600 Amazon…
## 6 2021-01-01  1.14e16 "Arctic Paw Wo… HAT      B07HRGMV3C       53100000 Amazon…
## # … with 29 more variables: Release_Date <dttm>, Condition <chr>, Seller <chr>,
## #   Seller_Credentials <chr>, List_Price_Per_Unit <dbl>,
## #   Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## #   Payment_Instrument_Type <chr>, Purchase_Order_Number <lgl>,
## #   PO_Line_Number <lgl>, Ordering_Customer_Email <chr>, Shipment_Date <date>,
## #   Shipping_Address_Name <chr>, Shipping_Address_Street_1 <chr>,
## #   Shipping_Address_Street_2 <chr>, Shipping_Address_City <chr>, …
library(janitor)
#SCRUB DATA: delete columns
remove_empty(amazonorders, which = c("cols"), quiet = TRUE)
## # A tibble: 1,016 × 32
##    Order_Date Order_ID Title          Category `ASIN/ISBN` `UNSPSC Code` Website
##    <date>        <dbl> <chr>          <chr>    <chr>               <dbl> <chr>  
##  1 2021-01-01  1.13e16 "PawMedica Gl… ANIMAL_… B085W81GTG       50500000 Amazon…
##  2 2021-01-01  1.13e16 "ULTRAIDEAS W… SLIPPER  B07RSPVR8H       53110000 Amazon…
##  3 2021-01-01  1.13e16 "ULTRAIDEAS W… SLIPPER  B07RSPTGGW       53110000 Amazon…
##  4 2021-01-01  1.13e16 "Leather Desk… DESK_PAD B084FFRPLX       44000000 Amazon…
##  5 2021-01-01  1.13e16 "Car Cup Hold… CUP_HOL… B082VB76QX       43191600 Amazon…
##  6 2021-01-01  1.14e16 "Arctic Paw W… HAT      B07HRGMV3C       53100000 Amazon…
##  7 2021-01-01  1.14e16 "Arctic Paw W… HAT      B071RDGY5Z       53100000 Amazon…
##  8 2021-01-01  1.14e16 "Lifewit Larg… STORAGE… B07PCNGJP8       44111500 Amazon…
##  9 2021-01-01  1.14e16 "1000 Synthet… PROTECT… B08MVCRCPJ       46181500 Amazon…
## 10 2021-01-08  1.14e16 "Henry’s Hous… COFFEE   B071CZV4LN       50201706 Amazon…
## # … with 1,006 more rows, and 25 more variables: Release_Date <dttm>,
## #   Condition <chr>, Seller <chr>, Seller_Credentials <chr>,
## #   List_Price_Per_Unit <dbl>, Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## #   Payment_Instrument_Type <chr>, Ordering_Customer_Email <chr>,
## #   Shipment_Date <date>, Shipping_Address_Name <chr>,
## #   Shipping_Address_Street_1 <chr>, Shipping_Address_Street_2 <chr>,
## #   Shipping_Address_City <chr>, Shipping_Address_State <chr>, …
#SCRUB DATA: remove irrelevant columns
edit_amazonorders <-select(amazonorders,-one_of(c("Release_Date")))
edit_amazonorders <-select(amazonorders,-one_of(c("Purchase_Order_Number")))
edit_amazonorders <-select(amazonorders,-one_of(c("PO_Line_Number")))
edit_amazonorders <-select(amazonorders,-one_of(c("Tax_Exemption_Type")))
edit_amazonorders <-select(amazonorders,-one_of(c("Group_Name")))
edit_amazonorders <-select(amazonorders,-one_of(c("Ordering_Customer_Email")))
edit_amazonorders <-select(amazonorders,-one_of(c("Buyer_Name")))
edit_amazonorders <-select(amazonorders,-one_of(c("Shipping_Address_Street_2")))
edit_amazonorders <-select(amazonorders,-one_of(c("Website")))
edit_amazonorders <-select(amazonorders,-one_of(c("Condition")))
edit_amazonorders <-select(amazonorders,-one_of(c("ASIN/ISBN")))
edit_amazonorders <-select(amazonorders,-one_of(c("UNSPSC Code")))



#summarize edited data
summary(edit_amazonorders)
##    Order_Date            Order_ID            Title             Category        
##  Min.   :2021-01-01   Min.   :1.110e+16   Length:1016        Length:1016       
##  1st Qu.:2021-07-13   1st Qu.:1.122e+16   Class :character   Class :character  
##  Median :2021-11-28   Median :1.135e+16   Mode  :character   Mode  :character  
##  Mean   :2021-11-13   Mean   :1.132e+16                                        
##  3rd Qu.:2022-03-18   3rd Qu.:1.143e+16                                        
##  Max.   :2022-08-14   Max.   :1.150e+16                                        
##                                                                                
##   ASIN/ISBN           Website           Release_Date                
##  Length:1016        Length:1016        Min.   :2016-12-01 00:00:01  
##  Class :character   Class :character   1st Qu.:2019-05-01 18:00:01  
##  Mode  :character   Mode  :character   Median :2020-10-11 12:00:01  
##                                        Mean   :2020-03-09 16:48:01  
##                                        3rd Qu.:2021-03-23 06:00:01  
##                                        Max.   :2021-09-30 00:00:01  
##                                        NA's   :1006                 
##   Condition            Seller          Seller_Credentials List_Price_Per_Unit
##  Length:1016        Length:1016        Length:1016        Min.   :  0.00     
##  Class :character   Class :character   Class :character   1st Qu.:  0.00     
##  Mode  :character   Mode  :character   Mode  :character   Median : 12.99     
##                                                           Mean   : 25.03     
##                                                           3rd Qu.: 29.99     
##                                                           Max.   :999.10     
##                                                                              
##  Purchase_Price_Per_Unit    Quantity     Payment_Instrument_Type
##  Min.   :  2.99          Min.   :0.000   Length:1016            
##  1st Qu.: 10.77          1st Qu.:1.000   Class :character       
##  Median : 18.00          Median :1.000   Mode  :character       
##  Mean   : 23.69          Mean   :1.124                          
##  3rd Qu.: 25.00          3rd Qu.:1.000                          
##  Max.   :449.25          Max.   :6.000                          
##                                                                 
##  Purchase_Order_Number PO_Line_Number Ordering_Customer_Email
##  Mode:logical          Mode:logical   Length:1016            
##  NA's:1016             NA's:1016      Class :character       
##                                       Mode  :character       
##                                                              
##                                                              
##                                                              
##                                                              
##  Shipment_Date        Shipping_Address_Name Shipping_Address_Street_1
##  Min.   :2021-01-02   Length:1016           Length:1016              
##  1st Qu.:2021-07-18   Class :character      Class :character         
##  Median :2021-11-29   Mode  :character      Mode  :character         
##  Mean   :2021-11-12                                                  
##  3rd Qu.:2022-03-18                                                  
##  Max.   :2022-08-08                                                  
##  NA's   :10                                                          
##  Shipping_Address_Street_2 Shipping_Address_City Shipping_Address_State
##  Length:1016               Length:1016           Length:1016           
##  Class :character          Class :character      Class :character      
##  Mode  :character          Mode  :character      Mode  :character      
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  Shipping_Address_Zip Order_Status       Carrier_Name_Tracking_Number
##  Length:1016          Length:1016        Length:1016                 
##  Class :character     Class :character   Class :character            
##  Mode  :character     Mode  :character   Mode  :character            
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  Item_Subtotal    Item_Subtotal_Tax   Item_Total     Tax_Exemption_Applied
##  Min.   :  0.00   Min.   : 0.000    Min.   :  0.00   Mode :logical        
##  1st Qu.: 11.92   1st Qu.: 0.755    1st Qu.: 12.84   FALSE:914            
##  Median : 19.58   Median : 1.240    Median : 21.10   TRUE :102            
##  Mean   : 25.44   Mean   : 1.722    Mean   : 27.16                        
##  3rd Qu.: 26.97   3rd Qu.: 1.940    3rd Qu.: 28.87                        
##  Max.   :449.25   Max.   :34.820    Max.   :484.07                        
##                                                                           
##  Tax_Exemption_Type Exemption_OptOut  Buyer_Name          Currency        
##  Mode:logical       Mode :logical    Length:1016        Length:1016       
##  NA's:1016          FALSE:914        Class :character   Class :character  
##                     TRUE :101        Mode  :character   Mode  :character  
##                     NA's :1                                               
##                                                                           
##                                                                           
##                                                                           
##  Group_Name    
##  Mode:logical  
##  NA's:1016     
##                
##                
##                
##                
## 
#SCRUB DATA: replace empty cell in column with 'NA'
library(dplyr)  
class(amazonorders)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
edit_amazonorders <- edit_amazonorders %>% mutate_if(is.character, ~na_if(., ''))
print(edit_amazonorders)
## # A tibble: 1,016 × 35
##    Order_Date Order_ID Title    Category `ASIN/ISBN` Website Release_Date
##    <date>        <dbl> <chr>    <chr>    <chr>       <chr>   <dttm>      
##  1 2021-01-01  1.13e16 "PawMed… ANIMAL_… B085W81GTG  Amazon… NA          
##  2 2021-01-01  1.13e16 "ULTRAI… SLIPPER  B07RSPVR8H  Amazon… NA          
##  3 2021-01-01  1.13e16 "ULTRAI… SLIPPER  B07RSPTGGW  Amazon… NA          
##  4 2021-01-01  1.13e16 "Leathe… DESK_PAD B084FFRPLX  Amazon… NA          
##  5 2021-01-01  1.13e16 "Car Cu… CUP_HOL… B082VB76QX  Amazon… NA          
##  6 2021-01-01  1.14e16 "Arctic… HAT      B07HRGMV3C  Amazon… NA          
##  7 2021-01-01  1.14e16 "Arctic… HAT      B071RDGY5Z  Amazon… NA          
##  8 2021-01-01  1.14e16 "Lifewi… STORAGE… B07PCNGJP8  Amazon… NA          
##  9 2021-01-01  1.14e16 "1000 S… PROTECT… B08MVCRCPJ  Amazon… NA          
## 10 2021-01-08  1.14e16 "Henry’… COFFEE   B071CZV4LN  Amazon… NA          
## # … with 1,006 more rows, and 28 more variables: Condition <chr>, Seller <chr>,
## #   Seller_Credentials <chr>, List_Price_Per_Unit <dbl>,
## #   Purchase_Price_Per_Unit <dbl>, Quantity <dbl>,
## #   Payment_Instrument_Type <chr>, Purchase_Order_Number <lgl>,
## #   PO_Line_Number <lgl>, Ordering_Customer_Email <chr>, Shipment_Date <date>,
## #   Shipping_Address_Name <chr>, Shipping_Address_Street_1 <chr>,
## #   Shipping_Address_Street_2 <chr>, Shipping_Address_City <chr>, …
#plot
str(edit_amazonorders)
## tibble [1,016 × 35] (S3: tbl_df/tbl/data.frame)
##  $ Order_Date                  : Date[1:1016], format: "2021-01-01" "2021-01-01" ...
##  $ Order_ID                    : num [1:1016] 1.13e+16 1.13e+16 1.13e+16 1.13e+16 1.13e+16 ...
##  $ Title                       : chr [1:1016] "PawMedica Glucosamine for Dogs, Hip & Joint Care Chews for Dogs, Joint Support for Dog Joint Supplements with H"| __truncated__ "ULTRAIDEAS Women's Cozy Memory Foam Loafer Slippers with Warm Fleece Lining, Ladies Closed Back House Shoes wit"| __truncated__ "ULTRAIDEAS Women's Cozy Memory Foam Loafer Slippers with Warm Fleece Lining, Ladies Closed Back House Shoes wit"| __truncated__ "Leather Desk Pad Protector,Mouse Pad,Office Desk Mat, Non-Slip PU Leather Desk Blotter,Laptop Desk Pad,Waterpro"| __truncated__ ...
##  $ Category                    : chr [1:1016] "ANIMAL_NUTRITIONAL_SUPPLEMENT" "SLIPPER" "SLIPPER" "DESK_PAD" ...
##  $ ASIN/ISBN                   : chr [1:1016] "B085W81GTG" "B07RSPVR8H" "B07RSPTGGW" "B084FFRPLX" ...
##  $ Website                     : chr [1:1016] "Amazon.com" "Amazon.com" "Amazon.com" "Amazon.com" ...
##  $ Release_Date                : POSIXct[1:1016], format: NA NA ...
##  $ Condition                   : chr [1:1016] "new" "new" "new" "new" ...
##  $ Seller                      : chr [1:1016] "Trustwell Brands" "Vital Store" "Vital Store" "Aothia Life" ...
##  $ Seller_Credentials          : chr [1:1016] NA NA NA "ISO 9001" ...
##  $ List_Price_Per_Unit         : num [1:1016] 50 30 30 17 0 ...
##  $ Purchase_Price_Per_Unit     : num [1:1016] 45 22 22 10.1 14 ...
##  $ Quantity                    : num [1:1016] 1 1 1 1 1 1 1 3 1 1 ...
##  $ Payment_Instrument_Type     : chr [1:1016] "Visa - 7058" "Visa - 7058" "Visa - 7058" "Visa - 7058" ...
##  $ Purchase_Order_Number       : logi [1:1016] NA NA NA NA NA NA ...
##  $ PO_Line_Number              : logi [1:1016] NA NA NA NA NA NA ...
##  $ Ordering_Customer_Email     : chr [1:1016] "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" "kaseyn617@yahoo.com" ...
##  $ Shipment_Date               : Date[1:1016], format: "2021-01-07" "2021-01-07" ...
##  $ Shipping_Address_Name       : chr [1:1016] "Kasey Nguyen" "Kasey Nguyen" "Kasey Nguyen" "Kasey Nguyen" ...
##  $ Shipping_Address_Street_1   : chr [1:1016] "11782 Carlisle Ct" "11782 Carlisle Ct" "11782 Carlisle Ct" "11782 Carlisle Ct" ...
##  $ Shipping_Address_Street_2   : chr [1:1016] NA NA NA NA ...
##  $ Shipping_Address_City       : chr [1:1016] "Moreno Valley" "Moreno Valley" "Moreno Valley" "Moreno Valley" ...
##  $ Shipping_Address_State      : chr [1:1016] "Ca" "Ca" "Ca" "Ca" ...
##  $ Shipping_Address_Zip        : chr [1:1016] "92557" "92557" "92557" "92557" ...
##  $ Order_Status                : chr [1:1016] "Shipped" "Shipped" "Shipped" "Shipped" ...
##  $ Carrier_Name_Tracking_Number: chr [1:1016] "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" "AMZN_US(TBA291879438000)" ...
##  $ Item_Subtotal               : num [1:1016] 45 22 22 10.1 14 ...
##  $ Item_Subtotal_Tax           : num [1:1016] 3.48 1.71 1.71 0.78 1.08 ...
##  $ Item_Total                  : num [1:1016] 48.4 23.7 23.7 10.9 15.1 ...
##  $ Tax_Exemption_Applied       : logi [1:1016] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Tax_Exemption_Type          : logi [1:1016] NA NA NA NA NA NA ...
##  $ Exemption_OptOut            : logi [1:1016] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Buyer_Name                  : chr [1:1016] "Kase" "Kase" "Kase" "Kase" ...
##  $ Currency                    : chr [1:1016] "USD" "USD" "USD" "USD" ...
##  $ Group_Name                  : logi [1:1016] NA NA NA NA NA NA ...
quant <- amazonorders$Quantity
hist(quant)

#factor columns

edit_amazonorders$Order_Date <-factor(edit_amazonorders$Order_Date)
edit_amazonorders$Order_ID <-factor(edit_amazonorders$Order_ID)
edit_amazonorders$Category <-factor(edit_amazonorders$Category)
edit_amazonorders$Quantity <- cut(edit_amazonorders$Quantity,breaks=4,labels=c("very few","few","many","very many"))
edit_amazonorders$List_Price_Per_Unit <-factor(edit_amazonorders$List_Price_Per_Unit)
edit_amazonorders$Purchase_Price_Per_Unit <-factor(edit_amazonorders$Purchase_Price_Per_Unit)
edit_amazonorders$Item_Subtotal <-factor(edit_amazonorders$Item_Subtotal)
edit_amazonorders$Item_Subtotal_Tax <-factor(edit_amazonorders$Item_Subtotal_Tax)
edit_amazonorders$Item_Total<-factor(edit_amazonorders$Item_Total)
edit_amazonorders$Payment_Instrument_Type<-factor(edit_amazonorders$Payment_Instrument_Type)
edit_amazonorders$Shipment_Date <-factor(edit_amazonorders$Shipment_Date)
edit_amazonorders$Shipping_Address_Name <-factor(edit_amazonorders$Shipping_Address_Name)
edit_amazonorders$Shipping_Address_Street_1 <-factor(edit_amazonorders$Shipping_Address_Street_1)
edit_amazonorders$Shipping_Address_City<-factor(edit_amazonorders$Shipping_Address_City)
edit_amazonorders$Shipping_Address_State <-factor(edit_amazonorders$Shipping_Address_State)
edit_amazonorders$Shipping_Address_Zip <-factor(edit_amazonorders$Shipping_Address_Zip)
edit_amazonorders$Order_Status <-factor(edit_amazonorders$Order_Status)
edit_amazonorders$Carrier_Name_Tracking_Number <-factor(edit_amazonorders$Carrier_Name_Tracking_Number)
edit_amazonorders$Tax_Exemption_Applied <-factor(edit_amazonorders$Tax_Exemption_Applied)
edit_amazonorders$Exemption_OptOut <-factor(edit_amazonorders$Exemption_OptOut)
edit_amazonorders$Currency <-factor(edit_amazonorders$Currency)
#convert to transactions

class(edit_amazonorders$Category)
## [1] "factor"
edit_amazonorders$Category <- as.factor(edit_amazonorders$Category)
Subcategories <- subset(edit_amazonorders$Category, edit_amazonorders$Category != 'NA')
Transactions <- subset(edit_amazonorders$Quantity, edit_amazonorders$Quantity != '0')
Transactions_by_Subcategory <- split(Subcategories, Transactions)
## Warning in split.default(Subcategories, Transactions): data length is not a
## multiple of split variable
class(Transactions_by_Subcategory)
## [1] "list"
# Arules
ars <- apriori(Transactions_by_Subcategory, parameter=list(support=0.7, confidence=0.7))
## Warning in asMethod(object): removing duplicated items in transactions
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5     0.7      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 2 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[216 item(s), 4 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [43 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
summary(ars)
## set of 43 rules
## 
## rule length distribution (lhs + rhs):sizes
##  1  2  3  4 
##  6 18 15  4 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.395   3.000   4.000 
## 
## summary of quality measures:
##     support         confidence        coverage          lift      
##  Min.   :0.7500   Min.   :0.7500   Min.   :0.750   Min.   :1.000  
##  1st Qu.:0.7500   1st Qu.:1.0000   1st Qu.:0.750   1st Qu.:1.000  
##  Median :0.7500   Median :1.0000   Median :0.750   Median :1.333  
##  Mean   :0.7558   Mean   :0.9419   Mean   :0.814   Mean   :1.171  
##  3rd Qu.:0.7500   3rd Qu.:1.0000   3rd Qu.:0.875   3rd Qu.:1.333  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000   Max.   :1.333  
##      count      
##  Min.   :3.000  
##  1st Qu.:3.000  
##  Median :3.000  
##  Mean   :3.023  
##  3rd Qu.:3.000  
##  Max.   :4.000  
## 
## mining info:
##                         data ntransactions support confidence
##  Transactions_by_Subcategory             4     0.7        0.7
##                                                                                            call
##  apriori(data = Transactions_by_Subcategory, parameter = list(support = 0.7, confidence = 0.7))
class(ars)
## [1] "rules"
## attr(,"package")
## [1] "arules"
inspect(head(ars,10, by="lift"))
##      lhs                                rhs                      support
## [1]  {SKIN_MOISTURIZER}              => {PET_FOOD}               0.75   
## [2]  {PET_FOOD}                      => {SKIN_MOISTURIZER}       0.75   
## [3]  {SAUTE_FRY_PAN}                 => {HAT}                    0.75   
## [4]  {HAT}                           => {SAUTE_FRY_PAN}          0.75   
## [5]  {SAUTE_FRY_PAN}                 => {PET_ACTIVITY_STRUCTURE} 0.75   
## [6]  {PET_ACTIVITY_STRUCTURE}        => {SAUTE_FRY_PAN}          0.75   
## [7]  {HAT}                           => {PET_ACTIVITY_STRUCTURE} 0.75   
## [8]  {PET_ACTIVITY_STRUCTURE}        => {HAT}                    0.75   
## [9]  {PET_BED_MAT, SKIN_MOISTURIZER} => {PET_FOOD}               0.75   
## [10] {PET_BED_MAT, PET_FOOD}         => {SKIN_MOISTURIZER}       0.75   
##      confidence coverage lift     count
## [1]  1          0.75     1.333333 3    
## [2]  1          0.75     1.333333 3    
## [3]  1          0.75     1.333333 3    
## [4]  1          0.75     1.333333 3    
## [5]  1          0.75     1.333333 3    
## [6]  1          0.75     1.333333 3    
## [7]  1          0.75     1.333333 3    
## [8]  1          0.75     1.333333 3    
## [9]  1          0.75     1.333333 3    
## [10] 1          0.75     1.333333 3
library(arulesViz)
plot(ars, method = "graph",
measure = "confidence", shading = "lift")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.