This document provides information about:

Load and inspect data

str(df)
## tibble [9,994 x 21] (S3: tbl_df/tbl/data.frame)
##  $ Row ID       : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Order ID     : chr [1:9994] "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
##  $ Order Date   : POSIXct[1:9994], format: "2016-11-08" "2016-11-08" ...
##  $ Ship Date    : POSIXct[1:9994], format: "2016-11-11" "2016-11-11" ...
##  $ Ship Mode    : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
##  $ Customer ID  : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
##  $ Customer Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
##  $ Segment      : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
##  $ Country      : chr [1:9994] "United States" "United States" "United States" "United States" ...
##  $ City         : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
##  $ State        : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
##  $ Postal Code  : num [1:9994] 42420 42420 90036 33311 33311 ...
##  $ Region       : chr [1:9994] "South" "South" "West" "South" ...
##  $ Product ID   : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
##  $ Category     : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
##  $ Sub-Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
##  $ Product Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
##  $ Sales        : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
##  $ Quantity     : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
##  $ Discount     : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
##  $ Profit       : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
head(df)
## # A tibble: 6 x 21
##   `Row ID` `Order ID` `Order Date`        `Ship Date`         `Ship Mode`
##      <dbl> <chr>      <dttm>              <dttm>              <chr>      
## 1        1 CA-2016-1~ 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Cla~
## 2        2 CA-2016-1~ 2016-11-08 00:00:00 2016-11-11 00:00:00 Second Cla~
## 3        3 CA-2016-1~ 2016-06-12 00:00:00 2016-06-16 00:00:00 Second Cla~
## 4        4 US-2015-1~ 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard C~
## 5        5 US-2015-1~ 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard C~
## 6        6 CA-2014-1~ 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard C~
## # ... with 16 more variables: `Customer ID` <chr>, `Customer Name` <chr>,
## #   Segment <chr>, Country <chr>, City <chr>, State <chr>, `Postal Code` <dbl>,
## #   Region <chr>, `Product ID` <chr>, Category <chr>, `Sub-Category` <chr>,
## #   `Product Name` <chr>, Sales <dbl>, Quantity <dbl>, Discount <dbl>,
## #   Profit <dbl>

Trim spaces in colnames to prevent confusion

names(df) <- gsub(" ", "_", names(df)) #trim spaces
names(df) <- gsub("-", "_", names(df)) #change - with _
names(df)
##  [1] "Row_ID"        "Order_ID"      "Order_Date"    "Ship_Date"    
##  [5] "Ship_Mode"     "Customer_ID"   "Customer_Name" "Segment"      
##  [9] "Country"       "City"          "State"         "Postal_Code"  
## [13] "Region"        "Product_ID"    "Category"      "Sub_Category" 
## [17] "Product_Name"  "Sales"         "Quantity"      "Discount"     
## [21] "Profit"

Testing for missing values

missing<-sapply(df, function(x) sum(is.na(x)))
missing
##        Row_ID      Order_ID    Order_Date     Ship_Date     Ship_Mode 
##             0             0             0             0             0 
##   Customer_ID Customer_Name       Segment       Country          City 
##             0             0             0             0             0 
##         State   Postal_Code        Region    Product_ID      Category 
##             0             0             0             0             0 
##  Sub_Category  Product_Name         Sales      Quantity      Discount 
##             0             0             0             0             0 
##        Profit 
##             0

There is no null values.

Select specific columns only

profilinganalysis <- df %>%
  select(Customer_ID,Sub_Category,Sales)
head(profilinganalysis)
## # A tibble: 6 x 3
##   Customer_ID Sub_Category Sales
##   <chr>       <chr>        <dbl>
## 1 CG-12520    Bookcases    262. 
## 2 CG-12520    Chairs       732. 
## 3 DV-13045    Labels        14.6
## 4 SO-20335    Tables       958. 
## 5 SO-20335    Storage       22.4
## 6 BH-11710    Furnishings   48.9

Data transformation with dcast; transform subcategories from long format into wide format then sum by Sales Revenue

profilinganalysis$Sub_Category = as.factor(profilinganalysis$Sub_Category)
purchasedist<- dcast(profilinganalysis, Customer_ID  ~ Sub_Category, value.var = "Sales", fun.aggregate = sum)
head(purchasedist)
##   Customer_ID Accessories Appliances    Art  Binders Bookcases   Chairs Copiers
## 1    AA-10315       41.72     415.92  0.000  685.108     0.000    0.000       0
## 2    AA-10375      649.95       0.00  5.248   74.004     0.000    0.000       0
## 3    AA-10480      479.97       0.00  8.640    0.000     0.000    0.000       0
## 4    AA-10645        0.00       0.00  5.280  113.940  1279.165 2347.888       0
## 5    AB-10015        0.00       0.00 17.940    0.000   341.960   48.712       0
## 6    AB-10060      195.74     160.32  0.000 4404.216     0.000  449.568       0
##   Envelopes Fasteners Furnishings Labels Machines   Paper  Phones Storage
## 1      0.00     2.304       14.56      0        0  14.940 431.976  26.960
## 2      0.00     0.000       28.40      0        0  80.080 116.980 101.728
## 3      0.00     0.000       27.46      0        0 108.522 209.970 508.110
## 4     43.96     0.000      123.66      0        0 119.280 354.682 699.080
## 5      0.00     0.000        0.00      0        0   0.000 221.980 255.564
## 6      0.00     0.000      127.88      0        0 353.106   0.000   0.000
##   Supplies  Tables
## 1 3930.072    0.00
## 2    0.000    0.00
## 3    0.000  447.84
## 4    0.000    0.00
## 5    0.000    0.00
## 6   27.930 2036.86

Add some more variables

##Add TotalRevenue column
purchasedist$TotalSales <- as.numeric(apply(purchasedist[,2:18], 1, sum))
#count how many subcategories each customer has bought
purchasedist$TotalCat<- apply(purchasedist[2:18], 1, function(x) sum(x>0))

head(purchasedist)
##   Customer_ID Accessories Appliances    Art  Binders Bookcases   Chairs Copiers
## 1    AA-10315       41.72     415.92  0.000  685.108     0.000    0.000       0
## 2    AA-10375      649.95       0.00  5.248   74.004     0.000    0.000       0
## 3    AA-10480      479.97       0.00  8.640    0.000     0.000    0.000       0
## 4    AA-10645        0.00       0.00  5.280  113.940  1279.165 2347.888       0
## 5    AB-10015        0.00       0.00 17.940    0.000   341.960   48.712       0
## 6    AB-10060      195.74     160.32  0.000 4404.216     0.000  449.568       0
##   Envelopes Fasteners Furnishings Labels Machines   Paper  Phones Storage
## 1      0.00     2.304       14.56      0        0  14.940 431.976  26.960
## 2      0.00     0.000       28.40      0        0  80.080 116.980 101.728
## 3      0.00     0.000       27.46      0        0 108.522 209.970 508.110
## 4     43.96     0.000      123.66      0        0 119.280 354.682 699.080
## 5      0.00     0.000        0.00      0        0   0.000 221.980 255.564
## 6      0.00     0.000      127.88      0        0 353.106   0.000   0.000
##   Supplies  Tables TotalSales TotalCat
## 1 3930.072    0.00   5563.560        9
## 2    0.000    0.00   1056.390        7
## 3    0.000  447.84   1790.512        7
## 4    0.000    0.00   5086.935        9
## 5    0.000    0.00    886.156        5
## 6   27.930 2036.86   7755.620        8

Add subcategory names of max and min values

##max subcategory
purchasedist$maxcol<-apply(purchasedist[2:18], 1, function(x) colnames(purchasedist[2:18])[which.max(x)])

##min subcategory
#this is more tricky since we want to eliminate 0 values and find first >0 value
min <- purchasedist[1:18] %>% 
  pivot_longer(cols =colnames(purchasedist[2:18]))%>%
  filter(value > 0) %>%
  group_by(Customer_ID) %>%
  summarise(mincol = name[order(value)][1]
  ) %>%
  select(Customer_ID,mincol)
## `summarise()` ungrouping output (override with `.groups` argument)
purchasedist<- left_join(purchasedist,min) 
## Joining, by = "Customer_ID"
rm(min)

head(purchasedist)
##   Customer_ID Accessories Appliances    Art  Binders Bookcases   Chairs Copiers
## 1    AA-10315       41.72     415.92  0.000  685.108     0.000    0.000       0
## 2    AA-10375      649.95       0.00  5.248   74.004     0.000    0.000       0
## 3    AA-10480      479.97       0.00  8.640    0.000     0.000    0.000       0
## 4    AA-10645        0.00       0.00  5.280  113.940  1279.165 2347.888       0
## 5    AB-10015        0.00       0.00 17.940    0.000   341.960   48.712       0
## 6    AB-10060      195.74     160.32  0.000 4404.216     0.000  449.568       0
##   Envelopes Fasteners Furnishings Labels Machines   Paper  Phones Storage
## 1      0.00     2.304       14.56      0        0  14.940 431.976  26.960
## 2      0.00     0.000       28.40      0        0  80.080 116.980 101.728
## 3      0.00     0.000       27.46      0        0 108.522 209.970 508.110
## 4     43.96     0.000      123.66      0        0 119.280 354.682 699.080
## 5      0.00     0.000        0.00      0        0   0.000 221.980 255.564
## 6      0.00     0.000      127.88      0        0 353.106   0.000   0.000
##   Supplies  Tables TotalSales TotalCat      maxcol    mincol
## 1 3930.072    0.00   5563.560        9    Supplies Fasteners
## 2    0.000    0.00   1056.390        7 Accessories       Art
## 3    0.000  447.84   1790.512        7     Storage       Art
## 4    0.000    0.00   5086.935        9      Chairs       Art
## 5    0.000    0.00    886.156        5   Bookcases       Art
## 6   27.930 2036.86   7755.620        8     Binders  Supplies

add max-min ratio

for (i in 1:nrow(purchasedist)){
  purchasedist$MaxRatio[i]<-  (purchasedist[i,which(colnames(purchasedist[1:18]) == purchasedist$maxcol[i])])/(purchasedist$TotalSales[i])
  purchasedist$MinRatio[i]<-  (purchasedist[i,which(colnames(purchasedist[1:18]) == purchasedist$mincol[i])])/(purchasedist$TotalSales[i])
}
head(purchasedist)
##   Customer_ID Accessories Appliances    Art  Binders Bookcases   Chairs Copiers
## 1    AA-10315       41.72     415.92  0.000  685.108     0.000    0.000       0
## 2    AA-10375      649.95       0.00  5.248   74.004     0.000    0.000       0
## 3    AA-10480      479.97       0.00  8.640    0.000     0.000    0.000       0
## 4    AA-10645        0.00       0.00  5.280  113.940  1279.165 2347.888       0
## 5    AB-10015        0.00       0.00 17.940    0.000   341.960   48.712       0
## 6    AB-10060      195.74     160.32  0.000 4404.216     0.000  449.568       0
##   Envelopes Fasteners Furnishings Labels Machines   Paper  Phones Storage
## 1      0.00     2.304       14.56      0        0  14.940 431.976  26.960
## 2      0.00     0.000       28.40      0        0  80.080 116.980 101.728
## 3      0.00     0.000       27.46      0        0 108.522 209.970 508.110
## 4     43.96     0.000      123.66      0        0 119.280 354.682 699.080
## 5      0.00     0.000        0.00      0        0   0.000 221.980 255.564
## 6      0.00     0.000      127.88      0        0 353.106   0.000   0.000
##   Supplies  Tables TotalSales TotalCat      maxcol    mincol  MaxRatio
## 1 3930.072    0.00   5563.560        9    Supplies Fasteners 0.7063952
## 2    0.000    0.00   1056.390        7 Accessories       Art 0.6152557
## 3    0.000  447.84   1790.512        7     Storage       Art 0.2837792
## 4    0.000    0.00   5086.935        9      Chairs       Art 0.4615526
## 5    0.000    0.00    886.156        5   Bookcases       Art 0.3858914
## 6   27.930 2036.86   7755.620        8     Binders  Supplies 0.5678741
##       MinRatio
## 1 0.0004141233
## 2 0.0049678622
## 3 0.0048254354
## 4 0.0010379531
## 5 0.0202447425
## 6 0.0036012595

With this table we can target customers more precisely