Awesome_Chocolates_Data_Analysis

Author

Shadrack Yegon

Joining Awesome Chocolates Data

Awesome_Chocolatate_image

#Loading the libraries
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr   1.1.4     ✔ readr   2.1.5
✔ forcats 1.0.0     ✔ stringr 1.5.1
✔ ggplot2 3.5.1     ✔ tibble  3.2.1
✔ purrr   1.0.2     ✔ tidyr   1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Loading the data
sales<-read.csv("Sales.csv")
Geo<-read.csv("GEO.csv")
People<-read.csv("People.csv")
Products<-read.csv("Products.csv")
sales %>% head(5)
  SPID GeoID PID   SaleDate Amount Customers Boxes
1 SP01    G2 P16 2021-02-01  10983       185   379
2 SP01    G1 P22 2021-02-02   2436       203    94
3 SP01    G1 P02 2021-02-02   3521        50   235
4 SP01    G2 P20 2021-02-04   3724       322   266
5 SP01    G4 P13 2021-02-08   6678        24   334
Products %>% head(5)
  PID         Product Category  Size Cost_per_box
1 P01       Milk Bars     Bars LARGE         1.52
2 P02  50% Dark Bites    Bites LARGE         2.57
3 P03    Almond Choco     Bars LARGE         9.60
4 P04 Raspberry Choco     Bars LARGE         4.09
5 P05 Mint Chip Choco     Bars LARGE         1.54
People %>% head(5)
          Salesperson SPID    Team  Location
1        Barr Faughny SP01 Yummies Hyderabad
2 Dennison Crosswaite SP02 Yummies Hyderabad
3     Gunar Cockshoot SP03 Yummies Hyderabad
4      Wilone O'Kielt SP04  Delish Hyderabad
5        Gigi Bohling SP05  Delish Hyderabad
Geo %>% head(5)
  GeoID         Geo   Region
1    G1       India     APAC
2    G2         USA Americas
3    G3      Canada Americas
4    G4 New Zealand     APAC
5    G5   Australia     APAC

This analysis seeks to find:

  1. What are the names of salespersons who had at least one shipment (sale) in the first 7 days of January 2021?

    #Assessing the top 5 data
    sales %>% head(5)
      SPID GeoID PID   SaleDate Amount Customers Boxes
    1 SP01    G2 P16 2021-02-01  10983       185   379
    2 SP01    G1 P22 2021-02-02   2436       203    94
    3 SP01    G1 P02 2021-02-02   3521        50   235
    4 SP01    G2 P20 2021-02-04   3724       322   266
    5 SP01    G4 P13 2021-02-08   6678        24   334
    People %>% head(5)
              Salesperson SPID    Team  Location
    1        Barr Faughny SP01 Yummies Hyderabad
    2 Dennison Crosswaite SP02 Yummies Hyderabad
    3     Gunar Cockshoot SP03 Yummies Hyderabad
    4      Wilone O'Kielt SP04  Delish Hyderabad
    5        Gigi Bohling SP05  Delish Hyderabad
#Joining the sales and  persons data
#Return all rows from 'sales'  and matching rows from the 'People'
Sales_persons<-left_join(sales,People,by=c("SPID"="SPID"))
Sales_persons %>% head(5)
  SPID GeoID PID   SaleDate Amount Customers Boxes  Salesperson    Team
1 SP01    G2 P16 2021-02-01  10983       185   379 Barr Faughny Yummies
2 SP01    G1 P22 2021-02-02   2436       203    94 Barr Faughny Yummies
3 SP01    G1 P02 2021-02-02   3521        50   235 Barr Faughny Yummies
4 SP01    G2 P20 2021-02-04   3724       322   266 Barr Faughny Yummies
5 SP01    G4 P13 2021-02-08   6678        24   334 Barr Faughny Yummies
   Location
1 Hyderabad
2 Hyderabad
3 Hyderabad
4 Hyderabad
5 Hyderabad
#structure of the joined dataframe
Sales_persons %>% glimpse()
Rows: 993
Columns: 10
$ SPID        <chr> "SP01", "SP01", "SP01", "SP01", "SP01", "SP01", "SP01", "S…
$ GeoID       <chr> "G2", "G1", "G1", "G2", "G4", "G1", "G3", "G4", "G2", "G6"…
$ PID         <chr> "P16", "P22", "P02", "P20", "P13", "P20", "P17", "P19", "P…
$ SaleDate    <chr> "2021-02-01", "2021-02-02", "2021-02-02", "2021-02-04", "2…
$ Amount      <int> 10983, 2436, 3521, 3724, 6678, 13293, 1750, 8099, 4249, 80…
$ Customers   <int> 185, 203, 50, 322, 24, 199, 9, 178, 19, 48, 150, 387, 86, …
$ Boxes       <int> 379, 94, 235, 266, 334, 665, 63, 675, 164, 54, 67, 50, 367…
$ Salesperson <chr> "Barr Faughny", "Barr Faughny", "Barr Faughny", "Barr Faug…
$ Team        <chr> "Yummies", "Yummies", "Yummies", "Yummies", "Yummies", "Yu…
$ Location    <chr> "Hyderabad", "Hyderabad", "Hyderabad", "Hyderabad", "Hyder…
#changing 'saleDate' datatype to date
Sales_persons$SaleDate<-as.Date(Sales_persons$SaleDate)
#confirming the data type of the new date
Sales_persons$SaleDate %>% class()
[1] "Date"
DATE1 <- as.Date("2021-01-01")
DATE2 <- as.Date("2021-01-07")
#filtering the sales that occured between the first week of Jan,2021
Sales_persons_filtered<-Sales_persons %>% select(Salesperson,SaleDate)%>% 
  filter(between(SaleDate,DATE1,DATE2))
#displaying filtered data
Sales_persons_filtered
           Salesperson   SaleDate
1      Gunar Cockshoot 2021-01-06
2       Wilone O'Kielt 2021-01-01
3       Wilone O'Kielt 2021-01-05
4       Wilone O'Kielt 2021-01-06
5         Gigi Bohling 2021-01-07
6       Curtice Advani 2021-01-01
7       Curtice Advani 2021-01-01
8       Curtice Advani 2021-01-04
9       Curtice Advani 2021-01-05
10         Kaine Padly 2021-01-05
11         Kaine Padly 2021-01-05
12         Kaine Padly 2021-01-06
13        Ches Bonnell 2021-01-06
14        Ches Bonnell 2021-01-07
15      Andria Kimpton 2021-01-01
16      Andria Kimpton 2021-01-06
17      Andria Kimpton 2021-01-07
18         Brien Boise 2021-01-01
19         Brien Boise 2021-01-01
20         Brien Boise 2021-01-06
21        Husein Augar 2021-01-04
22        Husein Augar 2021-01-05
23        Husein Augar 2021-01-06
24    Karlen McCaffrey 2021-01-01
25    Karlen McCaffrey 2021-01-01
26    Karlen McCaffrey 2021-01-04
27        Jan Morforth 2021-01-05
28        Jan Morforth 2021-01-06
29      Dotty Strutley 2021-01-01
30       Kelci Walkden 2021-01-01
31       Kelci Walkden 2021-01-04
32       Kelci Walkden 2021-01-07
33      Marney O'Breen 2021-01-05
34 Rafaelita Blaksland 2021-01-01
35 Rafaelita Blaksland 2021-01-05
36     Madelene Upcott 2021-01-01
37     Madelene Upcott 2021-01-04
38     Madelene Upcott 2021-01-05
39     Madelene Upcott 2021-01-05
40     Madelene Upcott 2021-01-07
41      Beverie Moffet 2021-01-01
42      Beverie Moffet 2021-01-01
43      Beverie Moffet 2021-01-04
44      Beverie Moffet 2021-01-07
45          Oby Sorrel 2021-01-01
46          Oby Sorrel 2021-01-04
47          Oby Sorrel 2021-01-06
48      Jehu Rudeforth 2021-01-01
49      Jehu Rudeforth 2021-01-01
50      Jehu Rudeforth 2021-01-01
51         Van Tuxwell 2021-01-04
52         Van Tuxwell 2021-01-05
53      Camilla Castle 2021-01-01
# Select unique salespersons who made sales in the first 7 days
salespersons_in_first_week<-unique(Sales_persons_filtered$Salesperson)
#displaying uniques sales persons 
salespersons_in_first_week
 [1] "Gunar Cockshoot"     "Wilone O'Kielt"      "Gigi Bohling"       
 [4] "Curtice Advani"      "Kaine Padly"         "Ches Bonnell"       
 [7] "Andria Kimpton"      "Brien Boise"         "Husein Augar"       
[10] "Karlen McCaffrey"    "Jan Morforth"        "Dotty Strutley"     
[13] "Kelci Walkden"       "Marney O'Breen"      "Rafaelita Blaksland"
[16] "Madelene Upcott"     "Beverie Moffet"      "Oby Sorrel"         
[19] "Jehu Rudeforth"      "Van Tuxwell"         "Camilla Castle"     
  1. Which salespersons did not make any shipments in the first 7 days of January 2021?
#Select unique salespersons who made sales in the first 7 days

DATE1 <- as.Date("2021-01-01")
DATE2 <- as.Date("2021-01-07")
#filtering for the sales that happened in the first week of jan,2021
Sales_persons_filtered<-Sales_persons %>% select(Salesperson,SaleDate)%>% 
  filter(between(SaleDate,DATE1,DATE2))
# Select unique salespersons who made sales in the first 7 days
salespersons_in_first_week<-unique(Sales_persons_filtered$Salesperson)
salespersons_in_first_week
 [1] "Gunar Cockshoot"     "Wilone O'Kielt"      "Gigi Bohling"       
 [4] "Curtice Advani"      "Kaine Padly"         "Ches Bonnell"       
 [7] "Andria Kimpton"      "Brien Boise"         "Husein Augar"       
[10] "Karlen McCaffrey"    "Jan Morforth"        "Dotty Strutley"     
[13] "Kelci Walkden"       "Marney O'Breen"      "Rafaelita Blaksland"
[16] "Madelene Upcott"     "Beverie Moffet"      "Oby Sorrel"         
[19] "Jehu Rudeforth"      "Van Tuxwell"         "Camilla Castle"     
#List of all salespersons in the joined data
all_salespersons<-unique(Sales_persons$Salesperson)
all_salespersons
 [1] "Barr Faughny"        "Dennison Crosswaite" "Gigi Bohling"       
 [4] "Curtice Advani"      "Ches Bonnell"        "Brien Boise"        
 [7] "Karlen McCaffrey"    "Rafaelita Blaksland" "Beverie Moffet"     
[10] "Jehu Rudeforth"      "Roddy Speechley"     "Kelci Walkden"      
[13] "Marney O'Breen"      "Van Tuxwell"         "Wilone O'Kielt"     
[16] "Husein Augar"        "Jan Morforth"        "Dotty Strutley"     
[19] "Gunar Cockshoot"     "Kaine Padly"         "Andria Kimpton"     
[22] "Mallorie Waber"      "Madelene Upcott"     "Oby Sorrel"         
[25] "Camilla Castle"     
# Find salespersons who did NOT make sales in the first 7 days
salespersons_no_sales <- setdiff(all_salespersons, salespersons_in_first_week)
# Display the result
salespersons_no_sales
[1] "Barr Faughny"        "Dennison Crosswaite" "Roddy Speechley"    
[4] "Mallorie Waber"     
  1. Which product sold more boxes in the first 7 days of February 2021? Milk Bars or Eclairs?
This question is solved by joining sales and products data.
sales %>% head(5)
  SPID GeoID PID   SaleDate Amount Customers Boxes
1 SP01    G2 P16 2021-02-01  10983       185   379
2 SP01    G1 P22 2021-02-02   2436       203    94
3 SP01    G1 P02 2021-02-02   3521        50   235
4 SP01    G2 P20 2021-02-04   3724       322   266
5 SP01    G4 P13 2021-02-08   6678        24   334
Products %>% head(5)
  PID         Product Category  Size Cost_per_box
1 P01       Milk Bars     Bars LARGE         1.52
2 P02  50% Dark Bites    Bites LARGE         2.57
3 P03    Almond Choco     Bars LARGE         9.60
4 P04 Raspberry Choco     Bars LARGE         4.09
5 P05 Mint Chip Choco     Bars LARGE         1.54
#Joining the products and sales data
sales_product<-left_join(sales,Products,by=c("PID"="PID"))
sales_product %>% head(5)
  SPID GeoID PID   SaleDate Amount Customers Boxes             Product Category
1 SP01    G2 P16 2021-02-01  10983       185   379 Organic Choco Syrup    Other
2 SP01    G1 P22 2021-02-02   2436       203    94 Peanut Butter Cubes    Bites
3 SP01    G1 P02 2021-02-02   3521        50   235      50% Dark Bites    Bites
4 SP01    G2 P20 2021-02-04   3724       322   266  Smooth Sliky Salty     Bars
5 SP01    G4 P13 2021-02-08   6678        24   334       85% Dark Bars     Bars
   Size Cost_per_box
1 SMALL         1.47
2 SMALL         3.57
3 LARGE         2.57
4 SMALL         2.97
5 SMALL         0.64
sales_product %>% glimpse()
Rows: 993
Columns: 11
$ SPID         <chr> "SP01", "SP01", "SP01", "SP01", "SP01", "SP01", "SP01", "…
$ GeoID        <chr> "G2", "G1", "G1", "G2", "G4", "G1", "G3", "G4", "G2", "G6…
$ PID          <chr> "P16", "P22", "P02", "P20", "P13", "P20", "P17", "P19", "…
$ SaleDate     <chr> "2021-02-01", "2021-02-02", "2021-02-02", "2021-02-04", "…
$ Amount       <int> 10983, 2436, 3521, 3724, 6678, 13293, 1750, 8099, 4249, 8…
$ Customers    <int> 185, 203, 50, 322, 24, 199, 9, 178, 19, 48, 150, 387, 86,…
$ Boxes        <int> 379, 94, 235, 266, 334, 665, 63, 675, 164, 54, 67, 50, 36…
$ Product      <chr> "Organic Choco Syrup", "Peanut Butter Cubes", "50% Dark B…
$ Category     <chr> "Other", "Bites", "Bites", "Bars", "Bars", "Bars", "Bars"…
$ Size         <chr> "SMALL", "SMALL", "LARGE", "SMALL", "SMALL", "SMALL", "SM…
$ Cost_per_box <dbl> 1.47, 3.57, 2.57, 2.97, 0.64, 2.97, 0.54, 4.97, 5.79, 0.1…
#Converting saleDate datatype to date
sales_product$SaleDate<-as.Date(sales_product$SaleDate)
date1<-as.Date("2021-02-01")
date2<-as.Date("2021-02-07")
#Filtering out Milk Bars or Eclairs  sold  within the first 7 days of February 2021
Sales_product_filtered<-sales_product %>% select(Product,Boxes,SaleDate) %>% filter(between(SaleDate,date1,date2) & Product %in%c('Milk Bars','Eclairs'))
#Diaplaying the result
Sales_product_filtered
     Product Boxes   SaleDate
1  Milk Bars   233 2021-02-01
2    Eclairs   276 2021-02-05
3  Milk Bars   282 2021-02-01
4    Eclairs   265 2021-02-02
5  Milk Bars   294 2021-02-03
6  Milk Bars   194 2021-02-04
7    Eclairs   581 2021-02-05
8    Eclairs   275 2021-02-05
9    Eclairs   166 2021-02-04
10 Milk Bars   338 2021-02-04
11 Milk Bars   240 2021-02-03
#Finding the product with the highest boxes sold
Sales_product_filtered %>%
  group_by(Product) %>% 
  summarise(`Total boxes`=sum(Boxes)) %>% 
  arrange(desc(`Total boxes`))
# A tibble: 2 × 2
  Product   `Total boxes`
  <chr>             <int>
1 Milk Bars          1581
2 Eclairs            1563
  1. How many boxes were shipped to every country?

    This is achieved by joining sales and the Geography table

    sales %>% head(5)
      SPID GeoID PID   SaleDate Amount Customers Boxes
    1 SP01    G2 P16 2021-02-01  10983       185   379
    2 SP01    G1 P22 2021-02-02   2436       203    94
    3 SP01    G1 P02 2021-02-02   3521        50   235
    4 SP01    G2 P20 2021-02-04   3724       322   266
    5 SP01    G4 P13 2021-02-08   6678        24   334
    Geo %>% head(5)
      GeoID         Geo   Region
    1    G1       India     APAC
    2    G2         USA Americas
    3    G3      Canada Americas
    4    G4 New Zealand     APAC
    5    G5   Australia     APAC
    #Joining sales and GEO data
    Sales_Geo<-left_join(sales,Geo,by=c("GeoID"="GeoID"))
    Sales_Geo %>% head(5)
      SPID GeoID PID   SaleDate Amount Customers Boxes         Geo   Region
    1 SP01    G2 P16 2021-02-01  10983       185   379         USA Americas
    2 SP01    G1 P22 2021-02-02   2436       203    94       India     APAC
    3 SP01    G1 P02 2021-02-02   3521        50   235       India     APAC
    4 SP01    G2 P20 2021-02-04   3724       322   266         USA Americas
    5 SP01    G4 P13 2021-02-08   6678        24   334 New Zealand     APAC
    Sales_Geo %>% select(Geo,Boxes,Amount) %>%
      group_by(Geo) %>% summarise(`Total Boxes`=sum(Boxes)) %>% 
      arrange(desc(`Total Boxes`))
    # A tibble: 6 × 2
      Geo         `Total Boxes`
      <chr>               <int>
    1 India               62738
    2 Canada              55731
    3 UK                  54023
    4 USA                 53563
    5 New Zealand         50666
    6 Australia           43240