Product_Sales

Onyedika Obiakarije 1/18/2022

Questions are:

● What was the best month for sales? How much was earned that month?

● What city sold the most product?

● What time should we display advertisements to maximize the likelihood of customers buying products?

● What products are most often sold together?

● What product sold the most? Why do you think it sold the most?

Import Necessary Liberaries

library(tidyverse) #for data wrangling

## Warning: package 'tidyverse' was built under R version 4.0.5

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'purrr' was built under R version 4.0.5

## Warning: package 'dplyr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate) #for time and date

## Warning: package 'lubridate' was built under R version 4.0.5

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr) #for bind_row
library(ggplot2) #for visualize
library(plotly) #for visualize

## Warning: package 'plotly' was built under R version 4.0.5

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

theme_set(theme_bw(16)) #to make the x-axis test appear vertical.
library(qdapRegex) #extract the city and state from purchase_address column

## 
## Attaching package: 'qdapRegex'

## The following object is masked from 'package:dplyr':
## 
##     explain

## The following object is masked from 'package:ggplot2':
## 
##     %+%

Set working directories.

setwd("C:/Users/harry/Desktop/Class project/sales")

Load Dataset and Verify

# load datasets
Jan_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_January_2019.csv")

## Rows: 9723 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Feb_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_February_2019.csv")

## Rows: 12036 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Mar_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_March_2019.csv")

## Rows: 15226 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Apr_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_April_2019.csv")

## Rows: 18383 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

May_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_May_2019.csv")

## Rows: 16635 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Jun_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_June_2019.csv")

## Rows: 13622 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Jul_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_July_2019.csv")

## Rows: 14371 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Aug_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_August_2019.csv")

## Rows: 12011 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Sep_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_September_2019.csv")

## Rows: 11686 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Oct_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_October_2019.csv")

## Rows: 20379 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Nov_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_November_2019.csv")

## Rows: 17661 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Dec_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_December_2019.csv")

## Rows: 25117 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# a quick check to see if all column names are named equally
colnames(Dec_2019)

## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"      
## [5] "Order Date"       "Purchase Address"

colnames(Mar_2019)

## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"      
## [5] "Order Date"       "Purchase Address"

colnames(Jun_2019)

## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"      
## [5] "Order Date"       "Purchase Address"

colnames(Sep_2019)

## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"      
## [5] "Order Date"       "Purchase Address"

Merge Datasets

#Combining the data sets into Seasons
Winter <- bind_rows(Dec_2019, Jan_2019, Feb_2019) #as q1

Spring <- bind_rows(Mar_2019,Apr_2019, May_2019)  #as q2
  
Summer <- bind_rows(Jun_2019, Jul_2019, Aug_2019) #as q3

Autumn <- bind_rows(Sep_2019, Oct_2019, Nov_2019) #as q4

#Inspect the data
str(Winter)

## spec_tbl_df [46,876 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Order ID        : chr [1:46876] "295665" "295666" "295667" "295668" ...
##  $ Product         : chr [1:46876] "Macbook Pro Laptop" "LG Washing Machine" "USB-C Charging Cable" "27in FHD Monitor" ...
##  $ Quantity Ordered: chr [1:46876] "1" "1" "1" "1" ...
##  $ Price Each      : chr [1:46876] "1700" "600.0" "11.95" "149.99" ...
##  $ Order Date      : chr [1:46876] "12/30/19 00:01" "12/29/19 07:03" "12/12/19 18:21" "12/22/19 15:13" ...
##  $ Purchase Address: chr [1:46876] "136 Church St, New York City, NY 10001" "562 2nd St, New York City, NY 10001" "277 Main St, New York City, NY 10001" "410 6th St, San Francisco, CA 94016" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Order ID` = col_character(),
##   ..   Product = col_character(),
##   ..   `Quantity Ordered` = col_character(),
##   ..   `Price Each` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Purchase Address` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(Spring)

## spec_tbl_df [50,244 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Order ID        : chr [1:50244] "162009" "162009" "162009" "162010" ...
##  $ Product         : chr [1:50244] "iPhone" "Lightning Charging Cable" "Wired Headphones" "Bose SoundSport Headphones" ...
##  $ Quantity Ordered: chr [1:50244] "1" "1" "2" "1" ...
##  $ Price Each      : chr [1:50244] "700" "14.95" "11.99" "99.99" ...
##  $ Order Date      : chr [1:50244] "03/28/19 20:59" "03/28/19 20:59" "03/28/19 20:59" "03/17/19 05:39" ...
##  $ Purchase Address: chr [1:50244] "942 Church St, Austin, TX 73301" "942 Church St, Austin, TX 73301" "942 Church St, Austin, TX 73301" "261 10th St, San Francisco, CA 94016" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Order ID` = col_character(),
##   ..   Product = col_character(),
##   ..   `Quantity Ordered` = col_character(),
##   ..   `Price Each` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Purchase Address` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(Summer)

## spec_tbl_df [40,004 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Order ID        : chr [1:40004] "209921" "209922" "209923" "209924" ...
##  $ Product         : chr [1:40004] "USB-C Charging Cable" "Macbook Pro Laptop" "ThinkPad Laptop" "27in FHD Monitor" ...
##  $ Quantity Ordered: chr [1:40004] "1" "1" "1" "1" ...
##  $ Price Each      : chr [1:40004] "11.95" "1700.0" "999.99" "149.99" ...
##  $ Order Date      : chr [1:40004] "06/23/19 19:34" "06/30/19 10:05" "06/24/19 20:18" "06/05/19 10:21" ...
##  $ Purchase Address: chr [1:40004] "950 Walnut St, Portland, ME 04101" "80 4th St, San Francisco, CA 94016" "402 Jackson St, Los Angeles, CA 90001" "560 10th St, Seattle, WA 98101" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Order ID` = col_character(),
##   ..   Product = col_character(),
##   ..   `Quantity Ordered` = col_character(),
##   ..   `Price Each` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Purchase Address` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(Autumn)

## spec_tbl_df [49,726 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Order ID        : chr [1:49726] "248151" "248152" "248153" "248154" ...
##  $ Product         : chr [1:49726] "AA Batteries (4-pack)" "USB-C Charging Cable" "USB-C Charging Cable" "27in FHD Monitor" ...
##  $ Quantity Ordered: chr [1:49726] "4" "2" "1" "1" ...
##  $ Price Each      : chr [1:49726] "3.84" "11.95" "11.95" "149.99" ...
##  $ Order Date      : chr [1:49726] "09/17/19 14:44" "09/29/19 10:19" "09/16/19 17:48" "09/27/19 07:52" ...
##  $ Purchase Address: chr [1:49726] "380 North St, Los Angeles, CA 90001" "511 8th St, Austin, TX 73301" "151 Johnson St, Los Angeles, CA 90001" "355 Hickory St, Seattle, WA 98101" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Order ID` = col_character(),
##   ..   Product = col_character(),
##   ..   `Quantity Ordered` = col_character(),
##   ..   `Price Each` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Purchase Address` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

#Combine all data into one
all_sales <- bind_rows(Jan_2019, Feb_2019, Mar_2019,
                     Apr_2019, May_2019, Jun_2019,
                     Jul_2019, Aug_2019, Sep_2019,
                     Oct_2019, Nov_2019, Dec_2019)
#View(all_sales)

colnames(all_sales) #Show colnames

## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"      
## [5] "Order Date"       "Purchase Address"

dim(all_sales) #Dimensions of the data

## [1] 186850      6

head(all_sales) #load the first 6 row of the data

## # A tibble: 6 x 6
##   `Order ID` Product `Quantity Orde~` `Price Each` `Order Date` `Purchase Addr~`
##   <chr>      <chr>   <chr>            <chr>        <chr>        <chr>           
## 1 141234     iPhone  1                700          01/22/19 21~ 944 Walnut St, ~
## 2 141235     Lightn~ 1                14.95        01/28/19 14~ 185 Maple St, P~
## 3 141236     Wired ~ 2                11.99        01/17/19 13~ 538 Adams St, S~
## 4 141237     27in F~ 1                149.99       01/05/19 20~ 738 10th St, Lo~
## 5 141238     Wired ~ 1                11.99        01/25/19 11~ 387 10th St, Au~
## 6 141239     AAA Ba~ 1                2.99         01/29/19 20~ 775 Willow St, ~

summary(all_sales) #Shows the statistical summary

##    Order ID           Product          Quantity Ordered    Price Each       
##  Length:186850      Length:186850      Length:186850      Length:186850     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##   Order Date        Purchase Address  
##  Length:186850      Length:186850     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Cleaning the Dataset.

# get the sum of NA value.
sum(is.na(all_sales))

## [1] 3270

# the value of NA sum tells me that entire rows were left empty
# So I am going to remove all NA value instead of replacing NA value

#drop values with NA
all_sales <- na.omit(all_sales)

# get the sum of NA value after cleaning the data for verfication.
sum(is.na(all_sales))

## [1] 0

Cleaning for Question 1

#Save file as a CSV
#all_sales %>% write.csv("clean_sales.csv")

#change all column names aside from "Product"
names(all_sales)[5] <- 'Order_Date'
names(all_sales)[6] <- 'Purchase_Address'
names(all_sales)[1] <- 'Order_ID'
names(all_sales)[3] <- 'Quantity_Ordered'
names(all_sales)[4] <- 'Price_Each'


#rename(all_sales, Order_Data = Order Date)

#get the name of the column in position 6
#names(all_sales)[6]

Convert some columns for easy of caculation

# Change the data type of Quantity ordered to get the Sales number for the sales column
all_sales <- mutate(all_sales, Quantity_Ordered = as.numeric(Quantity_Ordered))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

# Change the data type of Price Each to get the Sales number for the sales column
all_sales <- mutate(all_sales, Price_Each = as.numeric(Price_Each))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

sapply(all_sales, class)

##         Order_ID          Product Quantity_Ordered       Price_Each 
##      "character"      "character"        "numeric"        "numeric" 
##       Order_Date Purchase_Address 
##      "character"      "character"

Question 1

● What was the best month for sales? How much was earned that month?

Task 1: Convert the “Order Date” Column to datetime

# Convert the "Order_Date" datatype to datetime
all_sales$Order_Date = as.POSIXct(all_sales$Order_Date, format = "%m/%d/%y %H:%M")

# Delete the NA value that shows after the previous code

# get the sum of NA value.
sum(is.na(all_sales))

## [1] 1068

# the value of NA sum tells me that entire rows were left empty
# So I am going to remove all NA value instead of replacing NA value

#drop values with NA
all_sales <- na.omit(all_sales)

# get the sum of NA value.
sum(is.na(all_sales))

## [1] 0

Task 1: Create a new column for “Months”

all_sales$Month = format(all_sales$Order_Date, "%m")

Task 2: Create a new column for “Sales”

all_sales$Sales <- all_sales$Quantity_Ordered * all_sales$Price_Each

Answer to Question 1

# Group Month and Sales together
all_sales %>% group_by(Month) %>% summarise(sales = sum(Sales))

## # A tibble: 12 x 2
##    Month    sales
##    <chr>    <dbl>
##  1 01    1822257.
##  2 02    2202022.
##  3 03    2806547.
##  4 04    3390670.
##  5 05    3152607.
##  6 06    2577802.
##  7 07    2647776.
##  8 08    2244468.
##  9 09    2097560.
## 10 10    3736727.
## 11 11    3199603.
## 12 12    4613443.

#quantile(result$sales, probs = c(.25, .5, .75))

#Visualization
ggplot(all_sales, mapping = aes(x = Month, y = Sales)) +
  geom_col() +
  theme_classic() + #make the plot look nice
  labs(title = "Best Month for Sales", x = "Months", y = "Sales in hundred thousands $") #labels

Question 2

● What city sold the most product?

Task 1: Create a new column and name it “City”

#Get the city and the state out of the "Purchase_Address" column
all_sales$City <- ex_city_state(all_sales$Purchase_Address)

# Change City column data type to character
all_sales <- mutate(all_sales, City = as.character(City))
sapply(all_sales, class)

## $Order_ID
## [1] "character"
## 
## $Product
## [1] "character"
## 
## $Quantity_Ordered
## [1] "numeric"
## 
## $Price_Each
## [1] "numeric"
## 
## $Order_Date
## [1] "POSIXct" "POSIXt" 
## 
## $Purchase_Address
## [1] "character"
## 
## $Month
## [1] "character"
## 
## $Sales
## [1] "numeric"
## 
## $City
## [1] "character"

Answer to Question 2

#Group City and Sales column together
all_sales %>% group_by(City) %>% summarise(sales = sum(Sales))

## # A tibble: 10 x 2
##    City                 sales
##    <chr>                <dbl>
##  1 Atlanta, GA       2795499.
##  2 Austin, TX        1819432.
##  3 Boston, MA        3661642.
##  4 Dallas, TX        2767975.
##  5 Los Angeles, CA   5452571.
##  6 New York City, NY 4664314.
##  7 Portland, ME       449758.
##  8 Portland, OR      1870732.
##  9 San Francisco, CA 8261804.
## 10 Seattle, WA       2747755.

#visualization
ggplot(all_sales, aes(x=City, y=Sales)) +
  geom_col() +
  theme_classic() + #make the plot look nice
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  #make the x label vertical
  labs(title = "City sold the most product", x = "Cities", y = "Sales in hundred thousands $")  #labels

Question 3

● What time should we display advertisements to maximize the likelihood of customers buying products?

Task 1: Create a new column for hour and minute from the order_date column

#get the hour and minute from the "order_date" into the hour and minute column
all_sales$Hour = format(all_sales$Order_Date, "%H")

all_sales$Minute = format(all_sales$Order_Date, "%M")

Answer to Question 3

#group the hour and the hour seq a hour is counted
all_sales %>% group_by(Hour) %>% summarise(hour_count = n())

## # A tibble: 24 x 2
##    Hour  hour_count
##    <chr>      <int>
##  1 00          3910
##  2 01          2350
##  3 02          1240
##  4 03           831
##  5 04           854
##  6 05          1321
##  7 06          2482
##  8 07          4011
##  9 08          6256
## 10 09          8748
## # ... with 14 more rows

#visualization
all_sales %>% group_by(Hour) %>% summarise(hour_count = n()) %>%
ggplot(all_sales, mapping = aes(x=Hour, y=hour_count)) +
  geom_line() +
  geom_point() +
  #scale_y_continuous(breaks = seq(0, 16000, by =2000)) +
  #scale_y_continuous(name = "Hours count", limits = c(0, 15000)) +  #expand the y-axis
  theme_classic() +  #make the plot look nice
  theme(panel.grid.major = element_line(linetype = "dashed")) +  #make a dashed grid
  labs(title = "Time suitable to display advertisements", x = "Hours", y = "Hours count")  #labels

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

Question 4

● What products are most often sold together?

temp_sales <- 
  all_sales[duplicated(all_sales$Order_ID), ]

temp_sales <- all_sales[duplicated(all_sales$Order_ID) | duplicated(all_sales$Order_ID, fromLast = TRUE), ]

temp_sales

## # A tibble: 14,649 x 11
##    Order_ID Product              Quantity_Ordered Price_Each Order_Date         
##    <chr>    <chr>                           <dbl>      <dbl> <dttm>             
##  1 141275   USB-C Charging Cable                1      12.0  2019-01-07 16:06:00
##  2 141275   Wired Headphones                    1      12.0  2019-01-07 16:06:00
##  3 141290   Apple Airpods Headp~                1     150    2019-01-02 08:25:00
##  4 141290   AA Batteries (4-pac~                3       3.84 2019-01-02 08:25:00
##  5 141365   Vareebadd Phone                     1     400    2019-01-10 11:19:00
##  6 141365   Wired Headphones                    1      12.0  2019-01-10 11:19:00
##  7 141384   Google Phone                        1     600    2019-01-03 00:14:00
##  8 141384   USB-C Charging Cable                1      12.0  2019-01-03 00:14:00
##  9 141450   Google Phone                        1     600    2019-01-12 11:16:00
## 10 141450   Bose SoundSport Hea~                1     100.   2019-01-12 11:16:00
## # ... with 14,639 more rows, and 6 more variables: Purchase_Address <chr>,
## #   Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>

#temp_sales$Group_Product <-temp_sales %>% group_by(Order_ID, Product) %>%
  #mutate(pro = paste('temp_sales$Product', row_number(), sep = ", ")) 
  #spread(pro, Product)%>%
  #paste
#temp_sales

temp <- temp_sales %>%
              group_by(Order_ID) %>%
              mutate(`Group_Product` = paste0(Product, collapse = ", ")) %>%
              ungroup()
temp

## # A tibble: 14,649 x 12
##    Order_ID Product              Quantity_Ordered Price_Each Order_Date         
##    <chr>    <chr>                           <dbl>      <dbl> <dttm>             
##  1 141275   USB-C Charging Cable                1      12.0  2019-01-07 16:06:00
##  2 141275   Wired Headphones                    1      12.0  2019-01-07 16:06:00
##  3 141290   Apple Airpods Headp~                1     150    2019-01-02 08:25:00
##  4 141290   AA Batteries (4-pac~                3       3.84 2019-01-02 08:25:00
##  5 141365   Vareebadd Phone                     1     400    2019-01-10 11:19:00
##  6 141365   Wired Headphones                    1      12.0  2019-01-10 11:19:00
##  7 141384   Google Phone                        1     600    2019-01-03 00:14:00
##  8 141384   USB-C Charging Cable                1      12.0  2019-01-03 00:14:00
##  9 141450   Google Phone                        1     600    2019-01-12 11:16:00
## 10 141450   Bose SoundSport Hea~                1     100.   2019-01-12 11:16:00
## # ... with 14,639 more rows, and 7 more variables: Purchase_Address <chr>,
## #   Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>,
## #   Group_Product <chr>

temp[!duplicated(temp$Group_Product), ]

## # A tibble: 366 x 12
##    Order_ID Product              Quantity_Ordered Price_Each Order_Date         
##    <chr>    <chr>                           <dbl>      <dbl> <dttm>             
##  1 141275   USB-C Charging Cable                1       12.0 2019-01-07 16:06:00
##  2 141290   Apple Airpods Headp~                1      150   2019-01-02 08:25:00
##  3 141365   Vareebadd Phone                     1      400   2019-01-10 11:19:00
##  4 141384   Google Phone                        1      600   2019-01-03 00:14:00
##  5 141450   Google Phone                        1      600   2019-01-12 11:16:00
##  6 141457   iPhone                              1      700   2019-01-09 22:11:00
##  7 141478   Google Phone                        1      600   2019-01-26 13:32:00
##  8 141550   iPhone                              1      700   2019-01-31 10:58:00
##  9 141587   USB-C Charging Cable                1       12.0 2019-01-29 15:14:00
## 10 141645   Lightning Charging ~                1       15.0 2019-01-30 15:30:00
## # ... with 356 more rows, and 7 more variables: Purchase_Address <chr>,
## #   Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>,
## #   Group_Product <chr>

Question 5

● What product sold the most? Why do you think it sold the most?

Task 1: Create a new column for hour and minute from the order_date column

all_sales %>% group_by(Product) %>% summarise(Quantity = sum(Quantity_Ordered))

## # A tibble: 19 x 2
##    Product                    Quantity
##    <chr>                         <dbl>
##  1 20in Monitor                   4129
##  2 27in 4K Gaming Monitor         6244
##  3 27in FHD Monitor               7550
##  4 34in Ultrawide Monitor         6199
##  5 AA Batteries (4-pack)         27635
##  6 AAA Batteries (4-pack)        31016
##  7 Apple Airpods Headphones      15660
##  8 Bose SoundSport Headphones    13457
##  9 Flatscreen TV                  4819
## 10 Google Phone                   5532
## 11 iPhone                         6849
## 12 LG Dryer                        646
## 13 LG Washing Machine              666
## 14 Lightning Charging Cable      23217
## 15 Macbook Pro Laptop             4728
## 16 ThinkPad Laptop                4130
## 17 USB-C Charging Cable          23975
## 18 Vareebadd Phone                2067
## 19 Wired Headphones              20557

new <- all_sales %>% group_by(Product, Price_Each) %>% summarise(Quantity = sum(Quantity_Ordered), Mean_Price = (mean(Price_Each)))

## `summarise()` has grouped output by 'Product'. You can override using the `.groups` argument.

new

## # A tibble: 19 x 4
## # Groups:   Product [19]
##    Product                    Price_Each Quantity Mean_Price
##    <chr>                           <dbl>    <dbl>      <dbl>
##  1 20in Monitor                   110.       4129     110.  
##  2 27in 4K Gaming Monitor         390.       6244     390.  
##  3 27in FHD Monitor               150.       7550     150.  
##  4 34in Ultrawide Monitor         380.       6199     380.  
##  5 AA Batteries (4-pack)            3.84    27635       3.84
##  6 AAA Batteries (4-pack)           2.99    31016       2.99
##  7 Apple Airpods Headphones       150       15660     150   
##  8 Bose SoundSport Headphones     100.      13457     100.  
##  9 Flatscreen TV                  300        4819     300   
## 10 Google Phone                   600        5532     600   
## 11 iPhone                         700        6849     700   
## 12 LG Dryer                       600         646     600   
## 13 LG Washing Machine             600         666     600   
## 14 Lightning Charging Cable        15.0     23217      15.0 
## 15 Macbook Pro Laptop            1700        4728    1700   
## 16 ThinkPad Laptop               1000.       4130    1000.  
## 17 USB-C Charging Cable            12.0     23975      12.0 
## 18 Vareebadd Phone                400        2067     400   
## 19 Wired Headphones                12.0     20557      12.0

#visualization
all_sales %>% group_by(Product) %>% summarise(Quantity = sum(Quantity_Ordered)) %>%
ggplot(all_sales, mapping = aes(x=Product, y=Quantity)) +
  geom_col () +
  #geom_line() +
  theme_classic() + #make the plot look nice
  #scale_y_continuous(name = "Product", limits = c(0, 100))
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  #make the x label vertical
  labs(title = "Product sold the most", x = "Products", y = "Quantity Ordered") #labels

p <- ggplot(new, aes(x = Product))
   p <- p + geom_col(aes(y = Quantity))
 
 p <- p + geom_point(aes(y = Price_Each), col="red")
 
 
   p <- p + scale_y_continuous(sec.axis = sec_axis(~. *0.03, name = "Price [$]"))
 
p <- p + labs(
              title = "Product sold the most", 
              x = "Products", 
              y = "Quantity Ordered"
                  )
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))  

#p <- p + theme_classic() 
    
p