Onyedika Obiakarije 1/18/2022
● What was the best month for sales? How much was earned that month?
● What city sold the most product?
● What time should we display advertisements to maximize the likelihood of customers buying products?
● What products are most often sold together?
● What product sold the most? Why do you think it sold the most?
library(tidyverse) #for data wrangling## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate) #for time and date## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr) #for bind_row
library(ggplot2) #for visualize
library(plotly) #for visualize## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
theme_set(theme_bw(16)) #to make the x-axis test appear vertical.
library(qdapRegex) #extract the city and state from purchase_address column##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
##
## explain
## The following object is masked from 'package:ggplot2':
##
## %+%
setwd("C:/Users/harry/Desktop/Class project/sales")# load datasets
Jan_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_January_2019.csv")## Rows: 9723 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Feb_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_February_2019.csv")## Rows: 12036 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Mar_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_March_2019.csv")## Rows: 15226 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Apr_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_April_2019.csv")## Rows: 18383 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
May_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_May_2019.csv")## Rows: 16635 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Jun_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_June_2019.csv")## Rows: 13622 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Jul_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_July_2019.csv")## Rows: 14371 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Aug_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_August_2019.csv")## Rows: 12011 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Sep_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_September_2019.csv")## Rows: 11686 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Oct_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_October_2019.csv")## Rows: 20379 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Nov_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_November_2019.csv")## Rows: 17661 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Dec_2019 <- read_csv("/Users/harry/Desktop/Class project/sales/Sales_December_2019.csv")## Rows: 25117 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Order ID, Product, Quantity Ordered, Price Each, Order Date, Purcha...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# a quick check to see if all column names are named equally
colnames(Dec_2019)## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
colnames(Mar_2019)## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
colnames(Jun_2019)## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
colnames(Sep_2019)## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
#Combining the data sets into Seasons
Winter <- bind_rows(Dec_2019, Jan_2019, Feb_2019) #as q1
Spring <- bind_rows(Mar_2019,Apr_2019, May_2019) #as q2
Summer <- bind_rows(Jun_2019, Jul_2019, Aug_2019) #as q3
Autumn <- bind_rows(Sep_2019, Oct_2019, Nov_2019) #as q4#Inspect the data
str(Winter)## spec_tbl_df [46,876 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Order ID : chr [1:46876] "295665" "295666" "295667" "295668" ...
## $ Product : chr [1:46876] "Macbook Pro Laptop" "LG Washing Machine" "USB-C Charging Cable" "27in FHD Monitor" ...
## $ Quantity Ordered: chr [1:46876] "1" "1" "1" "1" ...
## $ Price Each : chr [1:46876] "1700" "600.0" "11.95" "149.99" ...
## $ Order Date : chr [1:46876] "12/30/19 00:01" "12/29/19 07:03" "12/12/19 18:21" "12/22/19 15:13" ...
## $ Purchase Address: chr [1:46876] "136 Church St, New York City, NY 10001" "562 2nd St, New York City, NY 10001" "277 Main St, New York City, NY 10001" "410 6th St, San Francisco, CA 94016" ...
## - attr(*, "spec")=
## .. cols(
## .. `Order ID` = col_character(),
## .. Product = col_character(),
## .. `Quantity Ordered` = col_character(),
## .. `Price Each` = col_character(),
## .. `Order Date` = col_character(),
## .. `Purchase Address` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Spring)## spec_tbl_df [50,244 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Order ID : chr [1:50244] "162009" "162009" "162009" "162010" ...
## $ Product : chr [1:50244] "iPhone" "Lightning Charging Cable" "Wired Headphones" "Bose SoundSport Headphones" ...
## $ Quantity Ordered: chr [1:50244] "1" "1" "2" "1" ...
## $ Price Each : chr [1:50244] "700" "14.95" "11.99" "99.99" ...
## $ Order Date : chr [1:50244] "03/28/19 20:59" "03/28/19 20:59" "03/28/19 20:59" "03/17/19 05:39" ...
## $ Purchase Address: chr [1:50244] "942 Church St, Austin, TX 73301" "942 Church St, Austin, TX 73301" "942 Church St, Austin, TX 73301" "261 10th St, San Francisco, CA 94016" ...
## - attr(*, "spec")=
## .. cols(
## .. `Order ID` = col_character(),
## .. Product = col_character(),
## .. `Quantity Ordered` = col_character(),
## .. `Price Each` = col_character(),
## .. `Order Date` = col_character(),
## .. `Purchase Address` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Summer)## spec_tbl_df [40,004 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Order ID : chr [1:40004] "209921" "209922" "209923" "209924" ...
## $ Product : chr [1:40004] "USB-C Charging Cable" "Macbook Pro Laptop" "ThinkPad Laptop" "27in FHD Monitor" ...
## $ Quantity Ordered: chr [1:40004] "1" "1" "1" "1" ...
## $ Price Each : chr [1:40004] "11.95" "1700.0" "999.99" "149.99" ...
## $ Order Date : chr [1:40004] "06/23/19 19:34" "06/30/19 10:05" "06/24/19 20:18" "06/05/19 10:21" ...
## $ Purchase Address: chr [1:40004] "950 Walnut St, Portland, ME 04101" "80 4th St, San Francisco, CA 94016" "402 Jackson St, Los Angeles, CA 90001" "560 10th St, Seattle, WA 98101" ...
## - attr(*, "spec")=
## .. cols(
## .. `Order ID` = col_character(),
## .. Product = col_character(),
## .. `Quantity Ordered` = col_character(),
## .. `Price Each` = col_character(),
## .. `Order Date` = col_character(),
## .. `Purchase Address` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Autumn)## spec_tbl_df [49,726 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Order ID : chr [1:49726] "248151" "248152" "248153" "248154" ...
## $ Product : chr [1:49726] "AA Batteries (4-pack)" "USB-C Charging Cable" "USB-C Charging Cable" "27in FHD Monitor" ...
## $ Quantity Ordered: chr [1:49726] "4" "2" "1" "1" ...
## $ Price Each : chr [1:49726] "3.84" "11.95" "11.95" "149.99" ...
## $ Order Date : chr [1:49726] "09/17/19 14:44" "09/29/19 10:19" "09/16/19 17:48" "09/27/19 07:52" ...
## $ Purchase Address: chr [1:49726] "380 North St, Los Angeles, CA 90001" "511 8th St, Austin, TX 73301" "151 Johnson St, Los Angeles, CA 90001" "355 Hickory St, Seattle, WA 98101" ...
## - attr(*, "spec")=
## .. cols(
## .. `Order ID` = col_character(),
## .. Product = col_character(),
## .. `Quantity Ordered` = col_character(),
## .. `Price Each` = col_character(),
## .. `Order Date` = col_character(),
## .. `Purchase Address` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
#Combine all data into one
all_sales <- bind_rows(Jan_2019, Feb_2019, Mar_2019,
Apr_2019, May_2019, Jun_2019,
Jul_2019, Aug_2019, Sep_2019,
Oct_2019, Nov_2019, Dec_2019)
#View(all_sales)colnames(all_sales) #Show colnames## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
dim(all_sales) #Dimensions of the data## [1] 186850 6
head(all_sales) #load the first 6 row of the data## # A tibble: 6 x 6
## `Order ID` Product `Quantity Orde~` `Price Each` `Order Date` `Purchase Addr~`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 141234 iPhone 1 700 01/22/19 21~ 944 Walnut St, ~
## 2 141235 Lightn~ 1 14.95 01/28/19 14~ 185 Maple St, P~
## 3 141236 Wired ~ 2 11.99 01/17/19 13~ 538 Adams St, S~
## 4 141237 27in F~ 1 149.99 01/05/19 20~ 738 10th St, Lo~
## 5 141238 Wired ~ 1 11.99 01/25/19 11~ 387 10th St, Au~
## 6 141239 AAA Ba~ 1 2.99 01/29/19 20~ 775 Willow St, ~
summary(all_sales) #Shows the statistical summary## Order ID Product Quantity Ordered Price Each
## Length:186850 Length:186850 Length:186850 Length:186850
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Order Date Purchase Address
## Length:186850 Length:186850
## Class :character Class :character
## Mode :character Mode :character
# get the sum of NA value.
sum(is.na(all_sales))## [1] 3270
# the value of NA sum tells me that entire rows were left empty
# So I am going to remove all NA value instead of replacing NA value
#drop values with NA
all_sales <- na.omit(all_sales)# get the sum of NA value after cleaning the data for verfication.
sum(is.na(all_sales))## [1] 0
#Save file as a CSV
#all_sales %>% write.csv("clean_sales.csv")#change all column names aside from "Product"
names(all_sales)[5] <- 'Order_Date'
names(all_sales)[6] <- 'Purchase_Address'
names(all_sales)[1] <- 'Order_ID'
names(all_sales)[3] <- 'Quantity_Ordered'
names(all_sales)[4] <- 'Price_Each'
#rename(all_sales, Order_Data = Order Date)
#get the name of the column in position 6
#names(all_sales)[6]# Change the data type of Quantity ordered to get the Sales number for the sales column
all_sales <- mutate(all_sales, Quantity_Ordered = as.numeric(Quantity_Ordered))## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
# Change the data type of Price Each to get the Sales number for the sales column
all_sales <- mutate(all_sales, Price_Each = as.numeric(Price_Each))## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
sapply(all_sales, class)## Order_ID Product Quantity_Ordered Price_Each
## "character" "character" "numeric" "numeric"
## Order_Date Purchase_Address
## "character" "character"
● What was the best month for sales? How much was earned that month?
# Convert the "Order_Date" datatype to datetime
all_sales$Order_Date = as.POSIXct(all_sales$Order_Date, format = "%m/%d/%y %H:%M")# Delete the NA value that shows after the previous code
# get the sum of NA value.
sum(is.na(all_sales))## [1] 1068
# the value of NA sum tells me that entire rows were left empty
# So I am going to remove all NA value instead of replacing NA value
#drop values with NA
all_sales <- na.omit(all_sales)# get the sum of NA value.
sum(is.na(all_sales))## [1] 0
all_sales$Month = format(all_sales$Order_Date, "%m")all_sales$Sales <- all_sales$Quantity_Ordered * all_sales$Price_Each# Group Month and Sales together
all_sales %>% group_by(Month) %>% summarise(sales = sum(Sales))## # A tibble: 12 x 2
## Month sales
## <chr> <dbl>
## 1 01 1822257.
## 2 02 2202022.
## 3 03 2806547.
## 4 04 3390670.
## 5 05 3152607.
## 6 06 2577802.
## 7 07 2647776.
## 8 08 2244468.
## 9 09 2097560.
## 10 10 3736727.
## 11 11 3199603.
## 12 12 4613443.
#quantile(result$sales, probs = c(.25, .5, .75))#Visualization
ggplot(all_sales, mapping = aes(x = Month, y = Sales)) +
geom_col() +
theme_classic() + #make the plot look nice
labs(title = "Best Month for Sales", x = "Months", y = "Sales in hundred thousands $") #labels● What city sold the most product?
#Get the city and the state out of the "Purchase_Address" column
all_sales$City <- ex_city_state(all_sales$Purchase_Address)# Change City column data type to character
all_sales <- mutate(all_sales, City = as.character(City))
sapply(all_sales, class)## $Order_ID
## [1] "character"
##
## $Product
## [1] "character"
##
## $Quantity_Ordered
## [1] "numeric"
##
## $Price_Each
## [1] "numeric"
##
## $Order_Date
## [1] "POSIXct" "POSIXt"
##
## $Purchase_Address
## [1] "character"
##
## $Month
## [1] "character"
##
## $Sales
## [1] "numeric"
##
## $City
## [1] "character"
#Group City and Sales column together
all_sales %>% group_by(City) %>% summarise(sales = sum(Sales))## # A tibble: 10 x 2
## City sales
## <chr> <dbl>
## 1 Atlanta, GA 2795499.
## 2 Austin, TX 1819432.
## 3 Boston, MA 3661642.
## 4 Dallas, TX 2767975.
## 5 Los Angeles, CA 5452571.
## 6 New York City, NY 4664314.
## 7 Portland, ME 449758.
## 8 Portland, OR 1870732.
## 9 San Francisco, CA 8261804.
## 10 Seattle, WA 2747755.
#visualization
ggplot(all_sales, aes(x=City, y=Sales)) +
geom_col() +
theme_classic() + #make the plot look nice
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #make the x label vertical
labs(title = "City sold the most product", x = "Cities", y = "Sales in hundred thousands $") #labels● What time should we display advertisements to maximize the likelihood of customers buying products?
#get the hour and minute from the "order_date" into the hour and minute column
all_sales$Hour = format(all_sales$Order_Date, "%H")
all_sales$Minute = format(all_sales$Order_Date, "%M")#group the hour and the hour seq a hour is counted
all_sales %>% group_by(Hour) %>% summarise(hour_count = n())## # A tibble: 24 x 2
## Hour hour_count
## <chr> <int>
## 1 00 3910
## 2 01 2350
## 3 02 1240
## 4 03 831
## 5 04 854
## 6 05 1321
## 7 06 2482
## 8 07 4011
## 9 08 6256
## 10 09 8748
## # ... with 14 more rows
#visualization
all_sales %>% group_by(Hour) %>% summarise(hour_count = n()) %>%
ggplot(all_sales, mapping = aes(x=Hour, y=hour_count)) +
geom_line() +
geom_point() +
#scale_y_continuous(breaks = seq(0, 16000, by =2000)) +
#scale_y_continuous(name = "Hours count", limits = c(0, 15000)) + #expand the y-axis
theme_classic() + #make the plot look nice
theme(panel.grid.major = element_line(linetype = "dashed")) + #make a dashed grid
labs(title = "Time suitable to display advertisements", x = "Hours", y = "Hours count") #labels## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
● What products are most often sold together?
temp_sales <-
all_sales[duplicated(all_sales$Order_ID), ]temp_sales <- all_sales[duplicated(all_sales$Order_ID) | duplicated(all_sales$Order_ID, fromLast = TRUE), ]
temp_sales## # A tibble: 14,649 x 11
## Order_ID Product Quantity_Ordered Price_Each Order_Date
## <chr> <chr> <dbl> <dbl> <dttm>
## 1 141275 USB-C Charging Cable 1 12.0 2019-01-07 16:06:00
## 2 141275 Wired Headphones 1 12.0 2019-01-07 16:06:00
## 3 141290 Apple Airpods Headp~ 1 150 2019-01-02 08:25:00
## 4 141290 AA Batteries (4-pac~ 3 3.84 2019-01-02 08:25:00
## 5 141365 Vareebadd Phone 1 400 2019-01-10 11:19:00
## 6 141365 Wired Headphones 1 12.0 2019-01-10 11:19:00
## 7 141384 Google Phone 1 600 2019-01-03 00:14:00
## 8 141384 USB-C Charging Cable 1 12.0 2019-01-03 00:14:00
## 9 141450 Google Phone 1 600 2019-01-12 11:16:00
## 10 141450 Bose SoundSport Hea~ 1 100. 2019-01-12 11:16:00
## # ... with 14,639 more rows, and 6 more variables: Purchase_Address <chr>,
## # Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>
#temp_sales$Group_Product <-temp_sales %>% group_by(Order_ID, Product) %>%
#mutate(pro = paste('temp_sales$Product', row_number(), sep = ", "))
#spread(pro, Product)%>%
#paste
#temp_salestemp <- temp_sales %>%
group_by(Order_ID) %>%
mutate(`Group_Product` = paste0(Product, collapse = ", ")) %>%
ungroup()
temp## # A tibble: 14,649 x 12
## Order_ID Product Quantity_Ordered Price_Each Order_Date
## <chr> <chr> <dbl> <dbl> <dttm>
## 1 141275 USB-C Charging Cable 1 12.0 2019-01-07 16:06:00
## 2 141275 Wired Headphones 1 12.0 2019-01-07 16:06:00
## 3 141290 Apple Airpods Headp~ 1 150 2019-01-02 08:25:00
## 4 141290 AA Batteries (4-pac~ 3 3.84 2019-01-02 08:25:00
## 5 141365 Vareebadd Phone 1 400 2019-01-10 11:19:00
## 6 141365 Wired Headphones 1 12.0 2019-01-10 11:19:00
## 7 141384 Google Phone 1 600 2019-01-03 00:14:00
## 8 141384 USB-C Charging Cable 1 12.0 2019-01-03 00:14:00
## 9 141450 Google Phone 1 600 2019-01-12 11:16:00
## 10 141450 Bose SoundSport Hea~ 1 100. 2019-01-12 11:16:00
## # ... with 14,639 more rows, and 7 more variables: Purchase_Address <chr>,
## # Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>,
## # Group_Product <chr>
temp[!duplicated(temp$Group_Product), ]## # A tibble: 366 x 12
## Order_ID Product Quantity_Ordered Price_Each Order_Date
## <chr> <chr> <dbl> <dbl> <dttm>
## 1 141275 USB-C Charging Cable 1 12.0 2019-01-07 16:06:00
## 2 141290 Apple Airpods Headp~ 1 150 2019-01-02 08:25:00
## 3 141365 Vareebadd Phone 1 400 2019-01-10 11:19:00
## 4 141384 Google Phone 1 600 2019-01-03 00:14:00
## 5 141450 Google Phone 1 600 2019-01-12 11:16:00
## 6 141457 iPhone 1 700 2019-01-09 22:11:00
## 7 141478 Google Phone 1 600 2019-01-26 13:32:00
## 8 141550 iPhone 1 700 2019-01-31 10:58:00
## 9 141587 USB-C Charging Cable 1 12.0 2019-01-29 15:14:00
## 10 141645 Lightning Charging ~ 1 15.0 2019-01-30 15:30:00
## # ... with 356 more rows, and 7 more variables: Purchase_Address <chr>,
## # Month <chr>, Sales <dbl>, City <chr>, Hour <chr>, Minute <chr>,
## # Group_Product <chr>
● What product sold the most? Why do you think it sold the most?
all_sales %>% group_by(Product) %>% summarise(Quantity = sum(Quantity_Ordered))## # A tibble: 19 x 2
## Product Quantity
## <chr> <dbl>
## 1 20in Monitor 4129
## 2 27in 4K Gaming Monitor 6244
## 3 27in FHD Monitor 7550
## 4 34in Ultrawide Monitor 6199
## 5 AA Batteries (4-pack) 27635
## 6 AAA Batteries (4-pack) 31016
## 7 Apple Airpods Headphones 15660
## 8 Bose SoundSport Headphones 13457
## 9 Flatscreen TV 4819
## 10 Google Phone 5532
## 11 iPhone 6849
## 12 LG Dryer 646
## 13 LG Washing Machine 666
## 14 Lightning Charging Cable 23217
## 15 Macbook Pro Laptop 4728
## 16 ThinkPad Laptop 4130
## 17 USB-C Charging Cable 23975
## 18 Vareebadd Phone 2067
## 19 Wired Headphones 20557
new <- all_sales %>% group_by(Product, Price_Each) %>% summarise(Quantity = sum(Quantity_Ordered), Mean_Price = (mean(Price_Each))) ## `summarise()` has grouped output by 'Product'. You can override using the `.groups` argument.
new## # A tibble: 19 x 4
## # Groups: Product [19]
## Product Price_Each Quantity Mean_Price
## <chr> <dbl> <dbl> <dbl>
## 1 20in Monitor 110. 4129 110.
## 2 27in 4K Gaming Monitor 390. 6244 390.
## 3 27in FHD Monitor 150. 7550 150.
## 4 34in Ultrawide Monitor 380. 6199 380.
## 5 AA Batteries (4-pack) 3.84 27635 3.84
## 6 AAA Batteries (4-pack) 2.99 31016 2.99
## 7 Apple Airpods Headphones 150 15660 150
## 8 Bose SoundSport Headphones 100. 13457 100.
## 9 Flatscreen TV 300 4819 300
## 10 Google Phone 600 5532 600
## 11 iPhone 700 6849 700
## 12 LG Dryer 600 646 600
## 13 LG Washing Machine 600 666 600
## 14 Lightning Charging Cable 15.0 23217 15.0
## 15 Macbook Pro Laptop 1700 4728 1700
## 16 ThinkPad Laptop 1000. 4130 1000.
## 17 USB-C Charging Cable 12.0 23975 12.0
## 18 Vareebadd Phone 400 2067 400
## 19 Wired Headphones 12.0 20557 12.0
#visualization
all_sales %>% group_by(Product) %>% summarise(Quantity = sum(Quantity_Ordered)) %>%
ggplot(all_sales, mapping = aes(x=Product, y=Quantity)) +
geom_col () +
#geom_line() +
theme_classic() + #make the plot look nice
#scale_y_continuous(name = "Product", limits = c(0, 100))
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #make the x label vertical
labs(title = "Product sold the most", x = "Products", y = "Quantity Ordered") #labelsp <- ggplot(new, aes(x = Product))
p <- p + geom_col(aes(y = Quantity))
p <- p + geom_point(aes(y = Price_Each), col="red")
p <- p + scale_y_continuous(sec.axis = sec_axis(~. *0.03, name = "Price [$]"))
p <- p + labs(
title = "Product sold the most",
x = "Products",
y = "Quantity Ordered"
)
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
#p <- p + theme_classic()
p