Packages

#install.packages("tidyverse")
#install.packages("lubridate")
#install.packages("scales")
#install.packages("plotly")
library(tidyverse)
library(lubridate)
library(scales)
library(plotly)

Questions we are attempting to answer

Question 1: What month was the best for sales? How much was earned that month?

Question 2: What city did we have the best sales in?

Question 3: What product did we sell the most of?

Question 4: What time can we best utilize advertisements?

Cleaning the data

# Clean data
options(scipen = 999)# Removes scientific notation
setwd("/cloud/project/data")
file_list <- list.files(path="/cloud/project/data")
# Create the data frame and concatenate several months worth of sales data into one
dataset <- data.frame()

for (i in 1:length(file_list)){
  temp_data <- read_csv(file_list[i]) #each file will be read in, specify which columns you need read in to avoid any errors
  temp_data$Class <- sapply(strsplit(gsub(".csv", "", file_list[i]), "_"), function(x){x[2]}) #clean the data as needed, in this case I am creating a new column that indicates which file each row of data came from
  dataset <- rbind(dataset, temp_data) #for each iteration, bind the new data to the building dataset
  
}

# Fixing the data type 
dataset$`Quantity Ordered` <- as.numeric(dataset$`Quantity Ordered`)
dataset$`Price Each` <- as.numeric(dataset$`Price Each`)

# Removing NA values
dataset <- na.omit(dataset)

# Fixing date format for the Order date column
dataset$`Order Date` <- as.POSIXct(dataset$`Order Date`, format="%m/%d/%Y %H:%M")


#Fixing naming conventions

dataset<- rename(dataset, order_id = `Order ID`)
dataset<-rename(dataset, quantity_ordered = `Quantity Ordered`)
dataset<-rename(dataset, price_each = `Price Each`)
dataset<-rename(dataset, order_date = `Order Date`)
dataset<-rename(dataset, purchase_address = `Purchase Address`)
glimpse(dataset)
## Rows: 185,950
## Columns: 7
## $ order_id         <chr> "176558", "176559", "176560", "176560", "176561", "17…
## $ Product          <chr> "USB-C Charging Cable", "Bose SoundSport Headphones",…
## $ quantity_ordered <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ price_each       <dbl> 11.95, 99.99, 600.00, 11.99, 11.99, 11.95, 99.99, 11.…
## $ order_date       <dttm> 19-04-19 08:46:00, 19-04-07 22:30:00, 19-04-12 14:38…
## $ purchase_address <chr> "917 1st St, Dallas, TX 75001", "682 Chestnut St, Bos…
## $ Class            <chr> "April", "April", "April", "April", "April", "April",…

Graphing the Data

Monthly Sales Data

Question 1: What month was the best for sales? How much was earned that month?

# Monthly Sales Data
dataset %>%
  mutate(month = month(order_date, label = TRUE)) %>% # Creating a month column
  group_by(month) %>% # Group data by month
  summarise(monthly_sales = sum(price_each*quantity_ordered)) %>% # Calculate the monthly sales 
  ggplot()+
  aes(x = month, y = monthly_sales, fill = monthly_sales)+
  geom_bar(stat = "identity")+
  labs(title = "Monthly Sales", x= "Months", y = "Sales", fill = "Monthly Sales")+
  scale_y_continuous(labels = dollar_format())

Sales by City

Question 2: What city did we have the best sales in?

#What city sold the most product

dataset %>%
  separate(purchase_address,c('address', 'city','state'),",") %>% #Seperates the data in the purchase address so we can use city information to group by
  group_by(city) %>%
  summarise(sales = sum(price_each*quantity_ordered)) %>% #sums the sales per city
  ggplot()+
  aes(x = reorder(city, -sales), y = sales, fill = sales)+
  geom_bar(stat = "identity")+
  labs(title = "Sales by City", x = "City", y = "Sales", fill = "Sales")+
  scale_y_continuous(labels = dollar_format())+ #changes the format for hte y axis to dollars
  theme(axis.text.x = element_text(angle = 90))

Products and Quantity Ordered Graph

Question 3: What product did we sell the most of?

# What are the most sold products

dataset %>%
  group_by(Product) %>% #group the data by Products
  summarise(sum_of_products=sum(quantity_ordered)) %>% #sums the quantity ordered by products 
  ggplot()+
  aes(x =reorder(Product, -sum_of_products), y= sum_of_products, fill = sum_of_products)+
  geom_bar(stat = "identity")+
  theme(axis.text.x = element_text(angle = 90))+ #Rotates the x axis labels
  labs(title = "Quantity order Per Prouduct", x = "Product", y= "Quantity Ordered", fill = "Quantity Ordered")

Time of day purchases are made

Question 4: What time can we best utilize advertisements?

dataset %>%
  mutate(hour = hour(order_date)) %>% #Create a column that stores the hour that purchases are made
  count(hour) %>% #Counting the amount of times a purchase is made every hour
  ggplot()+
  aes(x= hour, y = n, colour = n)+
  geom_line()+
  labs(title = "Times purchases Were Made", x = "Time (Hour)", y = "Number of Purchases Made", colour = "Number of Purchases Made")