library(odbc)
library(DBI)
library(RSQLite)
library(tidyverse)
library(lubridate) # used below for mdy_hm(), month(), wday(), hour()
options(scipen = 999)

sales <- readr::read_csv("sales.csv")
names(sales)
## [1] "Order ID"         "Product"          "Quantity Ordered" "Price Each"
## [5] "Order Date"       "Purchase Address"
sapply(sales[1,], class)
##         Order ID          Product Quantity Ordered       Price Each
##      "character"      "character"      "character"      "character"
##       Order Date Purchase Address
##      "character"      "character"
colSums(is.na(sales))
##         Order ID          Product Quantity Ordered       Price Each
##              545              545              545              545
##       Order Date Purchase Address
##              545              545
Let's try to find out where the missing values are coming from by checking which row numbers contain them.
sales |>
mutate(id=row_number()) |>
relocate(id) |>
  filter(is.na(`Price Each`))

Now that we have seen that the missing values fall on the same rows across all columns, we can remove those rows.
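To double-check that the missingness really is row-aligned, here is a quick sketch (`if_any()` and `if_all()` are from dplyr); if the two counts match, the NAs always occur together:

sales |>
  summarise(
    any_na = sum(if_any(everything(), is.na)),  # rows with at least one NA
    all_na = sum(if_all(everything(), is.na))   # rows where every column is NA
  )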
sales<-sales |>
  filter(!is.na(`Order ID`))

glimpse(sales)
## Rows: 186,305
## Columns: 6
## $ `Order ID` <chr> "295665", "295666", "295667", "295668", "295669", "~
## $ Product <chr> "Macbook Pro Laptop", "LG Washing Machine", "USB-C ~
## $ `Quantity Ordered` <chr> "1", "1", "1", "1", "1", "1", "1", "2", "1", "4", "~
## $ `Price Each` <chr> "1700", "600.0", "11.95", "149.99", "11.95", "3.84"~
## $ `Order Date` <chr> "12/30/19 00:01", "12/29/19 07:03", "12/12/19 18:21~
## $ `Purchase Address` <chr> "136 Church St, New York City, NY 10001", "562 2nd ~
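As an aside, because the NAs are row-aligned, the same cleanup could be written as a one-liner with `tidyr::drop_na()` (a sketch, equivalent here only for that reason):

# drop every row that contains at least one NA
sales |>
  tidyr::drop_na()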
Why are the `Price Each` and `Quantity Ordered` variables characters? From the output of `glimpse()`, what can we see?
sales |>
mutate(id=row_number()) |>
relocate(id) |>
filter(grepl("Price",`Price Each`,perl = T))Looks like there has been some typo in the data as all the column observations contain the
column nameso let us drop these rows and rename certain columns
sales<-sales |>
filter(!grepl("Price",`Price Each`,perl = T)) |> #drop unwanted rows
rename(quantity=`Quantity Ordered`) |>
rename(address = `Purchase Address`) |>
  rename(Price = `Price Each`)

dim(sales)
## [1] 185950      6
We now have 185,950 rows in our data,
sales |>
  summarise(unique_rows=n_distinct(`Order ID`))

but only 178,438 unique order IDs. This is because a single order can contain several different products, each recorded on its own row.
sales |>
janitor::tabyl(quantity) |>
as_tibble() |>
  mutate(percent=round(percent,4))

Let's look at the classes again:
sapply(sales[1,], class)
##    Order ID     Product    quantity       Price  Order Date     address
## "character" "character" "character" "character" "character" "character"
Let's begin by doing this in R, and later on reproduce it with SQL.
sales |>
  relocate(address)

One should note that this column packs together useful information: the street, the city and the zip code. We need to extract the city and give it a column of its own, and `separate()` achieves exactly this:

(sales_df<-sales |>
  separate(address, into=c("address1","city","code"), sep=",") |>
  mutate_at(c(3,4), as.numeric) |>       # quantity and Price to numeric
  mutate_if(is.character, as.factor) |>
  relocate(c("address1","city","code")))

Nice, now the variable `city` stands on its own!
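`separate()` has since been superseded in tidyr; an equivalent sketch with `separate_wider_delim()` (tidyr >= 1.3.0), splitting on ", " so the new columns carry no leading whitespace:

sales |>
  tidyr::separate_wider_delim(address, delim = ", ",
                              names = c("address1", "city", "code"))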
sales_df<-sales_df |>
  mutate(Date=mdy_hm(`Order Date`),   # parse "12/30/19 00:01"-style timestamps
         Date1=as.character(Date))    # text copy: SQLite has no datetime type, so strftime() can read this column
sold <- dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(sold, sales_df)

sales_df |>
  summarise(n_unique=n_distinct(city))

SELECT COUNT(DISTINCT city) AS n_unique
FROM sales_df;

| n_unique |
|---|
| 9 |
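For reference, each of these SQL chunks can also be run straight from R through the DBI connection created above:

# send the query through DBI and get a data frame back
dbGetQuery(sold, "SELECT COUNT(DISTINCT city) AS n_unique FROM sales_df;")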
SELECT DISTINCT city AS n_unique
FROM sales_df;

| n_unique |
|---|
| New York City |
| San Francisco |
| Atlanta |
| Portland |
| Dallas |
| Los Angeles |
| Boston |
| Austin |
| Seattle |
sales_df |>
group_by(city) |>
dplyr::summarise(sum_quantity=sum(quantity,na.rm=T)) |>
arrange(desc(sum_quantity))
San Francisco had the highest total quantity sold compared to the other cities.
SELECT city ,SUM(quantity) AS sum_quantity
FROM sales_df
GROUP BY city
ORDER BY sum_quantity DESC;

| city | sum_quantity |
|---|---|
| San Francisco | 50239 |
| Los Angeles | 33289 |
| New York City | 27932 |
| Boston | 22528 |
| Dallas | 16730 |
| Atlanta | 16602 |
| Seattle | 16553 |
| Portland | 14053 |
| Austin | 11153 |
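The same ranking is easier to take in as a bar chart (a quick sketch):

# total quantity sold per city, highest at the top
sales_df |>
  group_by(city) |>
  summarise(sum_quantity = sum(quantity, na.rm = TRUE)) |>
  ggplot(aes(x = reorder(city, sum_quantity), y = sum_quantity)) +
  geom_col() +
  coord_flip() +
  labs(x = "city", y = "total quantity sold")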
sales_df |>
group_by(city) |>
dplyr::summarise(mean_sale=mean(Price,na.rm=T)) |>
arrange(desc(mean_sale))
Atlanta had the highest average price per item compared to the other cities.
SELECT city ,AVG(Price) AS mean_price
FROM sales_df
GROUP BY city
ORDER BY mean_price DESC;

| city | mean_price |
|---|---|
| Atlanta | 186.8092 |
| New York City | 186.3391 |
| Dallas | 185.7374 |
| Seattle | 185.5346 |
| Portland | 185.1382 |
| San Francisco | 183.5702 |
| Los Angeles | 183.1257 |
| Austin | 182.7232 |
| Boston | 182.4726 |
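Note that this is the average unit price per order line, not the average revenue. If we instead wanted the average amount spent per line, a sketch would be:

# average revenue per order line (unit price times quantity)
sales_df |>
  group_by(city) |>
  summarise(mean_line_revenue = mean(Price * quantity, na.rm = TRUE)) |>
  arrange(desc(mean_line_revenue))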
sales_df |>
group_by(Product) |>
dplyr::summarise(num_quantity=sum(quantity,na.rm=T)) |>
arrange(desc(num_quantity)) |>
head(n=10)
AAA Batteries (4-pack) was the product with the highest quantity sold.
SELECT Product ,SUM(quantity) AS num_quantity
FROM sales_df
GROUP BY Product
ORDER BY num_quantity DESC
LIMIT 10;

| Product | num_quantity |
|---|---|
| AAA Batteries (4-pack) | 31017 |
| AA Batteries (4-pack) | 27635 |
| USB-C Charging Cable | 23975 |
| Lightning Charging Cable | 23217 |
| Wired Headphones | 20557 |
| Apple Airpods Headphones | 15661 |
| Bose SoundSport Headphones | 13457 |
| 27in FHD Monitor | 7550 |
| iPhone | 6849 |
| 27in 4K Gaming Monitor | 6244 |
Which products generate the most revenue?

sales_df |>
group_by(Product) |>
dplyr::summarise(sum_price=sum(Price*quantity)) |>
arrange(desc(sum_price)) |>
  head(n=10)

SELECT Product, SUM(Price*quantity) AS sum_price
FROM sales_df
GROUP BY Product
ORDER BY sum_price DESC
LIMIT 10;

| Product | sum_price |
|---|---|
| Macbook Pro Laptop | 8037600 |
| iPhone | 4794300 |
| ThinkPad Laptop | 4129959 |
| Google Phone | 3319200 |
| 27in 4K Gaming Monitor | 2435098 |
| 34in Ultrawide Monitor | 2355558 |
| Apple Airpods Headphones | 2349150 |
| Flatscreen TV | 1445700 |
| Bose SoundSport Headphones | 1345565 |
| 27in FHD Monitor | 1132424 |
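Interesting: cheap accessories dominate by quantity, while laptops and phones dominate by revenue. A sketch that puts the two rankings side by side (the rank column names are mine):

# compare each product's quantity rank against its revenue rank
sales_df |>
  group_by(Product) |>
  summarise(num_quantity = sum(quantity, na.rm = TRUE),
            sum_price    = sum(Price * quantity, na.rm = TRUE)) |>
  mutate(quantity_rank = min_rank(desc(num_quantity)),
         revenue_rank  = min_rank(desc(sum_price))) |>
  arrange(revenue_rank)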
sales_df |>
  relocate(`Order Date`)

The date column contains a wealth of information, so we will extract the year, month, weekday and hour using the lubridate package.
sales_df<-sales_df |>
mutate(Date=mdy_hm(`Order Date`), # change to date
year=year(Date), # extract year
Month=month(Date,label=T), # extract month
Day=wday(Date,label=T), # extract weekday
hour=hour(Date), # extract hour
quarter=quarter(Date), # extract quarter
         am_pm=pm(Date), # TRUE when the time falls in the afternoon/evening
Date1=as.character(Date))
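To see what these lubridate helpers return, here is a quick sketch on a single timestamp:

x <- mdy_hm("12/30/19 19:01")
year(x)                 # 2019
month(x, label = TRUE)  # Dec
wday(x, label = TRUE)   # Mon
hour(x)                 # 19
quarter(x)              # 4
pm(x)                   # TRUE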
sales_df

sales_df |>
filter(year==2019) |>
group_by(Month) |>
dplyr::summarise(sum_price=sum(quantity*Price)) |>
  arrange(desc(sum_price))

December had the greatest amount of sales of all the months.
SELECT strftime('%m',Date1) AS month,
strftime('%Y',Date1) AS year,
SUM(quantity*Price) AS total_price
FROM sales_df
GROUP BY month
HAVING strftime('%Y',Date1) = '2019'
ORDER BY total_price DESC;
| month | year | total_price |
|---|---|---|
| 12 | 2019 | 4613443 |
| 10 | 2019 | 3736727 |
| 04 | 2019 | 3390670 |
| 11 | 2019 | 3199603 |
| 05 | 2019 | 3152607 |
| 03 | 2019 | 2807100 |
| 07 | 2019 | 2647776 |
| 06 | 2019 | 2577802 |
| 08 | 2019 | 2244468 |
| 02 | 2019 | 2202022 |
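The monthly pattern is also easy to chart (a sketch; `Month` is already an ordered factor, so the bars come out in calendar order):

sales_df |>
  filter(year==2019) |>
  group_by(Month) |>
  summarise(sum_price=sum(quantity*Price)) |>
  ggplot(aes(x = Month, y = sum_price)) +
  geom_col() +
  labs(x = "month", y = "total sales")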
sales_df |>
filter(year==2019) |>
group_by(hour) |>
dplyr::summarise(sum_price=sum(Price*quantity)) |>
  arrange(desc(sum_price))

The hours run from 0 to 23 in ascending order, from just after midnight in the early morning through to late evening.
sales_df |>
filter(year==2019) |>
group_by(hour) |>
dplyr::summarise(sum_price=sum(Price*quantity)) |>
arrange(desc(sum_price)) |>
  ggplot(aes(x=hour,y=sum_price))+
geom_point()+
geom_line()+
  tvthemes::theme_simpsons()

We notice a big peak between 10am and 1pm, and another between 6pm and 8pm.
SELECT strftime('%Y',Date1) AS year,
strftime('%H',Date1) AS hour,
SUM(quantity*Price) AS total_price
FROM sales_df
GROUP BY hour
HAVING strftime('%Y',Date1) = '2019'
ORDER BY total_price DESC;
| year | hour | total_price |
|---|---|---|
| 2019 | 19 | 2412939 |
| 2019 | 12 | 2316821 |
| 2019 | 11 | 2300610 |
| 2019 | 20 | 2281716 |
| 2019 | 18 | 2219348 |
| 2019 | 13 | 2155390 |
| 2019 | 17 | 2129362 |
| 2019 | 14 | 2083673 |
| 2019 | 21 | 2042001 |
| 2019 | 10 | 1944287 |
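Finally, since we opened an in-memory SQLite connection at the start, it is good practice to close it once we are done:

# close the in-memory database connection
dbDisconnect(sold)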