library(odbc)
library(DBI)
library(RSQLite)
library(tidyverse)
library(lubridate) # date-time helpers used below: mdy_hm(), year(), wday(), hour(), ...
options(scipen = 999) # avoid scientific notation in printed output
sales <- readr::read_csv("sales.csv")
names(sales)
## [1] "Order ID" "Product" "Quantity Ordered" "Price Each"
## [5] "Order Date" "Purchase Address"
sapply(sales[1,],class)
## Order ID Product Quantity Ordered Price Each
## "character" "character" "character" "character"
## Order Date Purchase Address
## "character" "character"
colSums(is.na(sales))
## Order ID Product Quantity Ordered Price Each
## 545 545 545 545
## Order Date Purchase Address
## 545 545
Let's try to find out where the missing values are coming from; we will check which row numbers contain them.
sales |>
  mutate(id = row_number()) |>
  relocate(id) |>
  filter(is.na(`Price Each`))
Now that we have seen that the missing values fall on the same rows across all columns, we can remove those rows.
sales <- sales |>
  filter(!is.na(`Order ID`))
glimpse(sales)
## Rows: 186,305
## Columns: 6
## $ `Order ID` <chr> "295665", "295666", "295667", "295668", "295669", "~
## $ Product <chr> "Macbook Pro Laptop", "LG Washing Machine", "USB-C ~
## $ `Quantity Ordered` <chr> "1", "1", "1", "1", "1", "1", "1", "2", "1", "4", "~
## $ `Price Each` <chr> "1700", "600.0", "11.95", "149.99", "11.95", "3.84"~
## $ `Order Date` <chr> "12/30/19 00:01", "12/29/19 07:03", "12/12/19 18:21~
## $ `Purchase Address` <chr> "136 Church St, New York City, NY 10001", "562 2nd ~
Why are the `Quantity Ordered` and `Price Each` variables characters? What can we see from the output of `glimpse()`?
sales |>
  mutate(id = row_number()) |>
  relocate(id) |>
  filter(grepl("Price", `Price Each`, perl = TRUE))
It looks like the raw file repeats its header row throughout the data: these observations simply contain the column names, which forced every column to be read as character. Let us drop these rows and rename some columns.
sales <- sales |>
  filter(!grepl("Price", `Price Each`, perl = TRUE)) |> # drop repeated header rows
  rename(quantity = `Quantity Ordered`) |>
  rename(address = `Purchase Address`) |>
  rename(Price = `Price Each`)
dim(sales)
## [1] 185950 6
We now have 185,950 rows in our data,
sales |>
  summarise(unique_rows = n_distinct(`Order ID`))
but only 178,438 unique order IDs; this is because a single order can contain several different products.
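As a quick sanity check (a sketch, not part of the original output), we can list the order IDs that appear on more than one row, i.e. the multi-product orders:

# sketch: order IDs occurring on several rows (orders with several products)
sales |>
  count(`Order ID`, sort = TRUE) |>
  filter(n > 1)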
sales |>
  janitor::tabyl(quantity) |>
  as_tibble() |>
  mutate(percent = round(percent, 4))
Let's look at the classes again:
sapply(sales[1,],class)
## Order ID Product quantity Price Order Date address
## "character" "character" "character" "character" "character" "character"
Let's begin by using R to achieve this, and later on do the same with SQL!
sales |>
  relocate(address)
One should note that this column contains useful information such as the street, city and ZIP code. We need to extract the city and create a column for it!
We will use separate() to achieve this!

(sales_df <- sales |>
  separate(address, into = c("address1", "city", "code"), sep = ",") |>
  mutate_at(c(3, 4), as.numeric) |>      # quantity and Price become numeric
  mutate_if(is.character, as.factor) |>
  relocate(c("address1", "city", "code")))
Nice, now we have the variable city standing on its own!
sales_df <- sales_df |>
  mutate(Date = mdy_hm(`Order Date`),
         Date1 = as.character(Date)) # SQLite has no native date-time type, so keep a character copy for strftime()
sold <- dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(sold, sales_df)
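The SQL queries below run against this connection. The same statements can also be executed from plain R with DBI::dbGetQuery(), for example:

dbGetQuery(sold, "SELECT COUNT(*) AS n FROM sales_df")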
sales_df |>
  summarise(n_unique = n_distinct(city))
SELECT COUNT(DISTINCT city) AS n_unique
FROM sales_df;
| n_unique |
|----------|
| 9        |
SELECT DISTINCT city
FROM sales_df;
| city          |
|---------------|
| New York City |
| San Francisco |
| Atlanta       |
| Portland      |
| Dallas        |
| Los Angeles   |
| Boston        |
| Austin        |
| Seattle       |
sales_df |>
  group_by(city) |>
  dplyr::summarise(sum_quantity = sum(quantity, na.rm = TRUE)) |>
  arrange(desc(sum_quantity))
San Francisco had the highest quantity sold compared to the other cities.
SELECT city, SUM(quantity) AS sum_quantity
FROM sales_df
GROUP BY city
ORDER BY sum_quantity DESC;
| city          | sum_quantity |
|---------------|--------------|
| San Francisco | 50239        |
| Los Angeles   | 33289        |
| New York City | 27932        |
| Boston        | 22528        |
| Dallas        | 16730        |
| Atlanta       | 16602        |
| Seattle       | 16553        |
| Portland      | 14053        |
| Austin        | 11153        |
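For a quick visual comparison, the same summary can be plotted as a bar chart (a sketch; the styling is just one option):

sales_df |>
  group_by(city) |>
  dplyr::summarise(sum_quantity = sum(quantity, na.rm = TRUE)) |>
  ggplot(aes(x = reorder(city, sum_quantity), y = sum_quantity)) +
  geom_col() +
  coord_flip() + # horizontal bars keep the city labels readable
  labs(x = "city", y = "total quantity sold")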
sales_df |>
  group_by(city) |>
  dplyr::summarise(mean_sale = mean(Price, na.rm = TRUE)) |>
  arrange(desc(mean_sale))
Atlanta had the highest average sale price compared to the other cities, although the averages are quite close across all nine cities.
SELECT city, AVG(Price) AS mean_price
FROM sales_df
GROUP BY city
ORDER BY mean_price DESC;
| city          | mean_price |
|---------------|------------|
| Atlanta       | 186.8092   |
| New York City | 186.3391   |
| Dallas        | 185.7374   |
| Seattle       | 185.5346   |
| Portland      | 185.1382   |
| San Francisco | 183.5702   |
| Los Angeles   | 183.1257   |
| Austin        | 182.7232   |
| Boston        | 182.4726   |
sales_df |>
  group_by(Product) |>
  dplyr::summarise(num_quantity = sum(quantity, na.rm = TRUE)) |>
  arrange(desc(num_quantity)) |>
  head(n = 10)
AAA Batteries (4-pack) had the highest quantity sold.
SELECT Product, SUM(quantity) AS num_quantity
FROM sales_df
GROUP BY Product
ORDER BY num_quantity DESC
LIMIT 10;
| Product                    | num_quantity |
|----------------------------|--------------|
| AAA Batteries (4-pack)     | 31017        |
| AA Batteries (4-pack)      | 27635        |
| USB-C Charging Cable       | 23975        |
| Lightning Charging Cable   | 23217        |
| Wired Headphones           | 20557        |
| Apple Airpods Headphones   | 15661        |
| Bose SoundSport Headphones | 13457        |
| 27in FHD Monitor           | 7550         |
| iPhone                     | 6849         |
| 27in 4K Gaming Monitor     | 6244         |
sales_df |>
  group_by(Product) |>
  dplyr::summarise(sum_price = sum(Price * quantity)) |>
  arrange(desc(sum_price)) |>
  head(n = 10)
SELECT Product, SUM(Price*quantity) AS sum_price
FROM sales_df
GROUP BY Product
ORDER BY sum_price DESC
LIMIT 10;
| Product                    | sum_price |
|----------------------------|-----------|
| Macbook Pro Laptop         | 8037600   |
| iPhone                     | 4794300   |
| ThinkPad Laptop            | 4129959   |
| Google Phone               | 3319200   |
| 27in 4K Gaming Monitor     | 2435098   |
| 34in Ultrawide Monitor     | 2355558   |
| Apple Airpods Headphones   | 2349150   |
| Flatscreen TV              | 1445700   |
| Bose SoundSport Headphones | 1345565   |
| 27in FHD Monitor           | 1132424   |
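Reading the two product tables together: Macbook Pro Laptop generated the most revenue even though batteries and charging cables sold the most units, because revenue weights each unit sold by its price, which favours the expensive products.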
sales_df |>
  relocate(`Order Date`)
The date column contains a wealth of information, so we will extract the year, month, day and hour using the lubridate package.
sales_df <- sales_df |>
  mutate(Date = mdy_hm(`Order Date`),        # parse to date-time
         year = year(Date),                  # extract year
         Month = month(Date, label = TRUE),  # extract month (labelled)
         Day = wday(Date, label = TRUE),     # extract weekday (labelled)
         hour = hour(Date),                  # extract hour
         quarter = quarter(Date),            # extract quarter
         am_pm = pm(Date),                   # TRUE if the time falls in the afternoon/evening
         Date1 = as.character(Date))         # character copy for SQLite
sales_df
sales_df |>
  filter(year == 2019) |>
  group_by(Month) |>
  dplyr::summarise(sum_price = sum(quantity * Price)) |>
  arrange(desc(sum_price))
December had the greatest amount of sales in 2019.
SELECT strftime('%m', Date1) AS month,
       strftime('%Y', Date1) AS year,
       SUM(quantity*Price) AS total_price
FROM sales_df
GROUP BY month
HAVING strftime('%Y',Date1) = '2019'
ORDER BY total_price DESC;
| month | year | total_price |
|-------|------|-------------|
| 12    | 2019 | 4613443     |
| 10    | 2019 | 3736727     |
| 04    | 2019 | 3390670     |
| 11    | 2019 | 3199603     |
| 05    | 2019 | 3152607     |
| 03    | 2019 | 2807100     |
| 07    | 2019 | 2647776     |
| 06    | 2019 | 2577802     |
| 08    | 2019 | 2244468     |
| 02    | 2019 | 2202022     |
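A small note on the SQL: the year filter is applied with HAVING on a bare, non-aggregated column, which SQLite tolerates (it evaluates the condition on an arbitrary row of each group); the more conventional form would be a `WHERE strftime('%Y', Date1) = '2019'` clause before the GROUP BY. Also, only the first 10 rows of the result are displayed here.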
sales_df |>
  filter(year == 2019) |>
  group_by(hour) |>
  dplyr::summarise(sum_price = sum(Price * quantity)) |>
  arrange(desc(sum_price))
The hour column takes the 24 values 0-23, in increasing order of time from midnight through late evening.
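A quick sketch to inspect that coding directly:

range(sales_df$hour, na.rm = TRUE) # hour() returns values in 0-23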
sales_df |>
  filter(year == 2019) |>
  group_by(hour) |>
  dplyr::summarise(sum_price = sum(Price * quantity)) |>
  arrange(desc(sum_price)) |>
  ggplot(aes(x = hour, y = sum_price)) +
  geom_point() +
  geom_line() +
  tvthemes::theme_simpsons()
We notice a huge peak between 10 am and 1 pm, and another between 6 pm and 8 pm.
SELECT strftime('%Y', Date1) AS year,
       strftime('%H', Date1) AS hour,
       SUM(quantity*Price) AS total_price
FROM sales_df
GROUP BY hour
HAVING strftime('%Y',Date1) = '2019'
ORDER BY total_price DESC;
| year | hour | total_price |
|------|------|-------------|
| 2019 | 19   | 2412939     |
| 2019 | 12   | 2316821     |
| 2019 | 11   | 2300610     |
| 2019 | 20   | 2281716     |
| 2019 | 18   | 2219348     |
| 2019 | 13   | 2155390     |
| 2019 | 17   | 2129362     |
| 2019 | 14   | 2083673     |
| 2019 | 21   | 2042001     |
| 2019 | 10   | 1944287     |
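Once we are done querying, we can close the in-memory database connection:

dbDisconnect(sold)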