install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library("readr")
Store_Transactions <- read.csv("Store_Transactions.csv", header = TRUE, sep = ',')
install.packages("janitor")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library("janitor")
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library("dplyr")
install.packages("skimr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library("skimr")
skim_without_charts(Store_Transactions)
| Name | Store_Transactions |
| Number of rows | 29 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| date | 0 | 1 | 15 | 15 | 0 | 24 | 0 |
| product | 0 | 1 | 0 | 8 | 2 | 11 | 0 |
| product_code | 0 | 1 | 8 | 8 | 0 | 12 | 0 |
| product_color | 0 | 1 | 4 | 6 | 0 | 9 | 0 |
| revenue | 0 | 1 | 7 | 10 | 0 | 15 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| transaction_id | 0 | 1 | 27283.28 | 15388.50 | 1675.00 | 12560.00 | 24785.00 | 44700 | 49430 |
| customer_id | 0 | 1 | 5456.66 | 3077.70 | 335.00 | 2512.00 | 4957.00 | 8940 | 9886 |
| product_price | 0 | 1 | 413.39 | 429.06 | 9.99 | 58.89 | 169.95 | 1000 | 1000 |
| purchase_size | 0 | 1 | 1.45 | 0.91 | 1.00 | 1.00 | 1.00 | 2 | 5 |
| purchase_price | 0 | 1 | 434.64 | 414.52 | 13.99 | 89.85 | 234.50 | 1000 | 1000 |
str(Store_Transactions)
## 'data.frame': 29 obs. of 10 variables:
## $ date : chr "29/08/2020 0:00" "01/05/2020 0:00" "12/12/2020 0:00" "16/02/2020 0:00" ...
## $ transaction_id: int 9900 12315 9890 46915 44700 44700 12560 9640 22620 49430 ...
## $ customer_id : int 1980 2463 1978 9383 8940 8940 2512 1928 4524 9886 ...
## $ product : chr "fan" "fan" "fan" "fan" ...
## $ product_code : chr "SKU83503" "SKU83503" "SKU83503" "SKU83503" ...
## $ product_color : chr "brass" "brass" "white" "black" ...
## $ product_price : num 14 14 14 14 14 ...
## $ purchase_size : int 2 2 1 1 2 5 1 1 1 1 ...
## $ purchase_price: num 28 28 14 14 28 ...
## $ revenue : chr "$27.98 " "$27.98 " "$13.99 " "$13.99 " ...
glimpse(Store_Transactions)
## Rows: 29
## Columns: 10
## $ date <chr> "29/08/2020 0:00", "01/05/2020 0:00", "12/12/2020 0:00"…
## $ transaction_id <int> 9900, 12315, 9890, 46915, 44700, 44700, 12560, 9640, 22…
## $ customer_id <int> 1980, 2463, 1978, 9383, 8940, 8940, 2512, 1928, 4524, 9…
## $ product <chr> "fan", "fan", "fan", "fan", "fan", "lamp", "bed", "couc…
## $ product_code <chr> "SKU83503", "SKU83503", "SKU83503", "SKU83503", "SKU835…
## $ product_color <chr> "brass", "brass", "white", "black", "brass", "brass", "…
## $ product_price <dbl> 13.99, 13.99, 13.99, 13.99, 13.99, 45.99, 799.99, 1000.…
## $ purchase_size <int> 2, 2, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1…
## $ purchase_price <dbl> 27.980, 27.980, 13.990, 13.990, 27.980, 160.965, 799.99…
## $ revenue <chr> "$27.98 ", "$27.98 ", "$13.99 ", "$13.99 ", "$27.98 ", …
colnames(Store_Transactions)
## [1] "date" "transaction_id" "customer_id" "product"
## [5] "product_code" "product_color" "product_price" "purchase_size"
## [9] "purchase_price" "revenue"
head(Store_Transactions)
## date transaction_id customer_id product product_code product_color
## 1 29/08/2020 0:00 9900 1980 fan SKU83503 brass
## 2 01/05/2020 0:00 12315 2463 fan SKU83503 brass
## 3 12/12/2020 0:00 9890 1978 fan SKU83503 white
## 4 16/02/2020 0:00 46915 9383 fan SKU83503 black
## 5 28/12/2020 0:00 44700 8940 fan SKU83503 brass
## 6 28/12/2020 0:00 44700 8940 lamp SKU95363 brass
## product_price purchase_size purchase_price revenue
## 1 13.99 2 27.980 $27.98
## 2 13.99 2 27.980 $27.98
## 3 13.99 1 13.990 $13.99
## 4 13.99 1 13.990 $13.99
## 5 13.99 2 27.980 $27.98
## 6 45.99 5 160.965 $229.95
Store_Transactions <- Store_Transactions %>%
rename(product_name=product) %>%
rename(units_purchased=purchase_size)
Store_Transactions <- rename_with(Store_Transactions, toupper)
head(Store_Transactions)
## DATE TRANSACTION_ID CUSTOMER_ID PRODUCT_NAME PRODUCT_CODE
## 1 29/08/2020 0:00 9900 1980 fan SKU83503
## 2 01/05/2020 0:00 12315 2463 fan SKU83503
## 3 12/12/2020 0:00 9890 1978 fan SKU83503
## 4 16/02/2020 0:00 46915 9383 fan SKU83503
## 5 28/12/2020 0:00 44700 8940 fan SKU83503
## 6 28/12/2020 0:00 44700 8940 lamp SKU95363
## PRODUCT_COLOR PRODUCT_PRICE UNITS_PURCHASED PURCHASE_PRICE REVENUE
## 1 brass 13.99 2 27.980 $27.98
## 2 brass 13.99 2 27.980 $27.98
## 3 white 13.99 1 13.990 $13.99
## 4 black 13.99 1 13.990 $13.99
## 5 brass 13.99 2 27.980 $27.98
## 6 brass 45.99 5 160.965 $229.95
class(Store_Transactions$DATE)
## [1] "character"
Store_Transactions$DATE <- ymd(Store_Transactions$DATE)
class(Store_Transactions$DATE)
## [1] "Date"
Store_Transaction <- Store_Transactions[!is.na(Store_Transactions$PRODUCT_NAME), ]
Store_Transaction <- Store_Transactions[-c(28,29),]
Store_Transaction <- Store_Transaction %>% mutate(Store_Transaction, NEW_REVENUE= PRODUCT_PRICE*UNITS_PURCHASED)
Store_Transaction <- Store_Transaction %>% select(-DATE,-PRODUCT_CODE,-PURCHASE_PRICE)
head(Store_Transaction)
## TRANSACTION_ID CUSTOMER_ID PRODUCT_NAME PRODUCT_COLOR PRODUCT_PRICE
## 1 9900 1980 fan brass 13.99
## 2 12315 2463 fan brass 13.99
## 3 9890 1978 fan white 13.99
## 4 46915 9383 fan black 13.99
## 5 44700 8940 fan brass 13.99
## 6 44700 8940 lamp brass 45.99
## UNITS_PURCHASED REVENUE NEW_REVENUE
## 1 2 $27.98 27.98
## 2 2 $27.98 27.98
## 3 1 $13.99 13.99
## 4 1 $13.99 13.99
## 5 2 $27.98 27.98
## 6 5 $229.95 229.95
# Grouping and summarizing in order to find total Revenue generated from each product
Products_vs_Revenue <- Store_Transaction %>% group_by(PRODUCT_NAME) %>%
summarize(Total_revenue_of_each_product = sum(NEW_REVENUE))
head(Products_vs_Revenue)
## # A tibble: 6 × 2
## PRODUCT_NAME Total_revenue_of_each_product
## <chr> <dbl>
## 1 bed 800.
## 2 bookcase 58.9
## 3 chair 234.
## 4 couch 9000
## 5 desk 510.
## 6 fan 112.
# Grouping and summarizing in order to find how many units of each product were sold.
Products_vs_units <- Store_Transaction %>% group_by(PRODUCT_NAME) %>%
summarize(Total_units_sold_of_each_product = sum(UNITS_PURCHASED))
head(Products_vs_units)
## # A tibble: 6 × 2
## PRODUCT_NAME Total_units_sold_of_each_product
## <chr> <int>
## 1 bed 1
## 2 bookcase 1
## 3 chair 1
## 4 couch 9
## 5 desk 3
## 6 fan 8
# Grouping and summarizing in order to find total revenue generated from each customer
Customer_vs_revenue <- Store_Transaction %>% group_by(CUSTOMER_ID) %>%
summarize(Total_revenue_by_each_customer = sum(NEW_REVENUE))
head(Customer_vs_revenue)
## # A tibble: 6 × 2
## CUSTOMER_ID Total_revenue_by_each_customer
## <int> <dbl>
## 1 335 1000
## 2 1268 170.
## 3 1928 1000
## 4 1978 14.0
## 5 1980 1028.
## 6 2463 28.0
# Grouping and summarizing in order to find total units bought by each customer
Customer_vs_units_purchased <- Store_Transaction %>% group_by(CUSTOMER_ID) %>%
summarize(Total_units_bought_by_each_customer = sum(UNITS_PURCHASED))
head(Customer_vs_units_purchased)
## # A tibble: 6 × 2
## CUSTOMER_ID Total_units_bought_by_each_customer
## <int> <int>
## 1 335 1
## 2 1268 1
## 3 1928 1
## 4 1978 1
## 5 1980 3
## 6 2463 2
# Filtering to pull out products named "FAN"
PRODUCT_FAN <- Store_Transaction %>% filter(PRODUCT_NAME=='fan')
# Creating a new column by uniting 2 columns.
PRODUCT_FAN <- unite(PRODUCT_FAN,'PRODUCT_NAME_and_COLOR', PRODUCT_NAME,PRODUCT_COLOR, sep = ' ')
# Grouping and summarizing in order to find revenue of product generated by each of its colour variations
PRODUCT_FAN <- PRODUCT_FAN %>% group_by(PRODUCT_NAME_and_COLOR) %>%
summarize(Total_revenue_by_each_color = sum(NEW_REVENUE))
head(PRODUCT_FAN)
## # A tibble: 3 × 2
## PRODUCT_NAME_and_COLOR Total_revenue_by_each_color
## <chr> <dbl>
## 1 fan black 14.0
## 2 fan brass 83.9
## 3 fan white 14.0
# Filtering to pull out products named "COUCH"
PRODUCT_COUCH <- Store_Transaction %>% filter(PRODUCT_NAME=='couch')
# Creating a new column by uniting 2 columns.
PRODUCT_COUCH <- unite(PRODUCT_COUCH,'PRODUCT_NAME_and_COLOR', PRODUCT_NAME,PRODUCT_COLOR, sep = ' ')
# Grouping and summarizing in order to find revenue of product generated by each of its colour variations
PRODUCT_COUCH <- PRODUCT_COUCH %>% group_by(PRODUCT_NAME_and_COLOR) %>%
summarize(Total_revenue_by_each_color = sum(NEW_REVENUE))
head(PRODUCT_COUCH)
## # A tibble: 6 × 2
## PRODUCT_NAME_and_COLOR Total_revenue_by_each_color
## <chr> <dbl>
## 1 couch black 1000
## 2 couch blue 1000
## 3 couch brown 1000
## 4 couch grey 3000
## 5 couch purple 1000
## 6 couch white 2000
# Filtering to pull out products named "RUG"
PRODUCT_RUG <- Store_Transaction %>% filter(PRODUCT_NAME=='rug')
# Creating a new column by uniting 2 columns.
PRODUCT_RUG <- unite(PRODUCT_RUG,'PRODUCT_NAME_and_COLOR', PRODUCT_NAME,PRODUCT_COLOR, sep = ' ')
# Grouping and summarizing in order to find revenue of product generated by each of its colour variations
PRODUCT_RUG <- PRODUCT_RUG %>% group_by(PRODUCT_NAME_and_COLOR) %>%
summarize(Total_revenue_by_each_color = sum(NEW_REVENUE))
head(PRODUCT_RUG)
## # A tibble: 2 × 2
## PRODUCT_NAME_and_COLOR Total_revenue_by_each_color
## <chr> <dbl>
## 1 rug beige 539.
## 2 rug grey 270.
# Filtering to pull out products named "DESK"
PRODUCT_DESK <- Store_Transaction %>% filter(PRODUCT_NAME=='desk')
# Creating a new column by uniting 2 columns.
PRODUCT_DESK <- unite(PRODUCT_DESK,'PRODUCT_NAME_and_COLOR', PRODUCT_NAME,PRODUCT_COLOR, sep = ' ')
# Grouping and summarizing in order to find revenue of product generated by each of its colour variations
PRODUCT_DESK <- PRODUCT_DESK %>% group_by(PRODUCT_NAME_and_COLOR) %>%
summarize(Total_revenue_by_each_color = sum(NEW_REVENUE))
head(PRODUCT_DESK)
## # A tibble: 2 × 2
## PRODUCT_NAME_and_COLOR Total_revenue_by_each_color
## <chr> <dbl>
## 1 desk brown 340.
## 2 desk white 170.