#Load Dataset
retail_sales_dataset <- read.csv("D:/retail_sales_dataset.csv")
Print the structure of your dataset.
str(retail_sales_dataset)
## 'data.frame': 1000 obs. of 9 variables:
## $ Transaction.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2023-11-24" "2023-02-27" "2023-01-13" "2023-05-21" ...
## $ Customer.ID : chr "CUST001" "CUST002" "CUST003" "CUST004" ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ Age : int 34 26 50 37 30 45 46 30 63 52 ...
## $ Product.Category: chr "Beauty" "Clothing" "Electronics" "Clothing" ...
## $ Quantity : int 3 2 1 1 2 1 2 4 2 4 ...
## $ Price.per.Unit : int 50 500 30 500 50 30 25 25 300 50 ...
## $ Total.Amount : int 150 1000 30 500 100 30 50 100 600 200 ...
List the variables in your data set.
names(retail_sales_dataset)
## [1] "Transaction.ID" "Date" "Customer.ID" "Gender"
## [5] "Age" "Product.Category" "Quantity" "Price.per.Unit"
## [9] "Total.Amount"
Print the top 15 rows of your data set.
head(retail_sales_dataset, 15)
## Transaction.ID Date Customer.ID Gender Age Product.Category Quantity
## 1 1 2023-11-24 CUST001 Male 34 Beauty 3
## 2 2 2023-02-27 CUST002 Female 26 Clothing 2
## 3 3 2023-01-13 CUST003 Male 50 Electronics 1
## 4 4 2023-05-21 CUST004 Male 37 Clothing 1
## 5 5 2023-05-06 CUST005 Male 30 Beauty 2
## 6 6 2023-04-25 CUST006 Female 45 Beauty 1
## 7 7 2023-03-13 CUST007 Male 46 Clothing 2
## 8 8 2023-02-22 CUST008 Male 30 Electronics 4
## 9 9 2023-12-13 CUST009 Male 63 Electronics 2
## 10 10 2023-10-07 CUST010 Female 52 Clothing 4
## 11 11 2023-02-14 CUST011 Male 23 Clothing 2
## 12 12 2023-10-30 CUST012 Male 35 Beauty 3
## 13 13 2023-08-05 CUST013 Male 22 Electronics 3
## 14 14 2023-01-17 CUST014 Male 64 Clothing 4
## 15 15 2023-01-16 CUST015 Female 42 Electronics 4
## Price.per.Unit Total.Amount
## 1 50 150
## 2 500 1000
## 3 30 30
## 4 500 500
## 5 50 100
## 6 30 30
## 7 25 50
## 8 25 100
## 9 300 600
## 10 50 200
## 11 50 100
## 12 25 75
## 13 500 1500
## 14 30 120
## 15 500 2000
Write a user defined function using any of the variables from the data set.
#Define function to calculate 10% discount
discount_price = function(total_amount){
return(total_amount * 0.10) # 10% discount
}
#Apply the function and create new columns
retail_sales_dataset <- retail_sales_dataset %>%
mutate(
Discount.Price = discount_price(Total.Amount),
Final.Price = Total.Amount - Discount.Price
)
#Display the updated data set as a formatted table
#Run knitr package
library(knitr)
kable(head(retail_sales_dataset, 15), caption = "Retail Sales Dataset with Discount & Final.price")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2023-11-24 | CUST001 | Male | 34 | Beauty | 3 | 50 | 150 | 15.0 | 135.0 |
| 2 | 2023-02-27 | CUST002 | Female | 26 | Clothing | 2 | 500 | 1000 | 100.0 | 900.0 |
| 3 | 2023-01-13 | CUST003 | Male | 50 | Electronics | 1 | 30 | 30 | 3.0 | 27.0 |
| 4 | 2023-05-21 | CUST004 | Male | 37 | Clothing | 1 | 500 | 500 | 50.0 | 450.0 |
| 5 | 2023-05-06 | CUST005 | Male | 30 | Beauty | 2 | 50 | 100 | 10.0 | 90.0 |
| 6 | 2023-04-25 | CUST006 | Female | 45 | Beauty | 1 | 30 | 30 | 3.0 | 27.0 |
| 7 | 2023-03-13 | CUST007 | Male | 46 | Clothing | 2 | 25 | 50 | 5.0 | 45.0 |
| 8 | 2023-02-22 | CUST008 | Male | 30 | Electronics | 4 | 25 | 100 | 10.0 | 90.0 |
| 9 | 2023-12-13 | CUST009 | Male | 63 | Electronics | 2 | 300 | 600 | 60.0 | 540.0 |
| 10 | 2023-10-07 | CUST010 | Female | 52 | Clothing | 4 | 50 | 200 | 20.0 | 180.0 |
| 11 | 2023-02-14 | CUST011 | Male | 23 | Clothing | 2 | 50 | 100 | 10.0 | 90.0 |
| 12 | 2023-10-30 | CUST012 | Male | 35 | Beauty | 3 | 25 | 75 | 7.5 | 67.5 |
| 13 | 2023-08-05 | CUST013 | Male | 22 | Electronics | 3 | 500 | 1500 | 150.0 | 1350.0 |
| 14 | 2023-01-17 | CUST014 | Male | 64 | Clothing | 4 | 30 | 120 | 12.0 | 108.0 |
| 15 | 2023-01-16 | CUST015 | Female | 42 | Electronics | 4 | 500 | 2000 | 200.0 | 1800.0 |
Use data manipulation techniques and filter rows based on any logical criteria that exist in your data set.
#Data set was filtered according to the age, gender, and product category.
Filter_data <- retail_sales_dataset %>%
filter(Age > 30 & Gender == "Female" & Product.Category == "Clothing")
knitr::kable(head(Filter_data), caption = "Filtered_data")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price |
|---|---|---|---|---|---|---|---|---|---|---|
| 10 | 2023-10-07 | CUST010 | Female | 52 | Clothing | 4 | 50 | 200 | 20 | 180 |
| 19 | 2023-09-16 | CUST019 | Female | 62 | Clothing | 2 | 25 | 50 | 5 | 45 |
| 23 | 2023-04-12 | CUST023 | Female | 35 | Clothing | 4 | 30 | 120 | 12 | 108 |
| 24 | 2023-11-29 | CUST024 | Female | 49 | Clothing | 1 | 300 | 300 | 30 | 270 |
| 34 | 2023-12-24 | CUST034 | Female | 51 | Clothing | 3 | 50 | 150 | 15 | 135 |
| 43 | 2023-07-14 | CUST043 | Female | 48 | Clothing | 1 | 300 | 300 | 30 | 270 |
Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your data set.
#Dependent variables = Quantity & Price
Product=retail_sales_dataset$Product.Category
Price = retail_sales_dataset$Price.per.Unit
#Independent variables = Product category
Qnty= retail_sales_dataset$Quantity
Reshaped_data= cbind(Product, Price, Qnty)
Reshaped_data= as.data.frame(Reshaped_data)
#Display the updated data set as a formatted table
knitr::kable(head(Reshaped_data), caption = "Reshaped_data")
| Product | Price | Qnty |
|---|---|---|
| Beauty | 50 | 3 |
| Clothing | 500 | 2 |
| Electronics | 30 | 1 |
| Clothing | 500 | 1 |
| Beauty | 50 | 2 |
| Beauty | 30 | 1 |
Remove missing values in your data set.
Updated_data= na.omit(retail_sales_dataset)
knitr::kable(head(Updated_data, 15), caption = "Updated Dataset (No Missing Values)")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2023-11-24 | CUST001 | Male | 34 | Beauty | 3 | 50 | 150 | 15.0 | 135.0 |
| 2 | 2023-02-27 | CUST002 | Female | 26 | Clothing | 2 | 500 | 1000 | 100.0 | 900.0 |
| 3 | 2023-01-13 | CUST003 | Male | 50 | Electronics | 1 | 30 | 30 | 3.0 | 27.0 |
| 4 | 2023-05-21 | CUST004 | Male | 37 | Clothing | 1 | 500 | 500 | 50.0 | 450.0 |
| 5 | 2023-05-06 | CUST005 | Male | 30 | Beauty | 2 | 50 | 100 | 10.0 | 90.0 |
| 6 | 2023-04-25 | CUST006 | Female | 45 | Beauty | 1 | 30 | 30 | 3.0 | 27.0 |
| 7 | 2023-03-13 | CUST007 | Male | 46 | Clothing | 2 | 25 | 50 | 5.0 | 45.0 |
| 8 | 2023-02-22 | CUST008 | Male | 30 | Electronics | 4 | 25 | 100 | 10.0 | 90.0 |
| 9 | 2023-12-13 | CUST009 | Male | 63 | Electronics | 2 | 300 | 600 | 60.0 | 540.0 |
| 10 | 2023-10-07 | CUST010 | Female | 52 | Clothing | 4 | 50 | 200 | 20.0 | 180.0 |
| 11 | 2023-02-14 | CUST011 | Male | 23 | Clothing | 2 | 50 | 100 | 10.0 | 90.0 |
| 12 | 2023-10-30 | CUST012 | Male | 35 | Beauty | 3 | 25 | 75 | 7.5 | 67.5 |
| 13 | 2023-08-05 | CUST013 | Male | 22 | Electronics | 3 | 500 | 1500 | 150.0 | 1350.0 |
| 14 | 2023-01-17 | CUST014 | Male | 64 | Clothing | 4 | 30 | 120 | 12.0 | 108.0 |
| 15 | 2023-01-16 | CUST015 | Female | 42 | Electronics | 4 | 500 | 2000 | 200.0 | 1800.0 |
Identify and remove duplicated data from your data set.
New_data = distinct(retail_sales_dataset)
knitr::kable(head(New_data, 15), caption = "De-duplicated Dataset (First 15 Rows)")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2023-11-24 | CUST001 | Male | 34 | Beauty | 3 | 50 | 150 | 15.0 | 135.0 |
| 2 | 2023-02-27 | CUST002 | Female | 26 | Clothing | 2 | 500 | 1000 | 100.0 | 900.0 |
| 3 | 2023-01-13 | CUST003 | Male | 50 | Electronics | 1 | 30 | 30 | 3.0 | 27.0 |
| 4 | 2023-05-21 | CUST004 | Male | 37 | Clothing | 1 | 500 | 500 | 50.0 | 450.0 |
| 5 | 2023-05-06 | CUST005 | Male | 30 | Beauty | 2 | 50 | 100 | 10.0 | 90.0 |
| 6 | 2023-04-25 | CUST006 | Female | 45 | Beauty | 1 | 30 | 30 | 3.0 | 27.0 |
| 7 | 2023-03-13 | CUST007 | Male | 46 | Clothing | 2 | 25 | 50 | 5.0 | 45.0 |
| 8 | 2023-02-22 | CUST008 | Male | 30 | Electronics | 4 | 25 | 100 | 10.0 | 90.0 |
| 9 | 2023-12-13 | CUST009 | Male | 63 | Electronics | 2 | 300 | 600 | 60.0 | 540.0 |
| 10 | 2023-10-07 | CUST010 | Female | 52 | Clothing | 4 | 50 | 200 | 20.0 | 180.0 |
| 11 | 2023-02-14 | CUST011 | Male | 23 | Clothing | 2 | 50 | 100 | 10.0 | 90.0 |
| 12 | 2023-10-30 | CUST012 | Male | 35 | Beauty | 3 | 25 | 75 | 7.5 | 67.5 |
| 13 | 2023-08-05 | CUST013 | Male | 22 | Electronics | 3 | 500 | 1500 | 150.0 | 1350.0 |
| 14 | 2023-01-17 | CUST014 | Male | 64 | Clothing | 4 | 30 | 120 | 12.0 | 108.0 |
| 15 | 2023-01-16 | CUST015 | Female | 42 | Electronics | 4 | 500 | 2000 | 200.0 | 1800.0 |
Reorder multiple rows in descending order.
#Choose Total amount and age for rearranging.
Reordered_data = retail_sales_dataset %>% arrange(desc(Total.Amount), desc(Age))
knitr::kable(head(Reordered_data, 10),
caption = "Top 10 Rows of Data (Ordered by Total Amount and Age, Descending)")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price |
|---|---|---|---|---|---|---|---|---|---|---|
| 561 | 2023-05-27 | CUST561 | Female | 64 | Clothing | 4 | 500 | 2000 | 200 | 1800 |
| 735 | 2023-10-04 | CUST735 | Female | 64 | Clothing | 4 | 500 | 2000 | 200 | 1800 |
| 547 | 2023-03-07 | CUST547 | Male | 63 | Clothing | 4 | 500 | 2000 | 200 | 1800 |
| 157 | 2023-06-24 | CUST157 | Male | 62 | Electronics | 4 | 500 | 2000 | 200 | 1800 |
| 946 | 2023-05-08 | CUST946 | Male | 62 | Electronics | 4 | 500 | 2000 | 200 | 1800 |
| 789 | 2023-09-30 | CUST789 | Female | 61 | Clothing | 4 | 500 | 2000 | 200 | 1800 |
| 634 | 2023-10-08 | CUST634 | Male | 60 | Electronics | 4 | 500 | 2000 | 200 | 1800 |
| 970 | 2023-05-16 | CUST970 | Male | 59 | Electronics | 4 | 500 | 2000 | 200 | 1800 |
| 89 | 2023-10-01 | CUST089 | Female | 55 | Electronics | 4 | 500 | 2000 | 200 | 1800 |
| 731 | 2023-05-10 | CUST731 | Male | 54 | Clothing | 4 | 500 | 2000 | 200 | 1800 |
Rename some of the column names in your data set.
#Transaction.ID = Trans.ID
#Customer.ID = Cust.ID
#Product.Category = Prod.cat
Renamed_data= retail_sales_dataset %>% rename(Trans.ID = Transaction.ID, Cust.ID= Customer.ID, Prod.cat= Product.Category)
head(Renamed_data)
## Trans.ID Date Cust.ID Gender Age Prod.cat Quantity Price.per.Unit
## 1 1 2023-11-24 CUST001 Male 34 Beauty 3 50
## 2 2 2023-02-27 CUST002 Female 26 Clothing 2 500
## 3 3 2023-01-13 CUST003 Male 50 Electronics 1 30
## 4 4 2023-05-21 CUST004 Male 37 Clothing 1 500
## 5 5 2023-05-06 CUST005 Male 30 Beauty 2 50
## 6 6 2023-04-25 CUST006 Female 45 Beauty 1 30
## Total.Amount Discount.Price Final.Price
## 1 150 15 135
## 2 1000 100 900
## 3 30 3 27
## 4 500 50 450
## 5 100 10 90
## 6 30 3 27
Add new variables in your data frame by using a mathematical function ( for e.g. - multiply and existing column by 2 and add it as a new variable to your data frame).
Latest_Maths_data <- retail_sales_dataset %>% mutate(Price.per.2 = Price.per.Unit * 2)
# Display the first few rows of the data set with the new variable
knitr::kable(head(Latest_Maths_data), caption = "Dataset with New Variable 'Price.per.2' (First 6 Rows)")
| Transaction.ID | Date | Customer.ID | Gender | Age | Product.Category | Quantity | Price.per.Unit | Total.Amount | Discount.Price | Final.Price | Price.per.2 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2023-11-24 | CUST001 | Male | 34 | Beauty | 3 | 50 | 150 | 15 | 135 | 100 |
| 2 | 2023-02-27 | CUST002 | Female | 26 | Clothing | 2 | 500 | 1000 | 100 | 900 | 1000 |
| 3 | 2023-01-13 | CUST003 | Male | 50 | Electronics | 1 | 30 | 30 | 3 | 27 | 60 |
| 4 | 2023-05-21 | CUST004 | Male | 37 | Clothing | 1 | 500 | 500 | 50 | 450 | 1000 |
| 5 | 2023-05-06 | CUST005 | Male | 30 | Beauty | 2 | 50 | 100 | 10 | 90 | 100 |
| 6 | 2023-04-25 | CUST006 | Female | 45 | Beauty | 1 | 30 | 30 | 3 | 27 | 60 |
Create a training set using a random number generator engine.
set.seed(123)
Trained_1 = sample(1: nrow(retail_sales_dataset), 0.7* nrow(retail_sales_dataset))
Trained_data= retail_sales_dataset[Trained_1, ]
#Display the updated data set as a formatted table
knitr::kable(t(as.data.frame(summary(Trained_data))), caption = "Summary Statistics of Training Set")
| Var1 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Var2 | Transaction.ID | Transaction.ID | Transaction.ID | Transaction.ID | Transaction.ID | Transaction.ID | Date | Date | Date | Date | Date | Date | Customer.ID | Customer.ID | Customer.ID | Customer.ID | Customer.ID | Customer.ID | Gender | Gender | Gender | Gender | Gender | Gender | Age | Age | Age | Age | Age | Age | Product.Category | Product.Category | Product.Category | Product.Category | Product.Category | Product.Category | Quantity | Quantity | Quantity | Quantity | Quantity | Quantity | Price.per.Unit | Price.per.Unit | Price.per.Unit | Price.per.Unit | Price.per.Unit | Price.per.Unit | Total.Amount | Total.Amount | Total.Amount | Total.Amount | Total.Amount | Total.Amount | Discount.Price | Discount.Price | Discount.Price | Discount.Price | Discount.Price | Discount.Price | Final.Price | Final.Price | Final.Price | Final.Price | Final.Price | Final.Price |
| Freq | Min. : 2.0 | 1st Qu.: 251.8 | Median : 496.0 | Mean : 499.2 | 3rd Qu.: 746.2 | Max. :1000.0 | Length:700 | Class :character | Mode :character | NA | NA | NA | Length:700 | Class :character | Mode :character | NA | NA | NA | Length:700 | Class :character | Mode :character | NA | NA | NA | Min. :18.0 | 1st Qu.:30.0 | Median :42.0 | Mean :41.6 | 3rd Qu.:54.0 | Max. :64.0 | Length:700 | Class :character | Mode :character | NA | NA | NA | Min. :1.000 | 1st Qu.:1.000 | Median :2.000 | Mean :2.466 | 3rd Qu.:3.250 | Max. :4.000 | Min. : 25.0 | 1st Qu.: 30.0 | Median : 50.0 | Mean :185.4 | 3rd Qu.:300.0 | Max. :500.0 | Min. : 25.0 | 1st Qu.: 60.0 | Median : 150.0 | Mean : 452.9 | 3rd Qu.: 900.0 | Max. :2000.0 | Min. : 2.50 | 1st Qu.: 6.00 | Median : 15.00 | Mean : 45.29 | 3rd Qu.: 90.00 | Max. :200.00 | Min. : 22.5 | 1st Qu.: 54.0 | Median : 135.0 | Mean : 407.6 | 3rd Qu.: 810.0 | Max. :1800.0 |
Print the summary statistics of your data set.
summary(retail_sales_dataset)
## Transaction.ID Date Customer.ID Gender
## Min. : 1.0 Length:1000 Length:1000 Length:1000
## 1st Qu.: 250.8 Class :character Class :character Class :character
## Median : 500.5 Mode :character Mode :character Mode :character
## Mean : 500.5
## 3rd Qu.: 750.2
## Max. :1000.0
## Age Product.Category Quantity Price.per.Unit
## Min. :18.00 Length:1000 Min. :1.000 Min. : 25.0
## 1st Qu.:29.00 Class :character 1st Qu.:1.000 1st Qu.: 30.0
## Median :42.00 Mode :character Median :3.000 Median : 50.0
## Mean :41.39 Mean :2.514 Mean :179.9
## 3rd Qu.:53.00 3rd Qu.:4.000 3rd Qu.:300.0
## Max. :64.00 Max. :4.000 Max. :500.0
## Total.Amount Discount.Price Final.Price
## Min. : 25 Min. : 2.5 Min. : 22.5
## 1st Qu.: 60 1st Qu.: 6.0 1st Qu.: 54.0
## Median : 135 Median : 13.5 Median : 121.5
## Mean : 456 Mean : 45.6 Mean : 410.4
## 3rd Qu.: 900 3rd Qu.: 90.0 3rd Qu.: 810.0
## Max. :2000 Max. :200.0 Max. :1800.0
Use any of the numerical variables from the data set and perform the following statistical functions. Mean, Median, Mode, Range.
#Mean
Mean_total= mean(retail_sales_dataset$Total.Amount, na.rm = TRUE)
#Median
Median_total= median(retail_sales_dataset$Total.Amount, na.rm = TRUE)
#Mode
#A new mode function was defined.
get_mode = function(v){
uniqv = unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Mode_total= get_mode(retail_sales_dataset$Total.Amount)
#Range
Range_total= range(retail_sales_dataset$Total.Amount, na.rm = TRUE)
# Create a data frame to display results
stats_summary <- data.frame(
Statistic = c("Mean", "Median", "Mode", "Minimum", "Maximum"),
Value = c(Mean_total, Median_total, Mode_total, Range_total[1], Range_total[2])
)
knitr::kable(stats_summary, caption = "Summary Statistics of Total Amount")
| Statistic | Value |
|---|---|
| Mean | 456 |
| Median | 135 |
| Mode | 50 |
| Minimum | 25 |
| Maximum | 2000 |
Plot a scatter plot for any 2 variables in your data set.
#Scatter plot was plotted for Quantity & Total amount
ggplot(retail_sales_dataset, aes(x= Quantity, y= Total.Amount)) + geom_point() + labs(title = "Scatter plot of Quantity vs Total amount")
Plot a bar plot for any 2 variables in your data set.
ggplot(retail_sales_dataset, aes(x= Quantity, y= Price.per.Unit)) + geom_bar(stat = "identity") + labs(title = "Bar plot for Quantity and price per unit")
Find the correlation between any 2 variables by applying least square
linear regression model.
correlation = cor(retail_sales_dataset$Age, retail_sales_dataset$Total.Amount, use = "complete.obs")
lm_model_data= lm(retail_sales_dataset$Total.Amount~retail_sales_dataset$Age, data = retail_sales_dataset)
summary(lm_model_data)
##
## Call:
## lm(formula = retail_sales_dataset$Total.Amount ~ retail_sales_dataset$Age,
## data = retail_sales_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -486.5 -387.0 -305.7 416.4 1600.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 558.616 56.377 9.909 <2e-16 ***
## retail_sales_dataset$Age -2.479 1.293 -1.917 0.0555 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 559.2 on 998 degrees of freedom
## Multiple R-squared: 0.003668, Adjusted R-squared: 0.00267
## F-statistic: 3.675 on 1 and 998 DF, p-value: 0.05553