#Load Dataset
retail_sales_dataset <- read.csv("D:/retail_sales_dataset.csv")

Print the structure of your dataset.

str(retail_sales_dataset)
## 'data.frame':    1000 obs. of  9 variables:
##  $ Transaction.ID  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date            : chr  "2023-11-24" "2023-02-27" "2023-01-13" "2023-05-21" ...
##  $ Customer.ID     : chr  "CUST001" "CUST002" "CUST003" "CUST004" ...
##  $ Gender          : chr  "Male" "Female" "Male" "Male" ...
##  $ Age             : int  34 26 50 37 30 45 46 30 63 52 ...
##  $ Product.Category: chr  "Beauty" "Clothing" "Electronics" "Clothing" ...
##  $ Quantity        : int  3 2 1 1 2 1 2 4 2 4 ...
##  $ Price.per.Unit  : int  50 500 30 500 50 30 25 25 300 50 ...
##  $ Total.Amount    : int  150 1000 30 500 100 30 50 100 600 200 ...

List the variables in your data set.

names(retail_sales_dataset)
## [1] "Transaction.ID"   "Date"             "Customer.ID"      "Gender"          
## [5] "Age"              "Product.Category" "Quantity"         "Price.per.Unit"  
## [9] "Total.Amount"

Print the top 15 rows of your data set.

head(retail_sales_dataset, 15)
##    Transaction.ID       Date Customer.ID Gender Age Product.Category Quantity
## 1               1 2023-11-24     CUST001   Male  34           Beauty        3
## 2               2 2023-02-27     CUST002 Female  26         Clothing        2
## 3               3 2023-01-13     CUST003   Male  50      Electronics        1
## 4               4 2023-05-21     CUST004   Male  37         Clothing        1
## 5               5 2023-05-06     CUST005   Male  30           Beauty        2
## 6               6 2023-04-25     CUST006 Female  45           Beauty        1
## 7               7 2023-03-13     CUST007   Male  46         Clothing        2
## 8               8 2023-02-22     CUST008   Male  30      Electronics        4
## 9               9 2023-12-13     CUST009   Male  63      Electronics        2
## 10             10 2023-10-07     CUST010 Female  52         Clothing        4
## 11             11 2023-02-14     CUST011   Male  23         Clothing        2
## 12             12 2023-10-30     CUST012   Male  35           Beauty        3
## 13             13 2023-08-05     CUST013   Male  22      Electronics        3
## 14             14 2023-01-17     CUST014   Male  64         Clothing        4
## 15             15 2023-01-16     CUST015 Female  42      Electronics        4
##    Price.per.Unit Total.Amount
## 1              50          150
## 2             500         1000
## 3              30           30
## 4             500          500
## 5              50          100
## 6              30           30
## 7              25           50
## 8              25          100
## 9             300          600
## 10             50          200
## 11             50          100
## 12             25           75
## 13            500         1500
## 14             30          120
## 15            500         2000

Write a user defined function using any of the variables from the data set.

#Define function to calculate 10% discount
discount_price = function(total_amount){
  return(total_amount * 0.10)  # 10% discount
}

#Apply the function and create new columns
retail_sales_dataset <- retail_sales_dataset %>%
  mutate(
    Discount.Price = discount_price(Total.Amount),
    Final.Price = Total.Amount - Discount.Price
  )
#Display the updated data set as a formatted table
#Run knitr package
library(knitr) 
kable(head(retail_sales_dataset, 15), caption = "Retail Sales Dataset with Discount & Final.price")
Retail Sales Dataset with Discount & Final.price
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price
1 2023-11-24 CUST001 Male 34 Beauty 3 50 150 15.0 135.0
2 2023-02-27 CUST002 Female 26 Clothing 2 500 1000 100.0 900.0
3 2023-01-13 CUST003 Male 50 Electronics 1 30 30 3.0 27.0
4 2023-05-21 CUST004 Male 37 Clothing 1 500 500 50.0 450.0
5 2023-05-06 CUST005 Male 30 Beauty 2 50 100 10.0 90.0
6 2023-04-25 CUST006 Female 45 Beauty 1 30 30 3.0 27.0
7 2023-03-13 CUST007 Male 46 Clothing 2 25 50 5.0 45.0
8 2023-02-22 CUST008 Male 30 Electronics 4 25 100 10.0 90.0
9 2023-12-13 CUST009 Male 63 Electronics 2 300 600 60.0 540.0
10 2023-10-07 CUST010 Female 52 Clothing 4 50 200 20.0 180.0
11 2023-02-14 CUST011 Male 23 Clothing 2 50 100 10.0 90.0
12 2023-10-30 CUST012 Male 35 Beauty 3 25 75 7.5 67.5
13 2023-08-05 CUST013 Male 22 Electronics 3 500 1500 150.0 1350.0
14 2023-01-17 CUST014 Male 64 Clothing 4 30 120 12.0 108.0
15 2023-01-16 CUST015 Female 42 Electronics 4 500 2000 200.0 1800.0

Use data manipulation techniques and filter rows based on any logical criteria that exist in your data set.

#Data set was filtered according to the age, gender, and product category.
Filter_data <- retail_sales_dataset %>% 
  filter(Age > 30 & Gender == "Female" & Product.Category == "Clothing")
knitr::kable(head(Filter_data), caption = "Filtered_data")
Filtered_data
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price
10 2023-10-07 CUST010 Female 52 Clothing 4 50 200 20 180
19 2023-09-16 CUST019 Female 62 Clothing 2 25 50 5 45
23 2023-04-12 CUST023 Female 35 Clothing 4 30 120 12 108
24 2023-11-29 CUST024 Female 49 Clothing 1 300 300 30 270
34 2023-12-24 CUST034 Female 51 Clothing 3 50 150 15 135
43 2023-07-14 CUST043 Female 48 Clothing 1 300 300 30 270

Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your data set.

#Dependent variables = Quantity & Price
Product=retail_sales_dataset$Product.Category
Price = retail_sales_dataset$Price.per.Unit

#Independent variables = Product category 
Qnty= retail_sales_dataset$Quantity

Reshaped_data= cbind(Product, Price, Qnty)
Reshaped_data= as.data.frame(Reshaped_data)
#Display the updated data set as a formatted table
knitr::kable(head(Reshaped_data), caption = "Reshaped_data")
Reshaped_data
Product Price Qnty
Beauty 50 3
Clothing 500 2
Electronics 30 1
Clothing 500 1
Beauty 50 2
Beauty 30 1

Remove missing values in your data set.

Updated_data= na.omit(retail_sales_dataset)
knitr::kable(head(Updated_data, 15), caption = "Updated Dataset (No Missing Values)")
Updated Dataset (No Missing Values)
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price
1 2023-11-24 CUST001 Male 34 Beauty 3 50 150 15.0 135.0
2 2023-02-27 CUST002 Female 26 Clothing 2 500 1000 100.0 900.0
3 2023-01-13 CUST003 Male 50 Electronics 1 30 30 3.0 27.0
4 2023-05-21 CUST004 Male 37 Clothing 1 500 500 50.0 450.0
5 2023-05-06 CUST005 Male 30 Beauty 2 50 100 10.0 90.0
6 2023-04-25 CUST006 Female 45 Beauty 1 30 30 3.0 27.0
7 2023-03-13 CUST007 Male 46 Clothing 2 25 50 5.0 45.0
8 2023-02-22 CUST008 Male 30 Electronics 4 25 100 10.0 90.0
9 2023-12-13 CUST009 Male 63 Electronics 2 300 600 60.0 540.0
10 2023-10-07 CUST010 Female 52 Clothing 4 50 200 20.0 180.0
11 2023-02-14 CUST011 Male 23 Clothing 2 50 100 10.0 90.0
12 2023-10-30 CUST012 Male 35 Beauty 3 25 75 7.5 67.5
13 2023-08-05 CUST013 Male 22 Electronics 3 500 1500 150.0 1350.0
14 2023-01-17 CUST014 Male 64 Clothing 4 30 120 12.0 108.0
15 2023-01-16 CUST015 Female 42 Electronics 4 500 2000 200.0 1800.0

Identify and remove duplicated data from your data set.

New_data = distinct(retail_sales_dataset)
knitr::kable(head(New_data, 15), caption = "De-duplicated Dataset (First 15 Rows)")
De-duplicated Dataset (First 15 Rows)
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price
1 2023-11-24 CUST001 Male 34 Beauty 3 50 150 15.0 135.0
2 2023-02-27 CUST002 Female 26 Clothing 2 500 1000 100.0 900.0
3 2023-01-13 CUST003 Male 50 Electronics 1 30 30 3.0 27.0
4 2023-05-21 CUST004 Male 37 Clothing 1 500 500 50.0 450.0
5 2023-05-06 CUST005 Male 30 Beauty 2 50 100 10.0 90.0
6 2023-04-25 CUST006 Female 45 Beauty 1 30 30 3.0 27.0
7 2023-03-13 CUST007 Male 46 Clothing 2 25 50 5.0 45.0
8 2023-02-22 CUST008 Male 30 Electronics 4 25 100 10.0 90.0
9 2023-12-13 CUST009 Male 63 Electronics 2 300 600 60.0 540.0
10 2023-10-07 CUST010 Female 52 Clothing 4 50 200 20.0 180.0
11 2023-02-14 CUST011 Male 23 Clothing 2 50 100 10.0 90.0
12 2023-10-30 CUST012 Male 35 Beauty 3 25 75 7.5 67.5
13 2023-08-05 CUST013 Male 22 Electronics 3 500 1500 150.0 1350.0
14 2023-01-17 CUST014 Male 64 Clothing 4 30 120 12.0 108.0
15 2023-01-16 CUST015 Female 42 Electronics 4 500 2000 200.0 1800.0

Reorder multiple rows in descending order.

#Choose Total amount and age for rearranging.
Reordered_data = retail_sales_dataset %>% arrange(desc(Total.Amount), desc(Age))

knitr::kable(head(Reordered_data, 10), 
             caption = "Top 10 Rows of Data (Ordered by Total Amount and Age, Descending)")
Top 10 Rows of Data (Ordered by Total Amount and Age, Descending)
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price
561 2023-05-27 CUST561 Female 64 Clothing 4 500 2000 200 1800
735 2023-10-04 CUST735 Female 64 Clothing 4 500 2000 200 1800
547 2023-03-07 CUST547 Male 63 Clothing 4 500 2000 200 1800
157 2023-06-24 CUST157 Male 62 Electronics 4 500 2000 200 1800
946 2023-05-08 CUST946 Male 62 Electronics 4 500 2000 200 1800
789 2023-09-30 CUST789 Female 61 Clothing 4 500 2000 200 1800
634 2023-10-08 CUST634 Male 60 Electronics 4 500 2000 200 1800
970 2023-05-16 CUST970 Male 59 Electronics 4 500 2000 200 1800
89 2023-10-01 CUST089 Female 55 Electronics 4 500 2000 200 1800
731 2023-05-10 CUST731 Male 54 Clothing 4 500 2000 200 1800

Rename some of the column names in your data set.

#Transaction.ID = Trans.ID
#Customer.ID = Cust.ID
#Product.Category = Prod.cat

Renamed_data= retail_sales_dataset %>% rename(Trans.ID = Transaction.ID, Cust.ID= Customer.ID, Prod.cat= Product.Category)
head(Renamed_data)
##   Trans.ID       Date Cust.ID Gender Age    Prod.cat Quantity Price.per.Unit
## 1        1 2023-11-24 CUST001   Male  34      Beauty        3             50
## 2        2 2023-02-27 CUST002 Female  26    Clothing        2            500
## 3        3 2023-01-13 CUST003   Male  50 Electronics        1             30
## 4        4 2023-05-21 CUST004   Male  37    Clothing        1            500
## 5        5 2023-05-06 CUST005   Male  30      Beauty        2             50
## 6        6 2023-04-25 CUST006 Female  45      Beauty        1             30
##   Total.Amount Discount.Price Final.Price
## 1          150             15         135
## 2         1000            100         900
## 3           30              3          27
## 4          500             50         450
## 5          100             10          90
## 6           30              3          27

Add new variables in your data frame by using a mathematical function ( for e.g. - multiply and existing column by 2 and add it as a new variable to your data frame).

Latest_Maths_data <- retail_sales_dataset %>% mutate(Price.per.2 = Price.per.Unit * 2)

# Display the first few rows of the data set with the new variable
knitr::kable(head(Latest_Maths_data), caption = "Dataset with New Variable 'Price.per.2' (First 6 Rows)")
Dataset with New Variable ‘Price.per.2’ (First 6 Rows)
Transaction.ID Date Customer.ID Gender Age Product.Category Quantity Price.per.Unit Total.Amount Discount.Price Final.Price Price.per.2
1 2023-11-24 CUST001 Male 34 Beauty 3 50 150 15 135 100
2 2023-02-27 CUST002 Female 26 Clothing 2 500 1000 100 900 1000
3 2023-01-13 CUST003 Male 50 Electronics 1 30 30 3 27 60
4 2023-05-21 CUST004 Male 37 Clothing 1 500 500 50 450 1000
5 2023-05-06 CUST005 Male 30 Beauty 2 50 100 10 90 100
6 2023-04-25 CUST006 Female 45 Beauty 1 30 30 3 27 60

Create a training set using a random number generator engine.

set.seed(123)
Trained_1 = sample(1: nrow(retail_sales_dataset), 0.7* nrow(retail_sales_dataset))
Trained_data= retail_sales_dataset[Trained_1, ]

#Display the updated data set as a formatted table
knitr::kable(t(as.data.frame(summary(Trained_data))), caption = "Summary Statistics of Training Set")
Summary Statistics of Training Set
Var1
Var2 Transaction.ID Transaction.ID Transaction.ID Transaction.ID Transaction.ID Transaction.ID Date Date Date Date Date Date Customer.ID Customer.ID Customer.ID Customer.ID Customer.ID Customer.ID Gender Gender Gender Gender Gender Gender Age Age Age Age Age Age Product.Category Product.Category Product.Category Product.Category Product.Category Product.Category Quantity Quantity Quantity Quantity Quantity Quantity Price.per.Unit Price.per.Unit Price.per.Unit Price.per.Unit Price.per.Unit Price.per.Unit Total.Amount Total.Amount Total.Amount Total.Amount Total.Amount Total.Amount Discount.Price Discount.Price Discount.Price Discount.Price Discount.Price Discount.Price Final.Price Final.Price Final.Price Final.Price Final.Price Final.Price
Freq Min. : 2.0 1st Qu.: 251.8 Median : 496.0 Mean : 499.2 3rd Qu.: 746.2 Max. :1000.0 Length:700 Class :character Mode :character NA NA NA Length:700 Class :character Mode :character NA NA NA Length:700 Class :character Mode :character NA NA NA Min. :18.0 1st Qu.:30.0 Median :42.0 Mean :41.6 3rd Qu.:54.0 Max. :64.0 Length:700 Class :character Mode :character NA NA NA Min. :1.000 1st Qu.:1.000 Median :2.000 Mean :2.466 3rd Qu.:3.250 Max. :4.000 Min. : 25.0 1st Qu.: 30.0 Median : 50.0 Mean :185.4 3rd Qu.:300.0 Max. :500.0 Min. : 25.0 1st Qu.: 60.0 Median : 150.0 Mean : 452.9 3rd Qu.: 900.0 Max. :2000.0 Min. : 2.50 1st Qu.: 6.00 Median : 15.00 Mean : 45.29 3rd Qu.: 90.00 Max. :200.00 Min. : 22.5 1st Qu.: 54.0 Median : 135.0 Mean : 407.6 3rd Qu.: 810.0 Max. :1800.0

Print the summary statistics of your data set.

summary(retail_sales_dataset)
##  Transaction.ID       Date           Customer.ID           Gender         
##  Min.   :   1.0   Length:1000        Length:1000        Length:1000       
##  1st Qu.: 250.8   Class :character   Class :character   Class :character  
##  Median : 500.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 500.5                                                           
##  3rd Qu.: 750.2                                                           
##  Max.   :1000.0                                                           
##       Age        Product.Category      Quantity     Price.per.Unit 
##  Min.   :18.00   Length:1000        Min.   :1.000   Min.   : 25.0  
##  1st Qu.:29.00   Class :character   1st Qu.:1.000   1st Qu.: 30.0  
##  Median :42.00   Mode  :character   Median :3.000   Median : 50.0  
##  Mean   :41.39                      Mean   :2.514   Mean   :179.9  
##  3rd Qu.:53.00                      3rd Qu.:4.000   3rd Qu.:300.0  
##  Max.   :64.00                      Max.   :4.000   Max.   :500.0  
##   Total.Amount  Discount.Price   Final.Price    
##  Min.   :  25   Min.   :  2.5   Min.   :  22.5  
##  1st Qu.:  60   1st Qu.:  6.0   1st Qu.:  54.0  
##  Median : 135   Median : 13.5   Median : 121.5  
##  Mean   : 456   Mean   : 45.6   Mean   : 410.4  
##  3rd Qu.: 900   3rd Qu.: 90.0   3rd Qu.: 810.0  
##  Max.   :2000   Max.   :200.0   Max.   :1800.0

Use any of the numerical variables from the data set and perform the following statistical functions. Mean, Median, Mode, Range.

#Mean
Mean_total= mean(retail_sales_dataset$Total.Amount, na.rm = TRUE)

#Median
Median_total= median(retail_sales_dataset$Total.Amount, na.rm = TRUE)

#Mode
#A new mode function was defined.
get_mode = function(v){
  uniqv = unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Mode_total= get_mode(retail_sales_dataset$Total.Amount)

#Range
Range_total= range(retail_sales_dataset$Total.Amount, na.rm = TRUE)

# Create a data frame to display results
stats_summary <- data.frame(
  Statistic = c("Mean", "Median", "Mode", "Minimum", "Maximum"),
  Value = c(Mean_total, Median_total, Mode_total, Range_total[1], Range_total[2])
)

knitr::kable(stats_summary, caption = "Summary Statistics of Total Amount")
Summary Statistics of Total Amount
Statistic Value
Mean 456
Median 135
Mode 50
Minimum 25
Maximum 2000

Plot a scatter plot for any 2 variables in your data set.

#Scatter plot was plotted for Quantity & Total amount
ggplot(retail_sales_dataset, aes(x= Quantity, y= Total.Amount)) + geom_point() + labs(title = "Scatter plot of Quantity vs Total amount")

Plot a bar plot for any 2 variables in your data set.

ggplot(retail_sales_dataset, aes(x= Quantity, y= Price.per.Unit)) + geom_bar(stat = "identity") + labs(title = "Bar plot for Quantity and price per unit")

Find the correlation between any 2 variables by applying least square linear regression model.

correlation = cor(retail_sales_dataset$Age, retail_sales_dataset$Total.Amount, use = "complete.obs")
lm_model_data= lm(retail_sales_dataset$Total.Amount~retail_sales_dataset$Age, data = retail_sales_dataset)
summary(lm_model_data)
## 
## Call:
## lm(formula = retail_sales_dataset$Total.Amount ~ retail_sales_dataset$Age, 
##     data = retail_sales_dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -486.5 -387.0 -305.7  416.4 1600.0 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               558.616     56.377   9.909   <2e-16 ***
## retail_sales_dataset$Age   -2.479      1.293  -1.917   0.0555 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 559.2 on 998 degrees of freedom
## Multiple R-squared:  0.003668,   Adjusted R-squared:  0.00267 
## F-statistic: 3.675 on 1 and 998 DF,  p-value: 0.05553