COMP 4033_Group6

#Load Dataset
retail_sales_dataset <- read.csv("D:/retail_sales_dataset.csv")

Print the structure of your dataset.

str(retail_sales_dataset)

## 'data.frame':    1000 obs. of  9 variables:
##  $ Transaction.ID  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date            : chr  "2023-11-24" "2023-02-27" "2023-01-13" "2023-05-21" ...
##  $ Customer.ID     : chr  "CUST001" "CUST002" "CUST003" "CUST004" ...
##  $ Gender          : chr  "Male" "Female" "Male" "Male" ...
##  $ Age             : int  34 26 50 37 30 45 46 30 63 52 ...
##  $ Product.Category: chr  "Beauty" "Clothing" "Electronics" "Clothing" ...
##  $ Quantity        : int  3 2 1 1 2 1 2 4 2 4 ...
##  $ Price.per.Unit  : int  50 500 30 500 50 30 25 25 300 50 ...
##  $ Total.Amount    : int  150 1000 30 500 100 30 50 100 600 200 ...

List the variables in your data set.

names(retail_sales_dataset)

## [1] "Transaction.ID"   "Date"             "Customer.ID"      "Gender"          
## [5] "Age"              "Product.Category" "Quantity"         "Price.per.Unit"  
## [9] "Total.Amount"

Print the top 15 rows of your data set.

head(retail_sales_dataset, 15)

##    Transaction.ID       Date Customer.ID Gender Age Product.Category Quantity
## 1               1 2023-11-24     CUST001   Male  34           Beauty        3
## 2               2 2023-02-27     CUST002 Female  26         Clothing        2
## 3               3 2023-01-13     CUST003   Male  50      Electronics        1
## 4               4 2023-05-21     CUST004   Male  37         Clothing        1
## 5               5 2023-05-06     CUST005   Male  30           Beauty        2
## 6               6 2023-04-25     CUST006 Female  45           Beauty        1
## 7               7 2023-03-13     CUST007   Male  46         Clothing        2
## 8               8 2023-02-22     CUST008   Male  30      Electronics        4
## 9               9 2023-12-13     CUST009   Male  63      Electronics        2
## 10             10 2023-10-07     CUST010 Female  52         Clothing        4
## 11             11 2023-02-14     CUST011   Male  23         Clothing        2
## 12             12 2023-10-30     CUST012   Male  35           Beauty        3
## 13             13 2023-08-05     CUST013   Male  22      Electronics        3
## 14             14 2023-01-17     CUST014   Male  64         Clothing        4
## 15             15 2023-01-16     CUST015 Female  42      Electronics        4
##    Price.per.Unit Total.Amount
## 1              50          150
## 2             500         1000
## 3              30           30
## 4             500          500
## 5              50          100
## 6              30           30
## 7              25           50
## 8              25          100
## 9             300          600
## 10             50          200
## 11             50          100
## 12             25           75
## 13            500         1500
## 14             30          120
## 15            500         2000

Write a user defined function using any of the variables from the data set.

#Define function to calculate 10% discount
discount_price = function(total_amount){
  return(total_amount * 0.10)  # 10% discount
}

#Apply the function and create new columns
retail_sales_dataset <- retail_sales_dataset %>%
  mutate(
    Discount.Price = discount_price(Total.Amount),
    Final.Price = Total.Amount - Discount.Price
  )
#Display the updated data set as a formatted table
#Run knitr package
library(knitr) 
kable(head(retail_sales_dataset, 15), caption = "Retail Sales Dataset with Discount & Final.price")

Retail Sales Dataset with Discount & Final.price
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price
1	2023-11-24	CUST001	Male	34	Beauty	3	50	150	15.0	135.0
2	2023-02-27	CUST002	Female	26	Clothing	2	500	1000	100.0	900.0
3	2023-01-13	CUST003	Male	50	Electronics	1	30	30	3.0	27.0
4	2023-05-21	CUST004	Male	37	Clothing	1	500	500	50.0	450.0
5	2023-05-06	CUST005	Male	30	Beauty	2	50	100	10.0	90.0
6	2023-04-25	CUST006	Female	45	Beauty	1	30	30	3.0	27.0
7	2023-03-13	CUST007	Male	46	Clothing	2	25	50	5.0	45.0
8	2023-02-22	CUST008	Male	30	Electronics	4	25	100	10.0	90.0
9	2023-12-13	CUST009	Male	63	Electronics	2	300	600	60.0	540.0
10	2023-10-07	CUST010	Female	52	Clothing	4	50	200	20.0	180.0
11	2023-02-14	CUST011	Male	23	Clothing	2	50	100	10.0	90.0
12	2023-10-30	CUST012	Male	35	Beauty	3	25	75	7.5	67.5
13	2023-08-05	CUST013	Male	22	Electronics	3	500	1500	150.0	1350.0
14	2023-01-17	CUST014	Male	64	Clothing	4	30	120	12.0	108.0
15	2023-01-16	CUST015	Female	42	Electronics	4	500	2000	200.0	1800.0

Use data manipulation techniques and filter rows based on any logical criteria that exist in your data set.

#Data set was filtered according to the age, gender, and product category.
Filter_data <- retail_sales_dataset %>% 
  filter(Age > 30 & Gender == "Female" & Product.Category == "Clothing")
knitr::kable(head(Filter_data), caption = "Filtered_data")

Filtered_data
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price
10	2023-10-07	CUST010	Female	52	Clothing	4	50	200	20	180
19	2023-09-16	CUST019	Female	62	Clothing	2	25	50	5	45
23	2023-04-12	CUST023	Female	35	Clothing	4	30	120	12	108
24	2023-11-29	CUST024	Female	49	Clothing	1	300	300	30	270
34	2023-12-24	CUST034	Female	51	Clothing	3	50	150	15	135
43	2023-07-14	CUST043	Female	48	Clothing	1	300	300	30	270

Identify the dependent and independent variables and use reshaping techniques and create a new data frame by joining those variables from your data set.

#Dependent variables = Quantity & Price
Product=retail_sales_dataset$Product.Category
Price = retail_sales_dataset$Price.per.Unit

#Independent variables = Product category 
Qnty= retail_sales_dataset$Quantity

Reshaped_data= cbind(Product, Price, Qnty)
Reshaped_data= as.data.frame(Reshaped_data)
#Display the updated data set as a formatted table
knitr::kable(head(Reshaped_data), caption = "Reshaped_data")

Reshaped_data
Product	Price	Qnty
Beauty	50	3
Clothing	500	2
Electronics	30	1
Clothing	500	1
Beauty	50	2
Beauty	30	1

Remove missing values in your data set.

Updated_data= na.omit(retail_sales_dataset)
knitr::kable(head(Updated_data, 15), caption = "Updated Dataset (No Missing Values)")

Updated Dataset (No Missing Values)
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price
1	2023-11-24	CUST001	Male	34	Beauty	3	50	150	15.0	135.0
2	2023-02-27	CUST002	Female	26	Clothing	2	500	1000	100.0	900.0
3	2023-01-13	CUST003	Male	50	Electronics	1	30	30	3.0	27.0
4	2023-05-21	CUST004	Male	37	Clothing	1	500	500	50.0	450.0
5	2023-05-06	CUST005	Male	30	Beauty	2	50	100	10.0	90.0
6	2023-04-25	CUST006	Female	45	Beauty	1	30	30	3.0	27.0
7	2023-03-13	CUST007	Male	46	Clothing	2	25	50	5.0	45.0
8	2023-02-22	CUST008	Male	30	Electronics	4	25	100	10.0	90.0
9	2023-12-13	CUST009	Male	63	Electronics	2	300	600	60.0	540.0
10	2023-10-07	CUST010	Female	52	Clothing	4	50	200	20.0	180.0
11	2023-02-14	CUST011	Male	23	Clothing	2	50	100	10.0	90.0
12	2023-10-30	CUST012	Male	35	Beauty	3	25	75	7.5	67.5
13	2023-08-05	CUST013	Male	22	Electronics	3	500	1500	150.0	1350.0
14	2023-01-17	CUST014	Male	64	Clothing	4	30	120	12.0	108.0
15	2023-01-16	CUST015	Female	42	Electronics	4	500	2000	200.0	1800.0

Identify and remove duplicated data from your data set.

New_data = distinct(retail_sales_dataset)
knitr::kable(head(New_data, 15), caption = "De-duplicated Dataset (First 15 Rows)")

De-duplicated Dataset (First 15 Rows)
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price
1	2023-11-24	CUST001	Male	34	Beauty	3	50	150	15.0	135.0
2	2023-02-27	CUST002	Female	26	Clothing	2	500	1000	100.0	900.0
3	2023-01-13	CUST003	Male	50	Electronics	1	30	30	3.0	27.0
4	2023-05-21	CUST004	Male	37	Clothing	1	500	500	50.0	450.0
5	2023-05-06	CUST005	Male	30	Beauty	2	50	100	10.0	90.0
6	2023-04-25	CUST006	Female	45	Beauty	1	30	30	3.0	27.0
7	2023-03-13	CUST007	Male	46	Clothing	2	25	50	5.0	45.0
8	2023-02-22	CUST008	Male	30	Electronics	4	25	100	10.0	90.0
9	2023-12-13	CUST009	Male	63	Electronics	2	300	600	60.0	540.0
10	2023-10-07	CUST010	Female	52	Clothing	4	50	200	20.0	180.0
11	2023-02-14	CUST011	Male	23	Clothing	2	50	100	10.0	90.0
12	2023-10-30	CUST012	Male	35	Beauty	3	25	75	7.5	67.5
13	2023-08-05	CUST013	Male	22	Electronics	3	500	1500	150.0	1350.0
14	2023-01-17	CUST014	Male	64	Clothing	4	30	120	12.0	108.0
15	2023-01-16	CUST015	Female	42	Electronics	4	500	2000	200.0	1800.0

Reorder multiple rows in descending order.

#Choose Total amount and age for rearranging.
Reordered_data = retail_sales_dataset %>% arrange(desc(Total.Amount), desc(Age))

knitr::kable(head(Reordered_data, 10), 
             caption = "Top 10 Rows of Data (Ordered by Total Amount and Age, Descending)")

Top 10 Rows of Data (Ordered by Total Amount and Age, Descending)
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price
561	2023-05-27	CUST561	Female	64	Clothing	4	500	2000	200	1800
735	2023-10-04	CUST735	Female	64	Clothing	4	500	2000	200	1800
547	2023-03-07	CUST547	Male	63	Clothing	4	500	2000	200	1800
157	2023-06-24	CUST157	Male	62	Electronics	4	500	2000	200	1800
946	2023-05-08	CUST946	Male	62	Electronics	4	500	2000	200	1800
789	2023-09-30	CUST789	Female	61	Clothing	4	500	2000	200	1800
634	2023-10-08	CUST634	Male	60	Electronics	4	500	2000	200	1800
970	2023-05-16	CUST970	Male	59	Electronics	4	500	2000	200	1800
89	2023-10-01	CUST089	Female	55	Electronics	4	500	2000	200	1800
731	2023-05-10	CUST731	Male	54	Clothing	4	500	2000	200	1800

Rename some of the column names in your data set.

#Transaction.ID = Trans.ID
#Customer.ID = Cust.ID
#Product.Category = Prod.cat

Renamed_data= retail_sales_dataset %>% rename(Trans.ID = Transaction.ID, Cust.ID= Customer.ID, Prod.cat= Product.Category)
head(Renamed_data)

##   Trans.ID       Date Cust.ID Gender Age    Prod.cat Quantity Price.per.Unit
## 1        1 2023-11-24 CUST001   Male  34      Beauty        3             50
## 2        2 2023-02-27 CUST002 Female  26    Clothing        2            500
## 3        3 2023-01-13 CUST003   Male  50 Electronics        1             30
## 4        4 2023-05-21 CUST004   Male  37    Clothing        1            500
## 5        5 2023-05-06 CUST005   Male  30      Beauty        2             50
## 6        6 2023-04-25 CUST006 Female  45      Beauty        1             30
##   Total.Amount Discount.Price Final.Price
## 1          150             15         135
## 2         1000            100         900
## 3           30              3          27
## 4          500             50         450
## 5          100             10          90
## 6           30              3          27

Add new variables in your data frame by using a mathematical function ( for e.g. - multiply and existing column by 2 and add it as a new variable to your data frame).

Latest_Maths_data <- retail_sales_dataset %>% mutate(Price.per.2 = Price.per.Unit * 2)

# Display the first few rows of the data set with the new variable
knitr::kable(head(Latest_Maths_data), caption = "Dataset with New Variable 'Price.per.2' (First 6 Rows)")

Dataset with New Variable ‘Price.per.2’ (First 6 Rows)
Transaction.ID	Date	Customer.ID	Gender	Age	Product.Category	Quantity	Price.per.Unit	Total.Amount	Discount.Price	Final.Price	Price.per.2
1	2023-11-24	CUST001	Male	34	Beauty	3	50	150	15	135	100
2	2023-02-27	CUST002	Female	26	Clothing	2	500	1000	100	900	1000
3	2023-01-13	CUST003	Male	50	Electronics	1	30	30	3	27	60
4	2023-05-21	CUST004	Male	37	Clothing	1	500	500	50	450	1000
5	2023-05-06	CUST005	Male	30	Beauty	2	50	100	10	90	100
6	2023-04-25	CUST006	Female	45	Beauty	1	30	30	3	27	60

Create a training set using a random number generator engine.

set.seed(123)
Trained_1 = sample(1: nrow(retail_sales_dataset), 0.7* nrow(retail_sales_dataset))
Trained_data= retail_sales_dataset[Trained_1, ]

#Display the updated data set as a formatted table
knitr::kable(t(as.data.frame(summary(Trained_data))), caption = "Summary Statistics of Training Set")

Summary Statistics of Training Set
Var1
Var2	Transaction.ID	Transaction.ID	Transaction.ID	Transaction.ID	Transaction.ID	Transaction.ID	Date	Date	Date	Date	Date	Date	Customer.ID	Customer.ID	Customer.ID	Customer.ID	Customer.ID	Customer.ID	Gender	Gender	Gender	Gender	Gender	Gender	Age	Age	Age	Age	Age	Age	Product.Category	Product.Category	Product.Category	Product.Category	Product.Category	Product.Category	Quantity	Quantity	Quantity	Quantity	Quantity	Quantity	Price.per.Unit	Price.per.Unit	Price.per.Unit	Price.per.Unit	Price.per.Unit	Price.per.Unit	Total.Amount	Total.Amount	Total.Amount	Total.Amount	Total.Amount	Total.Amount	Discount.Price	Discount.Price	Discount.Price	Discount.Price	Discount.Price	Discount.Price	Final.Price	Final.Price	Final.Price	Final.Price	Final.Price	Final.Price
Freq	Min. : 2.0	1st Qu.: 251.8	Median : 496.0	Mean : 499.2	3rd Qu.: 746.2	Max. :1000.0	Length:700	Class :character	Mode :character	NA	NA	NA	Length:700	Class :character	Mode :character	NA	NA	NA	Length:700	Class :character	Mode :character	NA	NA	NA	Min. :18.0	1st Qu.:30.0	Median :42.0	Mean :41.6	3rd Qu.:54.0	Max. :64.0	Length:700	Class :character	Mode :character	NA	NA	NA	Min. :1.000	1st Qu.:1.000	Median :2.000	Mean :2.466	3rd Qu.:3.250	Max. :4.000	Min. : 25.0	1st Qu.: 30.0	Median : 50.0	Mean :185.4	3rd Qu.:300.0	Max. :500.0	Min. : 25.0	1st Qu.: 60.0	Median : 150.0	Mean : 452.9	3rd Qu.: 900.0	Max. :2000.0	Min. : 2.50	1st Qu.: 6.00	Median : 15.00	Mean : 45.29	3rd Qu.: 90.00	Max. :200.00	Min. : 22.5	1st Qu.: 54.0	Median : 135.0	Mean : 407.6	3rd Qu.: 810.0	Max. :1800.0

Print the summary statistics of your data set.

summary(retail_sales_dataset)

##  Transaction.ID       Date           Customer.ID           Gender         
##  Min.   :   1.0   Length:1000        Length:1000        Length:1000       
##  1st Qu.: 250.8   Class :character   Class :character   Class :character  
##  Median : 500.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 500.5                                                           
##  3rd Qu.: 750.2                                                           
##  Max.   :1000.0                                                           
##       Age        Product.Category      Quantity     Price.per.Unit 
##  Min.   :18.00   Length:1000        Min.   :1.000   Min.   : 25.0  
##  1st Qu.:29.00   Class :character   1st Qu.:1.000   1st Qu.: 30.0  
##  Median :42.00   Mode  :character   Median :3.000   Median : 50.0  
##  Mean   :41.39                      Mean   :2.514   Mean   :179.9  
##  3rd Qu.:53.00                      3rd Qu.:4.000   3rd Qu.:300.0  
##  Max.   :64.00                      Max.   :4.000   Max.   :500.0  
##   Total.Amount  Discount.Price   Final.Price    
##  Min.   :  25   Min.   :  2.5   Min.   :  22.5  
##  1st Qu.:  60   1st Qu.:  6.0   1st Qu.:  54.0  
##  Median : 135   Median : 13.5   Median : 121.5  
##  Mean   : 456   Mean   : 45.6   Mean   : 410.4  
##  3rd Qu.: 900   3rd Qu.: 90.0   3rd Qu.: 810.0  
##  Max.   :2000   Max.   :200.0   Max.   :1800.0

Use any of the numerical variables from the data set and perform the following statistical functions. Mean, Median, Mode, Range.

#Mean
Mean_total= mean(retail_sales_dataset$Total.Amount, na.rm = TRUE)

#Median
Median_total= median(retail_sales_dataset$Total.Amount, na.rm = TRUE)

#Mode
#A new mode function was defined.
get_mode = function(v){
  uniqv = unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Mode_total= get_mode(retail_sales_dataset$Total.Amount)

#Range
Range_total= range(retail_sales_dataset$Total.Amount, na.rm = TRUE)

# Create a data frame to display results
stats_summary <- data.frame(
  Statistic = c("Mean", "Median", "Mode", "Minimum", "Maximum"),
  Value = c(Mean_total, Median_total, Mode_total, Range_total[1], Range_total[2])
)

knitr::kable(stats_summary, caption = "Summary Statistics of Total Amount")

Summary Statistics of Total Amount
Statistic	Value
Mean	456
Median	135
Mode	50
Minimum	25
Maximum	2000

Plot a scatter plot for any 2 variables in your data set.

#Scatter plot was plotted for Quantity & Total amount
ggplot(retail_sales_dataset, aes(x= Quantity, y= Total.Amount)) + geom_point() + labs(title = "Scatter plot of Quantity vs Total amount")

Plot a bar plot for any 2 variables in your data set.

ggplot(retail_sales_dataset, aes(x= Quantity, y= Price.per.Unit)) + geom_bar(stat = "identity") + labs(title = "Bar plot for Quantity and price per unit")

Find the correlation between any 2 variables by applying least square linear regression model.

correlation = cor(retail_sales_dataset$Age, retail_sales_dataset$Total.Amount, use = "complete.obs")
lm_model_data= lm(retail_sales_dataset$Total.Amount~retail_sales_dataset$Age, data = retail_sales_dataset)
summary(lm_model_data)

## 
## Call:
## lm(formula = retail_sales_dataset$Total.Amount ~ retail_sales_dataset$Age, 
##     data = retail_sales_dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -486.5 -387.0 -305.7  416.4 1600.0 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               558.616     56.377   9.909   <2e-16 ***
## retail_sales_dataset$Age   -2.479      1.293  -1.917   0.0555 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 559.2 on 998 degrees of freedom
## Multiple R-squared:  0.003668,   Adjusted R-squared:  0.00267 
## F-statistic: 3.675 on 1 and 998 DF,  p-value: 0.05553

COMP 4033_Group6

2025-02-24