Loading Data

I am importing the libraries needed to run these notes.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

I will use my data-set about sales of shoes,for which you have approved.

library(readr)
#loading the data
my_data_2 <- read_delim("C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",delim=",")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 14970 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Inferential statistics. Confidence intervals, ...2, ...3, ...5, ......
## dbl (5): ...4, ...7, ...9, ...12, ...13
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the modified data frame
head(my_data_2)
## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹​`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

Cleaning my dataset

I am renaming my titles from{X1,X2} to {Invoice,Date},,etc to make the date more simple and clear.

new_names <- c("InvoiceNo",'Date', "Country", "ProductID",'Shop','Gender','Size(US)','Size (Europe)',   'Size (UK)','UnitPrice','Discount', 'Year','Month','SalePrice')

# Assign the new column names to the data frame
colnames(my_data_2) <- new_names

# Verify that the column names have been changed
colnames(my_data_2)
##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

I am removing the first 3 rows to remove null values and un-necessary titles for my data set

my_data_2 <- my_data_2[-c(1:3), ]


# Print the modified data frame
print(my_data_2)
## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>
my_data_2$SalePrice <- gsub("\\$", "", my_data_2$SalePrice)
my_data_2$SalePrice <- as.numeric(my_data_2$SalePrice)

class(my_data_2$SalePrice)
## [1] "numeric"
# removing $ for Unit Price

my_data_2$UnitPrice <- gsub("\\$", "", my_data_2$UnitPrice)

my_data_2$UnitPrice <- as.numeric(my_data_2$UnitPrice)

class(my_data_2$UnitPrice)
## [1] "numeric"

Applying group-by data frame to categorical variable and summarizing the Sales price by it.

library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
grouped_data_1 <- my_data_2 %>%
  group_by(Gender) %>%
  summarize(
    mean_unit_price=mean(UnitPrice),
    mean_sale_price=mean(SalePrice),
    no_of_people=n()
  )

print(grouped_data_1)
## # A tibble: 2 × 4
##   Gender mean_unit_price mean_sale_price no_of_people
##   <chr>            <dbl>           <dbl>        <int>
## 1 Female            164.            144.         6048
## 2 Male              164.            144.         8919
library(dbplyr)

grouped_data_2 <- my_data_2 %>%
  group_by(Country) %>%
  summarize(
    mean_unit_price=mean(UnitPrice),
    mean_sale_price=mean(SalePrice),
    Total_number_of_Shoes=n()
  )

print(grouped_data_2)
## # A tibble: 4 × 4
##   Country        mean_unit_price mean_sale_price Total_number_of_Shoes
##   <chr>                    <dbl>           <dbl>                 <int>
## 1 Canada                    165.            144.                  2952
## 2 Germany                   164.            144.                  4392
## 3 United Kingdom            166.            146.                  1737
## 4 United States             163.            144.                  5886
library(dbplyr)

Month <- c(1,2,3,4,5,6,7,8,9,10,11,12)

month_mapping <- c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
)

my_data_2$Month <- month_mapping[my_data_2$Month]

grouped_data_3 <- my_data_2 %>%
  group_by(Month) %>%
  summarize(
    mean_sale_price=mean(SalePrice),
    mean_size_of_shoes=mean(`Size(US)`),
    Shoes_sales_by_month=n()
  )

print(grouped_data_3)
## # A tibble: 12 × 4
##    Month     mean_sale_price mean_size_of_shoes Shoes_sales_by_month
##    <chr>               <dbl>              <dbl>                <int>
##  1 April                143.               9.19                 1197
##  2 August               143.               9.25                 1393
##  3 December             143.               9.14                 1148
##  4 February             143.               9.21                 1061
##  5 January              144.               9.09                 1064
##  6 July                 144.               9.23                 1413
##  7 June                 146.               9.21                 1381
##  8 March                145.               9.24                 1083
##  9 May                  145.               9.21                 1249
## 10 November             143.               9.29                 1154
## 11 October              145.               9.16                 1447
## 12 September            144.               9.13                 1377

Group all the categorical variable and summarizing by sales price and size of shoes.

library(dbplyr)

grouped_data_4 <- my_data_2 %>%
  group_by(Country, Gender, Month) %>%
  summarize(
    Mean_Price = mean(SalePrice),
    Mean_Size = mean(`Size(US)`),
    Total_Count = n()
  )
## `summarise()` has grouped output by 'Country', 'Gender'. You can override using
## the `.groups` argument.
print(grouped_data_4)
## # A tibble: 96 × 6
## # Groups:   Country, Gender [8]
##    Country Gender Month    Mean_Price Mean_Size Total_Count
##    <chr>   <chr>  <chr>         <dbl>     <dbl>       <int>
##  1 Canada  Female April          148.      8.63         106
##  2 Canada  Female August         145.      8.27          97
##  3 Canada  Female December       145.      8.55          85
##  4 Canada  Female February       148.      8.55          97
##  5 Canada  Female January        148.      8.14          79
##  6 Canada  Female July           144.      8.65          97
##  7 Canada  Female June           142.      8.28         121
##  8 Canada  Female March          140.      8.75          75
##  9 Canada  Female May            148.      8.58         105
## 10 Canada  Female November       145.      8.29          96
## # ℹ 86 more rows

Printing all the possibe combinations

library(dplyr)


combinations <- my_data_2 %>%
  group_by(Country, Gender, Month) %>%
  summarise(Count = n())
## `summarise()` has grouped output by 'Country', 'Gender'. You can override using
## the `.groups` argument.
all_possible_combinations <- expand.grid(
  Country = unique(my_data_2$Country),
  Gender = unique(my_data_2$Gender),
  Month = unique(my_data_2$Month)
)

print("All Possible Combinations:")
## [1] "All Possible Combinations:"
print(all_possible_combinations)
##           Country Gender     Month
## 1  United Kingdom   Male   January
## 2   United States   Male   January
## 3          Canada   Male   January
## 4         Germany   Male   January
## 5  United Kingdom Female   January
## 6   United States Female   January
## 7          Canada Female   January
## 8         Germany Female   January
## 9  United Kingdom   Male  February
## 10  United States   Male  February
## 11         Canada   Male  February
## 12        Germany   Male  February
## 13 United Kingdom Female  February
## 14  United States Female  February
## 15         Canada Female  February
## 16        Germany Female  February
## 17 United Kingdom   Male     March
## 18  United States   Male     March
## 19         Canada   Male     March
## 20        Germany   Male     March
## 21 United Kingdom Female     March
## 22  United States Female     March
## 23         Canada Female     March
## 24        Germany Female     March
## 25 United Kingdom   Male     April
## 26  United States   Male     April
## 27         Canada   Male     April
## 28        Germany   Male     April
## 29 United Kingdom Female     April
## 30  United States Female     April
## 31         Canada Female     April
## 32        Germany Female     April
## 33 United Kingdom   Male       May
## 34  United States   Male       May
## 35         Canada   Male       May
## 36        Germany   Male       May
## 37 United Kingdom Female       May
## 38  United States Female       May
## 39         Canada Female       May
## 40        Germany Female       May
## 41 United Kingdom   Male      June
## 42  United States   Male      June
## 43         Canada   Male      June
## 44        Germany   Male      June
## 45 United Kingdom Female      June
## 46  United States Female      June
## 47         Canada Female      June
## 48        Germany Female      June
## 49 United Kingdom   Male      July
## 50  United States   Male      July
## 51         Canada   Male      July
## 52        Germany   Male      July
## 53 United Kingdom Female      July
## 54  United States Female      July
## 55         Canada Female      July
## 56        Germany Female      July
## 57 United Kingdom   Male    August
## 58  United States   Male    August
## 59         Canada   Male    August
## 60        Germany   Male    August
## 61 United Kingdom Female    August
## 62  United States Female    August
## 63         Canada Female    August
## 64        Germany Female    August
## 65 United Kingdom   Male September
## 66  United States   Male September
## 67         Canada   Male September
## 68        Germany   Male September
## 69 United Kingdom Female September
## 70  United States Female September
## 71         Canada Female September
## 72        Germany Female September
## 73 United Kingdom   Male   October
## 74  United States   Male   October
## 75         Canada   Male   October
## 76        Germany   Male   October
## 77 United Kingdom Female   October
## 78  United States Female   October
## 79         Canada Female   October
## 80        Germany Female   October
## 81 United Kingdom   Male  November
## 82  United States   Male  November
## 83         Canada   Male  November
## 84        Germany   Male  November
## 85 United Kingdom Female  November
## 86  United States Female  November
## 87         Canada Female  November
## 88        Germany Female  November
## 89 United Kingdom   Male  December
## 90  United States   Male  December
## 91         Canada   Male  December
## 92        Germany   Male  December
## 93 United Kingdom Female  December
## 94  United States Female  December
## 95         Canada Female  December
## 96        Germany Female  December
missing_combinations <- anti_join(all_possible_combinations, combinations)
## Joining with `by = join_by(Country, Gender, Month)`
print("Missing Combinations:")
## [1] "Missing Combinations:"
print(missing_combinations)
## [1] Country Gender  Month  
## <0 rows> (or 0-length row.names)

Analyzing the most common combinations on the categorical variables

most_common_combinations <- combinations %>%
  arrange(desc(Count)) %>%
  head()

print("Most Common Combinations:")
## [1] "Most Common Combinations:"
print(most_common_combinations)
## # A tibble: 6 × 4
## # Groups:   Country, Gender [2]
##   Country       Gender Month     Count
##   <chr>         <chr>  <chr>     <int>
## 1 United States Male   October     345
## 2 United States Male   July        328
## 3 United States Male   September   320
## 4 United States Male   August      314
## 5 United States Male   June        309
## 6 Germany       Male   June        294

Printing the least common combinations between categorical variables

least_common_combinations <- combinations %>%
  arrange(Count) %>%
  head()

print(least_common_combinations)
## # A tibble: 6 × 4
## # Groups:   Country, Gender [1]
##   Country        Gender Month    Count
##   <chr>          <chr>  <chr>    <int>
## 1 United Kingdom Female June        34
## 2 United Kingdom Female February    43
## 3 United Kingdom Female January     52
## 4 United Kingdom Female October     52
## 5 United Kingdom Female December    53
## 6 United Kingdom Female July        57