I am importing the libraries needed to run these notes.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
I will use my data-set about sales of shoes,for which you have approved.
library(readr)
#loading the data
my_data_2 <- read_delim("C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",delim=",")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 14970 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Inferential statistics. Confidence intervals, ...2, ...3, ...5, ......
## dbl (5): ...4, ...7, ...9, ...12, ...13
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the modified data frame
head(my_data_2)
## # A tibble: 6 × 14
## Inferential statistics…¹ ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 2 <NA> <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 3 InvoiceNo Date Coun… NA Shop Gend… NA Size… NA "Uni…
## 4 52389 1/1/… Unit… 2152 UK2 Male 11 44 10.5 "$15…
## 5 52390 1/1/… Unit… 2230 US15 Male 11.5 44-45 11 "$19…
## 6 52391 1/1/… Cana… 2160 CAN7 Male 9.5 42-43 9 "$14…
## # ℹ abbreviated name: ¹`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>
I am renaming my titles from{X1,X2} to {Invoice,Date},,etc to make the date more simple and clear.
new_names <- c("InvoiceNo",'Date', "Country", "ProductID",'Shop','Gender','Size(US)','Size (Europe)', 'Size (UK)','UnitPrice','Discount', 'Year','Month','SalePrice')
# Assign the new column names to the data frame
colnames(my_data_2) <- new_names
# Verify that the column names have been changed
colnames(my_data_2)
## [1] "InvoiceNo" "Date" "Country" "ProductID"
## [5] "Shop" "Gender" "Size(US)" "Size (Europe)"
## [9] "Size (UK)" "UnitPrice" "Discount" "Year"
## [13] "Month" "SalePrice"
I am removing the first 3 rows to remove null values and un-necessary titles for my data set
my_data_2 <- my_data_2[-c(1:3), ]
# Print the modified data frame
print(my_data_2)
## # A tibble: 14,967 × 14
## InvoiceNo Date Country ProductID Shop Gender `Size(US)` `Size (Europe)`
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 52389 1/1/2014 United … 2152 UK2 Male 11 44
## 2 52390 1/1/2014 United … 2230 US15 Male 11.5 44-45
## 3 52391 1/1/2014 Canada 2160 CAN7 Male 9.5 42-43
## 4 52392 1/1/2014 United … 2234 US6 Female 9.5 40
## 5 52393 1/1/2014 United … 2222 UK4 Female 9 39-40
## 6 52394 1/1/2014 United … 2173 US15 Male 10.5 43-44
## 7 52395 1/2/2014 Germany 2200 GER2 Female 9 39-40
## 8 52396 1/2/2014 Canada 2238 CAN5 Male 10 43
## 9 52397 1/2/2014 United … 2191 US13 Male 10.5 43-44
## 10 52398 1/2/2014 United … 2237 UK1 Female 9 39-40
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## # Year <dbl>, Month <dbl>, SalePrice <chr>
my_data_2$SalePrice <- gsub("\\$", "", my_data_2$SalePrice)
my_data_2$SalePrice <- as.numeric(my_data_2$SalePrice)
class(my_data_2$SalePrice)
## [1] "numeric"
# removing $ for Unit Price
my_data_2$UnitPrice <- gsub("\\$", "", my_data_2$UnitPrice)
my_data_2$UnitPrice <- as.numeric(my_data_2$UnitPrice)
class(my_data_2$UnitPrice)
## [1] "numeric"
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
grouped_data_1 <- my_data_2 %>%
group_by(Gender) %>%
summarize(
mean_unit_price=mean(UnitPrice),
mean_sale_price=mean(SalePrice),
no_of_people=n()
)
print(grouped_data_1)
## # A tibble: 2 × 4
## Gender mean_unit_price mean_sale_price no_of_people
## <chr> <dbl> <dbl> <int>
## 1 Female 164. 144. 6048
## 2 Male 164. 144. 8919
library(dbplyr)
grouped_data_2 <- my_data_2 %>%
group_by(Country) %>%
summarize(
mean_unit_price=mean(UnitPrice),
mean_sale_price=mean(SalePrice),
Total_number_of_Shoes=n()
)
print(grouped_data_2)
## # A tibble: 4 × 4
## Country mean_unit_price mean_sale_price Total_number_of_Shoes
## <chr> <dbl> <dbl> <int>
## 1 Canada 165. 144. 2952
## 2 Germany 164. 144. 4392
## 3 United Kingdom 166. 146. 1737
## 4 United States 163. 144. 5886
library(dbplyr)
Month <- c(1,2,3,4,5,6,7,8,9,10,11,12)
month_mapping <- c(
"January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"
)
my_data_2$Month <- month_mapping[my_data_2$Month]
grouped_data_3 <- my_data_2 %>%
group_by(Month) %>%
summarize(
mean_sale_price=mean(SalePrice),
mean_size_of_shoes=mean(`Size(US)`),
Shoes_sales_by_month=n()
)
print(grouped_data_3)
## # A tibble: 12 × 4
## Month mean_sale_price mean_size_of_shoes Shoes_sales_by_month
## <chr> <dbl> <dbl> <int>
## 1 April 143. 9.19 1197
## 2 August 143. 9.25 1393
## 3 December 143. 9.14 1148
## 4 February 143. 9.21 1061
## 5 January 144. 9.09 1064
## 6 July 144. 9.23 1413
## 7 June 146. 9.21 1381
## 8 March 145. 9.24 1083
## 9 May 145. 9.21 1249
## 10 November 143. 9.29 1154
## 11 October 145. 9.16 1447
## 12 September 144. 9.13 1377
library(dbplyr)
grouped_data_4 <- my_data_2 %>%
group_by(Country, Gender, Month) %>%
summarize(
Mean_Price = mean(SalePrice),
Mean_Size = mean(`Size(US)`),
Total_Count = n()
)
## `summarise()` has grouped output by 'Country', 'Gender'. You can override using
## the `.groups` argument.
print(grouped_data_4)
## # A tibble: 96 × 6
## # Groups: Country, Gender [8]
## Country Gender Month Mean_Price Mean_Size Total_Count
## <chr> <chr> <chr> <dbl> <dbl> <int>
## 1 Canada Female April 148. 8.63 106
## 2 Canada Female August 145. 8.27 97
## 3 Canada Female December 145. 8.55 85
## 4 Canada Female February 148. 8.55 97
## 5 Canada Female January 148. 8.14 79
## 6 Canada Female July 144. 8.65 97
## 7 Canada Female June 142. 8.28 121
## 8 Canada Female March 140. 8.75 75
## 9 Canada Female May 148. 8.58 105
## 10 Canada Female November 145. 8.29 96
## # ℹ 86 more rows
library(dplyr)
combinations <- my_data_2 %>%
group_by(Country, Gender, Month) %>%
summarise(Count = n())
## `summarise()` has grouped output by 'Country', 'Gender'. You can override using
## the `.groups` argument.
all_possible_combinations <- expand.grid(
Country = unique(my_data_2$Country),
Gender = unique(my_data_2$Gender),
Month = unique(my_data_2$Month)
)
print("All Possible Combinations:")
## [1] "All Possible Combinations:"
print(all_possible_combinations)
## Country Gender Month
## 1 United Kingdom Male January
## 2 United States Male January
## 3 Canada Male January
## 4 Germany Male January
## 5 United Kingdom Female January
## 6 United States Female January
## 7 Canada Female January
## 8 Germany Female January
## 9 United Kingdom Male February
## 10 United States Male February
## 11 Canada Male February
## 12 Germany Male February
## 13 United Kingdom Female February
## 14 United States Female February
## 15 Canada Female February
## 16 Germany Female February
## 17 United Kingdom Male March
## 18 United States Male March
## 19 Canada Male March
## 20 Germany Male March
## 21 United Kingdom Female March
## 22 United States Female March
## 23 Canada Female March
## 24 Germany Female March
## 25 United Kingdom Male April
## 26 United States Male April
## 27 Canada Male April
## 28 Germany Male April
## 29 United Kingdom Female April
## 30 United States Female April
## 31 Canada Female April
## 32 Germany Female April
## 33 United Kingdom Male May
## 34 United States Male May
## 35 Canada Male May
## 36 Germany Male May
## 37 United Kingdom Female May
## 38 United States Female May
## 39 Canada Female May
## 40 Germany Female May
## 41 United Kingdom Male June
## 42 United States Male June
## 43 Canada Male June
## 44 Germany Male June
## 45 United Kingdom Female June
## 46 United States Female June
## 47 Canada Female June
## 48 Germany Female June
## 49 United Kingdom Male July
## 50 United States Male July
## 51 Canada Male July
## 52 Germany Male July
## 53 United Kingdom Female July
## 54 United States Female July
## 55 Canada Female July
## 56 Germany Female July
## 57 United Kingdom Male August
## 58 United States Male August
## 59 Canada Male August
## 60 Germany Male August
## 61 United Kingdom Female August
## 62 United States Female August
## 63 Canada Female August
## 64 Germany Female August
## 65 United Kingdom Male September
## 66 United States Male September
## 67 Canada Male September
## 68 Germany Male September
## 69 United Kingdom Female September
## 70 United States Female September
## 71 Canada Female September
## 72 Germany Female September
## 73 United Kingdom Male October
## 74 United States Male October
## 75 Canada Male October
## 76 Germany Male October
## 77 United Kingdom Female October
## 78 United States Female October
## 79 Canada Female October
## 80 Germany Female October
## 81 United Kingdom Male November
## 82 United States Male November
## 83 Canada Male November
## 84 Germany Male November
## 85 United Kingdom Female November
## 86 United States Female November
## 87 Canada Female November
## 88 Germany Female November
## 89 United Kingdom Male December
## 90 United States Male December
## 91 Canada Male December
## 92 Germany Male December
## 93 United Kingdom Female December
## 94 United States Female December
## 95 Canada Female December
## 96 Germany Female December
missing_combinations <- anti_join(all_possible_combinations, combinations)
## Joining with `by = join_by(Country, Gender, Month)`
print("Missing Combinations:")
## [1] "Missing Combinations:"
print(missing_combinations)
## [1] Country Gender Month
## <0 rows> (or 0-length row.names)
most_common_combinations <- combinations %>%
arrange(desc(Count)) %>%
head()
print("Most Common Combinations:")
## [1] "Most Common Combinations:"
print(most_common_combinations)
## # A tibble: 6 × 4
## # Groups: Country, Gender [2]
## Country Gender Month Count
## <chr> <chr> <chr> <int>
## 1 United States Male October 345
## 2 United States Male July 328
## 3 United States Male September 320
## 4 United States Male August 314
## 5 United States Male June 309
## 6 Germany Male June 294
least_common_combinations <- combinations %>%
arrange(Count) %>%
head()
print(least_common_combinations)
## # A tibble: 6 × 4
## # Groups: Country, Gender [1]
## Country Gender Month Count
## <chr> <chr> <chr> <int>
## 1 United Kingdom Female June 34
## 2 United Kingdom Female February 43
## 3 United Kingdom Female January 52
## 4 United Kingdom Female October 52
## 5 United Kingdom Female December 53
## 6 United Kingdom Female July 57