This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Before we get started, we will import the libraries needed to run these notes.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
I will use my dataset about shoe sales, which you have already approved.
library(readr)
# Read the comma-separated shoe-shop export into a tibble.
# The file starts with banner rows rather than a header, so readr
# auto-generates the column names (`...2`, `...3`, ...).
my_data <- read_delim(
  file = "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv",
  delim = ","
)
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 14970 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Inferential statistics. Confidence intervals, ...2, ...3, ...5, ......
## dbl (5): ...4, ...7, ...9, ...12, ...13
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the raw import (nothing has been modified yet)
head(my_data)
## # A tibble: 6 × 14
## Inferential statistics…¹ ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 2 <NA> <NA> <NA> NA <NA> <NA> NA <NA> NA <NA>
## 3 InvoiceNo Date Coun… NA Shop Gend… NA Size… NA "Uni…
## 4 52389 1/1/… Unit… 2152 UK2 Male 11 44 10.5 "$15…
## 5 52390 1/1/… Unit… 2230 US15 Male 11.5 44-45 11 "$19…
## 6 52391 1/1/… Cana… 2160 CAN7 Male 9.5 42-43 9 "$14…
## # ℹ abbreviated name: ¹​`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>
I am renaming the columns from the auto-generated names (`...2`, `...3`, etc.) to descriptive names (`InvoiceNo`, `Date`, etc.) to make the data simpler and clearer.
# Descriptive column titles, listed in the same order as the file's columns
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)
# Replace the auto-generated titles with the descriptive ones
names(my_data) <- new_names
# Confirm that the rename took effect
names(my_data)
## [1] "InvoiceNo" "Date" "Country" "ProductID"
## [5] "Shop" "Gender" "Size(US)" "Size (Europe)"
## [9] "Size (UK)" "UnitPrice" "Discount" "Year"
## [13] "Month" "SalePrice"
Removing the first 3 rows gets rid of the null values and the unnecessary title rows in my data set.
# Drop the three leading non-data rows: the shop banner, an all-NA row,
# and the header row that was read in as data
my_data <- my_data[-seq_len(3), ]
# Print the modified data frame
I have removed the first 3 rows and renamed the column titles to make the data easier to understand.
# Tibble printing shows only the first 10 rows and the columns that fit;
# the remaining rows and columns are summarised in the footer
print(my_data)
## # A tibble: 14,967 × 14
## InvoiceNo Date Country ProductID Shop Gender `Size(US)` `Size (Europe)`
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 52389 1/1/2014 United … 2152 UK2 Male 11 44
## 2 52390 1/1/2014 United … 2230 US15 Male 11.5 44-45
## 3 52391 1/1/2014 Canada 2160 CAN7 Male 9.5 42-43
## 4 52392 1/1/2014 United … 2234 US6 Female 9.5 40
## 5 52393 1/1/2014 United … 2222 UK4 Female 9 39-40
## 6 52394 1/1/2014 United … 2173 US15 Male 10.5 43-44
## 7 52395 1/2/2014 Germany 2200 GER2 Female 9 39-40
## 8 52396 1/2/2014 Canada 2238 CAN5 Male 10 43
## 9 52397 1/2/2014 United … 2191 US13 Male 10.5 43-44
## 10 52398 1/2/2014 United … 2237 UK1 Female 9 39-40
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## # Year <dbl>, Month <dbl>, SalePrice <chr>
# Number of sales records per country
category_counts <- table(my_data[["Country"]])
print(category_counts)
##
## Canada Germany United Kingdom United States
## 2952 4392 1737 5886
# Number of sales records per gender
category_counts_2 <- table(my_data[["Gender"]])
print(category_counts_2)
##
## Female Male
## 6048 8919
# Number of sales records per year
category_counts_3 <- table(my_data[["Year"]])
print(category_counts_3)
##
## 2014 2015 2016
## 2753 4848 7366
# Number of sales records per calendar month
category_counts_4 <- table(my_data[["Month"]])
print(category_counts_4)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 1064 1061 1083 1197 1249 1381 1413 1393 1377 1447 1154 1148
I am removing the dollar sign from the SalePrice column so that I can perform numerical calculations on it.
# Remove every "$" from the price text, then parse what is left as a number
my_data$SalePrice <- as.numeric(gsub("\\$", "", my_data$SalePrice))
# Check the resulting column type
class(my_data$SalePrice)
## [1] "numeric"
# Summary statistics for the sale price.
# None of these calls set na.rm = TRUE, so they rely on SalePrice
# containing no missing values.
numeric_column <- my_data$SalePrice
# Extremes
min_value <- min(numeric_column)
max_value <- max(numeric_column)
# Central tendency
mean_value <- mean(numeric_column)
median_value <- median(numeric_column)
# Range as a single number (max - min); range() on its own returns
# the pair c(min, max), not the difference
range_value <- max_value - min_value
# First and third quartiles
quantiles <- quantile(numeric_column, probs = c(0.25, 0.75))
# Print each statistic on its own labelled line; cat() separates its
# arguments with a space, so every line ends with " \n"
cat("Minimum Value:", min_value, "\n")
## Minimum Value: 64.5
cat("Maximum Value:", max_value, "\n")
## Maximum Value: 199
cat("Mean Value:", mean_value, "\n")
## Mean Value: 143.9879
cat("Median Value:", median_value, "\n")
## Median Value: 149
# quantiles[1] is the 0.25 quantile and quantiles[2] the 0.75 quantile
cat("25th Percentile:", quantiles[1], "\n")
## 25th Percentile: 125.1
cat("75th Percentile:", quantiles[2], "\n")
## 75th Percentile: 169
# Country is a character column, so summary() reports only its
# length, class and mode rather than per-country counts
summary(my_data$Country)
## Length Class Mode
## 14967 character character
# For the numeric sale price, summary() gives min, quartiles, median, mean and max
summary(numeric_column)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 64.5 125.1 149.0 144.0 169.0 199.0
# Number of sales records for each US shoe size
shoe_length <- table(my_data[["Size(US)"]])
print(shoe_length)
##
## 4.5 5 5.5 6 6.5 7 7.5 8 8.5 9 9.5 10 10.5 11 11.5 12
## 51 36 63 186 342 537 951 1656 1716 2301 2346 1578 1506 597 459 318
## 13 14 15
## 87 126 111
The UK has a slightly higher mean sale price than the other countries (see the mean sale price by country computed further below).
Sales of specific products
# Overall number of sales records per country
no_of_sales_by_country <- table(my_data[["Country"]])
# Cross-tabulation: one row per product ID, one column per country
product_country_table <- table(my_data[["ProductID"]], my_data[["Country"]])
# Show the product-by-country counts
print(product_country_table)
##
## Canada Germany United Kingdom United States
## 2147 36 48 6 75
## 2148 27 42 9 69
## 2149 27 54 15 66
## 2150 30 48 15 57
## 2151 24 45 18 69
## 2152 39 48 6 81
## 2153 27 33 18 51
## 2154 24 48 12 63
## 2155 30 21 15 54
## 2156 33 63 15 33
## 2157 30 60 21 72
## 2158 36 57 21 84
## 2159 18 39 15 48
## 2160 48 39 30 39
## 2161 24 39 18 69
## 2162 15 21 18 51
## 2163 6 48 3 66
## 2164 27 33 12 27
## 2165 30 51 21 51
## 2166 18 36 15 54
## 2167 18 39 21 51
## 2168 39 45 18 54
## 2169 42 48 12 60
## 2170 18 39 27 39
## 2171 39 36 15 51
## 2172 33 69 30 57
## 2173 36 54 15 66
## 2174 36 45 18 45
## 2175 27 45 15 69
## 2176 36 42 21 54
## 2177 33 36 6 87
## 2178 30 42 18 60
## 2179 39 69 15 66
## 2180 30 33 30 57
## 2181 30 51 21 60
## 2182 24 54 18 48
## 2183 45 48 24 66
## 2184 15 57 30 81
## 2185 36 39 21 66
## 2186 27 45 12 87
## 2187 24 45 18 54
## 2188 24 33 24 60
## 2189 18 51 24 51
## 2190 54 45 27 81
## 2191 24 42 15 75
## 2192 51 60 6 84
## 2193 24 51 18 54
## 2194 18 33 12 54
## 2195 42 42 18 48
## 2196 33 42 12 54
## 2197 15 69 6 45
## 2198 36 45 21 54
## 2199 27 54 15 33
## 2200 18 57 12 54
## 2201 36 51 21 33
## 2202 45 39 21 66
## 2203 24 57 12 63
## 2204 45 57 18 51
## 2205 27 39 21 75
## 2206 27 39 24 78
## 2207 24 42 36 66
## 2208 33 54 12 51
## 2209 36 57 24 66
## 2210 12 36 9 66
## 2211 36 54 18 42
## 2212 27 57 21 66
## 2213 60 42 15 87
## 2214 60 27 30 63
## 2215 36 48 27 48
## 2216 39 30 24 45
## 2217 39 45 15 75
## 2218 27 36 6 60
## 2219 48 24 9 63
## 2220 48 48 15 72
## 2221 21 48 24 72
## 2222 36 45 18 84
## 2223 24 66 9 72
## 2224 15 48 27 66
## 2225 24 69 30 63
## 2226 45 60 15 84
## 2227 15 30 18 84
## 2228 39 30 21 51
## 2229 24 39 12 39
## 2230 27 27 21 48
## 2231 42 45 21 57
## 2232 39 36 24 69
## 2233 12 39 24 81
## 2234 24 57 9 69
## 2235 36 21 21 63
## 2236 15 54 30 81
## 2237 36 45 33 63
## 2238 39 48 33 51
## 2239 33 54 15 87
## 2240 33 45 3 60
## 2241 39 69 12 51
## 2242 15 57 21 51
# Overall sales count per country, for comparison with the per-product breakdown
print(no_of_sales_by_country)
##
## Canada Germany United Kingdom United States
## 2952 4392 1737 5886
The USA has more sales than the UK, Canada, and Germany.
# Overall number of sales records per gender
shoes_by_gender <- table(my_data[["Gender"]])
# Cross-tabulation of product ID against gender (computed but not printed here)
product_gender <- table(my_data[["ProductID"]], my_data[["Gender"]])
# Show the per-gender totals
print(shoes_by_gender)
##
## Female Male
## 6048 8919
Men generally buy more shoes than women. Most of the time, men also buy more units of a particular shoe than women.
# Number of sales records per calendar month (1 = January ... 12 = December).
# Assigned with `<-`, matching every other assignment in this document.
sales_by_month <- table(my_data$Month)
print(sales_by_month)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 1064 1061 1083 1197 1249 1381 1413 1393 1377 1447 1154 1148
October and July have the highest number of sales over the entire period.
# Number of sales records at each discount level
discounts <- table(my_data[["Discount"]])
print(discounts)
##
## 0% 10% 20% 30% 50%
## 8295 1638 1605 1716 1713
For this data, products with no discount had more sales than discounted products.
By analyzing the above data, we found that the USA has more sales than the UK, Germany, and Canada; men buy more shoes than women; and July and October have the most sales during the period.
# Mean sale price per country.
# The formula refers to the bare column name so that it is looked up in
# `data`; writing `my_data$Country` bypasses `data` and names the grouping
# column of the result "my_data$Country".
mean_prices <- aggregate(SalePrice ~ Country, data = my_data, FUN = mean)
# Sort countries from highest to lowest mean sale price
mean_prices <- mean_prices[order(-mean_prices$SalePrice), ]
# Print the sorted result
print(mean_prices)
## Country SalePrice
## 3 United Kingdom 145.5059
## 1 Canada 144.2290
## 4 United States 143.7274
## 2 Germany 143.5747
# Total sale value for every country/year combination
total_sales_by_country <- aggregate(
  SalePrice ~ Country + Year,
  data = my_data,
  FUN = sum
)
print(total_sales_by_country)
## Country Year SalePrice
## 1 Canada 2014 87686.6
## 2 Germany 2014 102608.8
## 3 United Kingdom 2014 56580.1
## 4 United States 2014 149689.1
## 5 Canada 2015 141638.9
## 6 Germany 2015 218315.4
## 7 United Kingdom 2015 64180.8
## 8 United States 2015 280079.1
## 9 Canada 2016 196438.4
## 10 Germany 2016 309655.7
## 11 United Kingdom 2016 131982.8
## 12 United States 2016 416211.4
library(ggplot2)
# Bar chart: number of sales records per country
ggplot(my_data, aes(x = Country)) +
  geom_bar() +
  labs(
    title = "Sales Count by Country",
    x = "Country",
    y = "Count of Sales"
  )
library(ggplot2)
# Bar chart: number of sales records per gender, drawn in orange.
# A constant colour belongs on the layer, not inside aes(): inside aes()
# the string "orange" is treated as a data value, which produces the
# default palette colour and a spurious legend.
ggplot(data = my_data, aes(x = Gender)) +
  geom_bar(fill = "orange") +
  labs(
    title = "Sales Count by male and female ",
    x = "Gender",
    y = "Count of Sales"
  )
library(ggplot2)
# Point plot of sale year against country, with one y-axis break per whole year
ggplot(my_data, aes(x = Country, y = Year)) +
  geom_point() +
  labs(
    title = "point plot of Year by Country (2014-2016)",
    x = "Country",
    y = "Year"
  ) +
  scale_y_continuous(
    breaks = seq(min(my_data$Year), max(my_data$Year), by = 1)
  )
# Box plot of US shoe size for each gender.
# A box plot needs a numeric y variable, so y is the numeric `Size(US)`
# column (matching the "Size in US" axis label) rather than the categorical
# `Shop` column. The title is passed as `title =`; an unnamed string given
# to labs() is not treated as the plot title.
# NOTE(review): the x variable is kept as Gender, as in the original
# mapping; if a per-shop comparison was intended, map x = Shop instead.
ggplot(data = my_data) +
  geom_boxplot(mapping = aes(x = Gender, y = `Size(US)`)) +
  labs(
    title = "Box plot of US shoe size by gender",
    x = "Gender",
    y = "Size in US"
  )