R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading Data

Before we get started, we will import the libraries needed to run these notes.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

I will use my dataset about shoe sales, which you have already approved.

library(readr)
# Load the raw comma-separated shoe-shop export into a tibble
csv_path <- "C:/Users/Surya CST/Documents/CSV_files/Bundy_Shoe_Shop.csv"
my_data <- read_delim(csv_path, delim = ",")

## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 14970 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Inferential statistics. Confidence intervals, ...2, ...3, ...5, ......
## dbl (5): ...4, ...7, ...9, ...12, ...13
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows of the freshly imported data
head(my_data, n = 6)

## # A tibble: 6 × 14
##   Inferential statistics…¹ ...2  ...3   ...4 ...5  ...6   ...7 ...8   ...9 ...10
##   <chr>                    <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Al Bundy's shoe shop     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 2 <NA>                     <NA>  <NA>     NA <NA>  <NA>   NA   <NA>   NA    <NA>
## 3 InvoiceNo                Date  Coun…    NA Shop  Gend…  NA   Size…  NA   "Uni…
## 4 52389                    1/1/… Unit…  2152 UK2   Male   11   44     10.5 "$15…
## 5 52390                    1/1/… Unit…  2230 US15  Male   11.5 44-45  11   "$19…
## 6 52391                    1/1/… Cana…  2160 CAN7  Male    9.5 42-43   9   "$14…
## # ℹ abbreviated name: ¹`Inferential statistics. Confidence intervals`
## # ℹ 4 more variables: ...11 <chr>, ...12 <dbl>, ...13 <dbl>, ...14 <chr>

I am renaming the columns from the auto-generated names (`...2`, `...3`, etc.) to descriptive names (`InvoiceNo`, `Date`, etc.) to make the data simpler and clearer.

# Descriptive headers, listed in the same left-to-right order as the 14 columns
new_names <- c(
  "InvoiceNo", "Date", "Country", "ProductID", "Shop", "Gender",
  "Size(US)", "Size (Europe)", "Size (UK)", "UnitPrice", "Discount",
  "Year", "Month", "SalePrice"
)

# Replace the placeholder headers with the descriptive ones
names(my_data) <- new_names

# Confirm the rename took effect
colnames(my_data)

##  [1] "InvoiceNo"     "Date"          "Country"       "ProductID"    
##  [5] "Shop"          "Gender"        "Size(US)"      "Size (Europe)"
##  [9] "Size (UK)"     "UnitPrice"     "Discount"      "Year"         
## [13] "Month"         "SalePrice"

Next, I remove the first 3 rows, which contain only null values and unnecessary title text.

# Discard the three leading rows: shop title, blank row and embedded header row
my_data <- my_data[-seq_len(3), ]

# Print the modified data frame

I have removed the first 3 rows and renamed the columns so that the data is easier to understand.

# Display the tibble after the row removal and column renaming
print(my_data)

## # A tibble: 14,967 × 14
##    InvoiceNo Date     Country  ProductID Shop  Gender `Size(US)` `Size (Europe)`
##    <chr>     <chr>    <chr>        <dbl> <chr> <chr>       <dbl> <chr>          
##  1 52389     1/1/2014 United …      2152 UK2   Male         11   44             
##  2 52390     1/1/2014 United …      2230 US15  Male         11.5 44-45          
##  3 52391     1/1/2014 Canada        2160 CAN7  Male          9.5 42-43          
##  4 52392     1/1/2014 United …      2234 US6   Female        9.5 40             
##  5 52393     1/1/2014 United …      2222 UK4   Female        9   39-40          
##  6 52394     1/1/2014 United …      2173 US15  Male         10.5 43-44          
##  7 52395     1/2/2014 Germany       2200 GER2  Female        9   39-40          
##  8 52396     1/2/2014 Canada        2238 CAN5  Male         10   43             
##  9 52397     1/2/2014 United …      2191 US13  Male         10.5 43-44          
## 10 52398     1/2/2014 United …      2237 UK1   Female        9   39-40          
## # ℹ 14,957 more rows
## # ℹ 6 more variables: `Size (UK)` <dbl>, UnitPrice <chr>, Discount <chr>,
## #   Year <dbl>, Month <dbl>, SalePrice <chr>

For categorical data

# Tally how many sales were recorded in each country
category_counts <- table(my_data[["Country"]])

# Show the per-country tallies
print(category_counts)

## 
##         Canada        Germany United Kingdom  United States 
##           2952           4392           1737           5886

# Tally how many sales were recorded for each gender
category_counts_2 <- table(my_data[["Gender"]])

# Show the per-gender tallies
print(category_counts_2)

## 
## Female   Male 
##   6048   8919

# Tally how many sales were recorded in each year
category_counts_3 <- table(my_data[["Year"]])

# Show the per-year tallies
print(category_counts_3)

## 
## 2014 2015 2016 
## 2753 4848 7366

# Tally how many sales were recorded in each calendar month
category_counts_4 <- table(my_data[["Month"]])

# Show the per-month tallies
print(category_counts_4)

## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
## 1064 1061 1083 1197 1249 1381 1413 1393 1377 1447 1154 1148

Numerical columns

I am removing the dollar sign from the SalePrice column so that numerical calculations can be performed on it.

# Strip every literal "$" from the price text, then convert what is left to numeric
price_text <- gsub("$", "", my_data$SalePrice, fixed = TRUE)
my_data$SalePrice <- as.numeric(price_text)

# Confirm the column now holds numbers
class(my_data$SalePrice)

## [1] "numeric"

# Work on the cleaned sale prices as a plain numeric vector
numeric_column <- my_data$SalePrice

# Minimum value
min_value <- min(numeric_column)

# Maximum value
max_value <- max(numeric_column)

# Mean (central tendency)
mean_value <- mean(numeric_column)

# Median (central tendency)
median_value <- median(numeric_column)

# Range (difference between max and min). range() on its own returns the pair
# c(min, max), so diff() is applied to collapse it into a single spread value.
range_value <- diff(range(numeric_column))

# First and third quartiles (25th and 75th percentiles)
quantiles <- quantile(numeric_column, probs = c(0.25, 0.75))

# Print each summary statistic on its own labelled line; quantiles[1] and
# quantiles[2] are the 25th and 75th percentiles requested via `probs`
cat("Minimum Value:", min_value, "\n")

## Minimum Value: 64.5

cat("Maximum Value:", max_value, "\n")

## Maximum Value: 199

cat("Mean Value:", mean_value, "\n")

## Mean Value: 143.9879

cat("Median Value:", median_value, "\n")

## Median Value: 149

cat("25th Percentile:", quantiles[1], "\n")

## 25th Percentile: 125.1

cat("75th Percentile:", quantiles[2], "\n")

## 75th Percentile: 169

Summarizing the categorical column and the numerical column

# Country is stored as character, so summary() on it would only report length,
# class and mode; converting to a factor yields the per-country counts instead
summary(factor(my_data$Country))

##         Canada        Germany United Kingdom  United States 
##           2952           4392           1737           5886

# Min, quartiles, median, mean and max of the sale prices
summary(numeric_column)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    64.5   125.1   149.0   144.0   169.0   199.0

Section B (Analyzing the data by asking some questions)

Does the sale price vary by region?

# Mean sale price per country: this is the figure the question above asks about
price_by_country <- aggregate(SalePrice ~ Country, data = my_data, FUN = mean)

print(price_by_country)

##          Country SalePrice
## 1         Canada  144.2290
## 2        Germany  143.5747
## 3 United Kingdom  145.5059
## 4  United States  143.7274

# Supplementary: number of sales for each US shoe size
shoe_length <- table(my_data$`Size(US)`)

print(shoe_length)

## 
##  4.5    5  5.5    6  6.5    7  7.5    8  8.5    9  9.5   10 10.5   11 11.5   12 
##   51   36   63  186  342  537  951 1656 1716 2301 2346 1578 1506  597  459  318 
##   13   14   15 
##   87  126  111

The United Kingdom has a slightly higher mean sale price than the other countries.

Sales of specific products

# Cross-tabulate sales: one row per product ID, one column per country
product_country_table <- table(my_data[["ProductID"]], my_data[["Country"]])
# Overall number of sales per country
no_of_sales_by_country <- table(my_data[["Country"]])

# Show the product-by-country breakdown
print(product_country_table)

##       
##        Canada Germany United Kingdom United States
##   2147     36      48              6            75
##   2148     27      42              9            69
##   2149     27      54             15            66
##   2150     30      48             15            57
##   2151     24      45             18            69
##   2152     39      48              6            81
##   2153     27      33             18            51
##   2154     24      48             12            63
##   2155     30      21             15            54
##   2156     33      63             15            33
##   2157     30      60             21            72
##   2158     36      57             21            84
##   2159     18      39             15            48
##   2160     48      39             30            39
##   2161     24      39             18            69
##   2162     15      21             18            51
##   2163      6      48              3            66
##   2164     27      33             12            27
##   2165     30      51             21            51
##   2166     18      36             15            54
##   2167     18      39             21            51
##   2168     39      45             18            54
##   2169     42      48             12            60
##   2170     18      39             27            39
##   2171     39      36             15            51
##   2172     33      69             30            57
##   2173     36      54             15            66
##   2174     36      45             18            45
##   2175     27      45             15            69
##   2176     36      42             21            54
##   2177     33      36              6            87
##   2178     30      42             18            60
##   2179     39      69             15            66
##   2180     30      33             30            57
##   2181     30      51             21            60
##   2182     24      54             18            48
##   2183     45      48             24            66
##   2184     15      57             30            81
##   2185     36      39             21            66
##   2186     27      45             12            87
##   2187     24      45             18            54
##   2188     24      33             24            60
##   2189     18      51             24            51
##   2190     54      45             27            81
##   2191     24      42             15            75
##   2192     51      60              6            84
##   2193     24      51             18            54
##   2194     18      33             12            54
##   2195     42      42             18            48
##   2196     33      42             12            54
##   2197     15      69              6            45
##   2198     36      45             21            54
##   2199     27      54             15            33
##   2200     18      57             12            54
##   2201     36      51             21            33
##   2202     45      39             21            66
##   2203     24      57             12            63
##   2204     45      57             18            51
##   2205     27      39             21            75
##   2206     27      39             24            78
##   2207     24      42             36            66
##   2208     33      54             12            51
##   2209     36      57             24            66
##   2210     12      36              9            66
##   2211     36      54             18            42
##   2212     27      57             21            66
##   2213     60      42             15            87
##   2214     60      27             30            63
##   2215     36      48             27            48
##   2216     39      30             24            45
##   2217     39      45             15            75
##   2218     27      36              6            60
##   2219     48      24              9            63
##   2220     48      48             15            72
##   2221     21      48             24            72
##   2222     36      45             18            84
##   2223     24      66              9            72
##   2224     15      48             27            66
##   2225     24      69             30            63
##   2226     45      60             15            84
##   2227     15      30             18            84
##   2228     39      30             21            51
##   2229     24      39             12            39
##   2230     27      27             21            48
##   2231     42      45             21            57
##   2232     39      36             24            69
##   2233     12      39             24            81
##   2234     24      57              9            69
##   2235     36      21             21            63
##   2236     15      54             30            81
##   2237     36      45             33            63
##   2238     39      48             33            51
##   2239     33      54             15            87
##   2240     33      45              3            60
##   2241     39      69             12            51
##   2242     15      57             21            51

# Show the total number of sales recorded for each country
print(no_of_sales_by_country)

## 
##         Canada        Germany United Kingdom  United States 
##           2952           4392           1737           5886

The USA has more sales than the UK, Canada, and Germany.

Who buys more shoes: men or women?

# Cross-tabulate sales: one row per product ID, one column per gender
product_gender <- table(my_data[["ProductID"]], my_data[["Gender"]])

# Overall number of sales per gender
shoes_by_gender <- table(my_data[["Gender"]])
# Show the per-gender totals
print(shoes_by_gender)

## 
## Female   Male 
##   6048   8919

Men generally buy more shoes than women. Most of the time, men also buy more units of a particular shoe than women do.

In which months are sales highest?

# Number of sales in each calendar month (1 = January ... 12 = December)
sales_by_month <- table(my_data$Month)

print(sales_by_month)

## 
##    1    2    3    4    5    6    7    8    9   10   11   12 
## 1064 1061 1083 1197 1249 1381 1413 1393 1377 1447 1154 1148

October and July have the highest number of sales over the entire period.

Do discounts actually work?

# Number of sales made at each discount level
discounts <- table(my_data[["Discount"]])

print(discounts)

## 
##   0%  10%  20%  30%  50% 
## 8295 1638 1605 1716 1713

In this data, products sold without a discount account for more sales than discounted products.

Final analytical points:

By analyzing the above data, we found that the USA has more sales than the UK, Germany, and Canada. Men buy more shoes than women. July and October have the most sales during the period.

Use of Aggregate functions:

# Calculate the mean sale price by region. The bare column name in the formula
# is resolved through `data`, so the result gets a clean "Country" header
# instead of a column literally named "my_data$Country".
mean_prices <- aggregate(SalePrice ~ Country, data = my_data, FUN = mean)

# Sort mean sale prices by region in descending order
mean_prices <- mean_prices[order(mean_prices$SalePrice, decreasing = TRUE), ]

# Print the sorted result
print(mean_prices)

##          Country SalePrice
## 3 United Kingdom  145.5059
## 1         Canada  144.2290
## 4  United States  143.7274
## 2        Germany  143.5747

# Sum the sale prices within every country/year combination
total_sales_by_country <- aggregate(
  SalePrice ~ Country + Year,
  data = my_data,
  FUN = sum
)

print(total_sales_by_country)

##           Country Year SalePrice
## 1          Canada 2014   87686.6
## 2         Germany 2014  102608.8
## 3  United Kingdom 2014   56580.1
## 4   United States 2014  149689.1
## 5          Canada 2015  141638.9
## 6         Germany 2015  218315.4
## 7  United Kingdom 2015   64180.8
## 8   United States 2015  280079.1
## 9          Canada 2016  196438.4
## 10        Germany 2016  309655.7
## 11 United Kingdom 2016  131982.8
## 12  United States 2016  416211.4

Visual Summary

I created a bar chart of the total number of sales for each country.

library(ggplot2)

# Bar chart: one bar per country, bar height = number of sales rows
ggplot(my_data, aes(x = Country)) +
  geom_bar() +
  labs(
    title = "Sales Count by Country",
    x = "Country",
    y = "Count of Sales"
  )

I created a bar chart of the sales count for each gender.

library(ggplot2)

# Bar chart of sales per gender. The constant colour is set on the geom: inside
# aes() the string "orange" is treated as a data value, which produces a
# default-palette fill and a stray legend rather than orange bars.
ggplot(data = my_data, mapping = aes(x = Gender)) +
  geom_bar(fill = "orange") +
  labs(
    title = "Sales Count by male and female ",
    x = "Gender",
    y = "Count of Sales"
  )

Point plot of country against year of sale

library(ggplot2)
# Every sale in the same country and year lands on the same coordinate, so
# plain points would overplot into a single dot per pair. geom_count() sizes
# each dot by the number of sales it represents, making the volumes visible.
ggplot(data = my_data, mapping = aes(x = Country, y = Year)) +
  geom_count() +
  scale_y_continuous(
    breaks = seq(min(my_data$Year), max(my_data$Year), by = 1)
  ) +
  labs(
    title = "Sales Count by Country and Year (2014-2016)",
    x = "Country",
    y = "Year",
    size = "Count of Sales"
  )

Box plot of the shoe sizes purchased by male and female customers.

# Compare the distribution of US shoe sizes bought by each gender. The y axis
# uses the numeric `Size(US)` column (a box plot needs a continuous variable),
# the axis labels match the mapped variables, and the title is passed by name.
ggplot(data = my_data, mapping = aes(x = Gender, y = `Size(US)`)) +
  geom_boxplot() +
  labs(
    title = "Box plot of US shoe size by Gender",
    x = "Gender",
    y = "Size in US"
  )

R_Stat_Assignment-1

Surya

2023-09-04