library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
    library(dplyr)

#1. Import the store_3HE.csv Download store_3HE.csv as dataframe df. And learn about the data from the Data_description_store_3HE.xlsx Download Data_description_store_3HE.xlsx

df<-read.csv("C:/Users/kiram/Dropbox/My PC (DESKTOP-1BIMTFK)/Downloads/store_3HE.csv")

#2. Familiarize yourself with the features of the data by using the str(), summary(), head(), tail(), slice_sample(), unique(), and n_distinct() functions. Use your best judgment to decide whether to use each function on the full dataframe, or on a single column of data. Give a brief summary of what you learn (just two or three sentences). Note - A few of these functions are from the package “dplyr”. Thus, load “dplyr” or the meta-package “tidyverse” into your RStudio notebook. The package “tidyverse” is used in each R notebook we will work with, so, you will see it often.

str(df)
## 'data.frame':    771 obs. of  20 variables:
##  $ store                  : int  105 117 122 123 182 183 186 194 227 233 ...
##  $ city                   : chr  "BROCKVILLE" "BURLINGTON" "BURLINGTON" "BURLINGTON" ...
##  $ region                 : chr  "ONTARIO" "ONTARIO" "ONTARIO" "ONTARIO" ...
##  $ province               : chr  "ON" "ON" "ON" "ON" ...
##  $ size                   : int  496 875 691 763 784 NA 966 710 973 967 ...
##  $ revenue                : chr  "984.7" "2629.32" "2786.73" "2834.49" ...
##  $ units                  : num  470 1131 1229 1258 1950 ...
##  $ cost                   : num  591 1622 1709 1720 2486 ...
##  $ gross_profit           : num  394 1008 1077 1115 1633 ...
##  $ promo_units            : num  210 401 450 477 665 ...
##  $ energy_units           : num  72.1 192.8 240.5 194.2 199.7 ...
##  $ regularBars_units      : num  38.6 75 93.7 99.4 175.5 ...
##  $ gum_units              : num  29.3 55.9 64.3 73.2 145.8 ...
##  $ bagpegCandy_units      : num  13.4 42.1 29.9 54.1 68.1 ...
##  $ isotonics_units        : num  20.7 87.6 105.7 118.2 109.3 ...
##  $ singleServePotato_units: num  18.6 36.4 48 39.7 85.7 ...
##  $ takeHomePotato_units   : num  7.71 33.46 19.9 22.1 42.33 ...
##  $ kingBars_units         : num  17.9 47.4 45.2 45.3 79.8 ...
##  $ flatWater_units        : num  56.5 98.4 130.3 132.8 204.1 ...
##  $ psd591Ml_units         : num  39.7 38.7 47.3 39.9 50.3 ...
summary(df)
##      store           city              region            province        
##  Min.   :  105   Length:771         Length:771         Length:771        
##  1st Qu.: 6226   Class :character   Class :character   Class :character  
##  Median :34438   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :34226                                                           
##  3rd Qu.:59663                                                           
##  Max.   :98788                                                           
##                                                                          
##       size          revenue              units              cost         
##  Min.   : 276.0   Length:771         Min.   :  42.65   Min.   :   41.12  
##  1st Qu.: 671.8   Class :character   1st Qu.: 911.85   1st Qu.: 1229.12  
##  Median : 792.5   Mode  :character   Median :1351.25   Median : 1843.41  
##  Mean   : 755.5                      Mean   :1559.78   Mean   : 2180.51  
##  3rd Qu.: 893.0                      3rd Qu.:1941.60   3rd Qu.: 2689.76  
##  Max.   :1163.0                      Max.   :9715.83   Max.   :14228.89  
##  NA's   :15                                                              
##   gross_profit       promo_units      energy_units    regularBars_units
##  Min.   :   36.36   Min.   :  17.5   Min.   :   4.1   Min.   :  5.10   
##  1st Qu.:  771.74   1st Qu.: 344.4   1st Qu.: 142.7   1st Qu.: 75.67   
##  Median : 1150.93   Median : 499.3   Median : 215.6   Median :110.65   
##  Mean   : 1388.03   Mean   : 567.0   Mean   : 253.0   Mean   :123.95   
##  3rd Qu.: 1702.92   3rd Qu.: 730.5   3rd Qu.: 317.1   3rd Qu.:154.84   
##  Max.   :10457.82   Max.   :3116.5   Max.   :1576.3   Max.   :681.56   
##                                                                        
##    gum_units      bagpegCandy_units isotonics_units  singleServePotato_units
##  Min.   :  1.88   Min.   :  0.00    Min.   :  3.98   Min.   :  1.75         
##  1st Qu.: 53.49   1st Qu.: 24.43    1st Qu.: 57.79   1st Qu.: 29.51         
##  Median : 77.88   Median : 41.65    Median : 84.48   Median : 44.92         
##  Mean   : 85.53   Mean   : 50.33    Mean   : 98.76   Mean   : 56.34         
##  3rd Qu.:110.11   3rd Qu.: 63.72    3rd Qu.:124.16   3rd Qu.: 67.84         
##  Max.   :318.69   Max.   :381.96    Max.   :562.46   Max.   :439.06         
##                                                                             
##  takeHomePotato_units kingBars_units   flatWater_units   psd591Ml_units  
##  Min.   :  0.00       Min.   :  0.00   Min.   :  11.96   Min.   :  0.00  
##  1st Qu.: 15.69       1st Qu.: 33.89   1st Qu.:  82.33   1st Qu.: 31.28  
##  Median : 28.85       Median : 49.87   Median : 128.98   Median : 48.25  
##  Mean   : 38.05       Mean   : 58.68   Mean   : 158.03   Mean   : 60.57  
##  3rd Qu.: 49.31       3rd Qu.: 73.33   3rd Qu.: 190.32   3rd Qu.: 74.95  
##  Max.   :360.19       Max.   :471.19   Max.   :1424.27   Max.   :390.38  
## 
head(df, n=10)
tail(df, n=10)
slice_sample(df, n=10)
unique(df)
n_distinct(df)
## [1] 771

#Observations: From this data, we learn that Energy Units has the highest average units at 317.1 units. Average gross profit is 1388.03. By looking at the n_distinct we also learn that there are 771 distinct variables in this dataset.

#3. Convert the values in the revenue column to a numeric data type.

df$revenue<-as.numeric(df$revenue)
## Warning: NAs introduced by coercion

#4. Delete rows that contain missing values in any of the columns.

str(df)
## 'data.frame':    771 obs. of  20 variables:
##  $ store                  : int  105 117 122 123 182 183 186 194 227 233 ...
##  $ city                   : chr  "BROCKVILLE" "BURLINGTON" "BURLINGTON" "BURLINGTON" ...
##  $ region                 : chr  "ONTARIO" "ONTARIO" "ONTARIO" "ONTARIO" ...
##  $ province               : chr  "ON" "ON" "ON" "ON" ...
##  $ size                   : int  496 875 691 763 784 NA 966 710 973 967 ...
##  $ revenue                : num  985 2629 2787 2834 4118 ...
##  $ units                  : num  470 1131 1229 1258 1950 ...
##  $ cost                   : num  591 1622 1709 1720 2486 ...
##  $ gross_profit           : num  394 1008 1077 1115 1633 ...
##  $ promo_units            : num  210 401 450 477 665 ...
##  $ energy_units           : num  72.1 192.8 240.5 194.2 199.7 ...
##  $ regularBars_units      : num  38.6 75 93.7 99.4 175.5 ...
##  $ gum_units              : num  29.3 55.9 64.3 73.2 145.8 ...
##  $ bagpegCandy_units      : num  13.4 42.1 29.9 54.1 68.1 ...
##  $ isotonics_units        : num  20.7 87.6 105.7 118.2 109.3 ...
##  $ singleServePotato_units: num  18.6 36.4 48 39.7 85.7 ...
##  $ takeHomePotato_units   : num  7.71 33.46 19.9 22.1 42.33 ...
##  $ kingBars_units         : num  17.9 47.4 45.2 45.3 79.8 ...
##  $ flatWater_units        : num  56.5 98.4 130.3 132.8 204.1 ...
##  $ psd591Ml_units         : num  39.7 38.7 47.3 39.9 50.3 ...
sum(is.na(df))
## [1] 16
df<-na.omit(df)
sum(is.na(df))
## [1] 0

#5. We are interested in understanding what factors influence the revenue for an individual store. From dataframe df, create two dataframes df_low and df_high such that df_low contains data on all the stores with revenue less than the average revenue for the complete data and df_high contains data on all the stores with revenue equal or greater than the average revenue for the complete data. Calculate the average size of the stores in the two dataframes (df_lows and df_high) and comment on the relationship between revenue and size of the stores by discussing the two means (just one or two sentences).

df$revenue<-as.numeric(df$revenue)
mean(df$revenue)
## [1] 3628.561
df_low<-df %>% filter(revenue<3268.51)
df_high<-df %>% filter(revenue>=3268.51)
mean(df_low$revenue)
## [1] 2062.667
mean(df_high$revenue)
## [1] 5519.539

#The df_low mean and the dataset of df_low suggests that these units are mostly smaller items that may be less popular like gum. The df_high mean and dataset suggests that these items are most likely the flatwater purchase and other or isotonics that aren’t as popular.

#6 Report and comment on the correlation between revenue and size. First, calculate the correlation between revenue and size in the main dataframe. Next, using the ggplot() function, plot the relationship between the revenue and size using an appropriate chart. Finally, discuss in one or two sentences.

ggplot(df, aes(x=revenue, y=size))+ geom_point() +labs(Title='Scatter Plot of Revenue $ Size')

#From this diagram we see that the scatter plot’s revenue grows as the size of purchase increases. Most are heavily correlated between 250 and 1000

#7. Create boxplots of revenue for each region and comment on the distribution of revenue within each region (in one or two sentences).

ggplot(df, aes(x=factor(region), y=revenue))+ geom_boxplot(aes(fill=region)) + labs(Title='Box Plot of Revenue by Region')

#Boxplot indicates that the west region has the largeset amount of revenue. The least amount of revenue comes from the Ontario Region.

#8. Calculate the correlation between the revenue and promo_units and comment on whether the value aligns with your expectation. Based on this value of correlation, can you conclude that selling units on promotion helps in increasing revenue? Please explain your reason for the conclusion in a few sentences.

df<-na.omit(df)
sum(is.na(df))
## [1] 0
cor(df$revenue, df$promo_units)
## [1] 0.9659697

#From this we learn that the promo_units and revenue are highly correlated. This is because of how close to 1 the correlation is.

#9. Use group_by(), ’summarise(), andggplot()` to create a bar chart that shows total gross_profit for each province. Comment on a few findings from the chart in one or two sentences.

df_province<-df %>% group_by(province) %>% summarize(sum_gross_profit=sum(gross_profit))
ggplot(df_province, aes(x=province, y=sum_gross_profit)) +geom_col(aes(fill=province)) +labs(title='Sum of Gross Profit by Province')

print(df_province)
## # A tibble: 10 × 2
##    province sum_gross_profit
##    <chr>               <dbl>
##  1 AB                213294.
##  2 BC                165561.
##  3 MB                 32838.
##  4 NB                  5176.
##  5 NS                 16448.
##  6 ON                402070.
##  7 PE                  1680.
##  8 QC                184384.
##  9 SK                 32429.
## 10 UNKNOWN            11659.

#From the chart we see that the highest gross profit comes from Ontario. The lowest gross profit, other than unknown, is the SK province.

#10. (1.5 points) Create a correlation matrix for showing correlations among the average store sales for all product categories. Which two product categories sell together the least and which sell together the most?

cor(df[,11:20])
##                         energy_units regularBars_units gum_units
## energy_units               1.0000000         0.7940865 0.7791820
## regularBars_units          0.7940865         1.0000000 0.7430876
## gum_units                  0.7791820         0.7430876 1.0000000
## bagpegCandy_units          0.7578455         0.9127088 0.6865389
## isotonics_units            0.8837228         0.8913260 0.7204674
## singleServePotato_units    0.7856168         0.9105970 0.6758916
## takeHomePotato_units       0.7504980         0.8823330 0.6710437
## kingBars_units             0.7960175         0.9015765 0.6322488
## flatWater_units            0.7855815         0.8772875 0.7031540
## psd591Ml_units             0.8139704         0.8402457 0.5849746
##                         bagpegCandy_units isotonics_units
## energy_units                    0.7578455       0.8837228
## regularBars_units               0.9127088       0.8913260
## gum_units                       0.6865389       0.7204674
## bagpegCandy_units               1.0000000       0.9046627
## isotonics_units                 0.9046627       1.0000000
## singleServePotato_units         0.8725405       0.8825922
## takeHomePotato_units            0.9237664       0.8697460
## kingBars_units                  0.8739585       0.9024568
## flatWater_units                 0.8637856       0.8905652
## psd591Ml_units                  0.8275502       0.8766410
##                         singleServePotato_units takeHomePotato_units
## energy_units                          0.7856168            0.7504980
## regularBars_units                     0.9105970            0.8823330
## gum_units                             0.6758916            0.6710437
## bagpegCandy_units                     0.8725405            0.9237664
## isotonics_units                       0.8825922            0.8697460
## singleServePotato_units               1.0000000            0.8676895
## takeHomePotato_units                  0.8676895            1.0000000
## kingBars_units                        0.8782217            0.8523930
## flatWater_units                       0.9184280            0.8275743
## psd591Ml_units                        0.8880491            0.8205825
##                         kingBars_units flatWater_units psd591Ml_units
## energy_units                 0.7960175       0.7855815      0.8139704
## regularBars_units            0.9015765       0.8772875      0.8402457
## gum_units                    0.6322488       0.7031540      0.5849746
## bagpegCandy_units            0.8739585       0.8637856      0.8275502
## isotonics_units              0.9024568       0.8905652      0.8766410
## singleServePotato_units      0.8782217       0.9184280      0.8880491
## takeHomePotato_units         0.8523930       0.8275743      0.8205825
## kingBars_units               1.0000000       0.8638809      0.8641337
## flatWater_units              0.8638809       1.0000000      0.8388392
## psd591Ml_units               0.8641337       0.8388392      1.0000000
min(cor(df[,11:20]))
## [1] 0.5849746
max(cor(df[,11:20]))
## [1] 1

#The two categories that sell the least are gum and psd591Ml. The ones that sell the most are regular bars and candy.