# Declare Libraries
library(psych)
library(gmodels)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Read StoreData.csv from DAM 2018 and store it in dataframe
setwd("~/Desktop/Study Material/DAM/Pre Session 1/2018 DAM/")
store.df <- read.csv("StoreData.csv", header=TRUE)
View(store.df)
# QUESTION 1
# Q1a. Write R code to show the total number of rows and columns present in the data frame store.df
Number_RC <- dim(store.df)
print(paste0("The Number of Rows in the data frame store.df is ", Number_RC[1], " and number of columns is ", Number_RC[2]))
## [1] "The Number of Rows in the data frame store.df is 2080 and number of columns is 10"
# ******************************************
# Q1b. Write R code to list the names of the column present in the data frame store.df.
Column_Names <- colnames(store.df)
print(Column_Names)
## [1] "storeNum" "Year" "Week" "p1sales" "p2sales" "p1price"
## [7] "p2price" "p1prom" "p2prom" "country"
# QUESTION 2
# Q2a. Write R code to output the data types of the different columns.
str(store.df)
## 'data.frame': 2080 obs. of 10 variables:
## $ storeNum: int 101 101 101 101 101 101 101 101 101 101 ...
## $ Year : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ p1sales : int 127 137 156 117 138 115 116 106 116 145 ...
## $ p2sales : int 106 105 97 106 100 127 90 126 94 91 ...
## $ p1price : num 2.29 2.49 2.99 2.99 2.49 2.79 2.99 2.99 2.29 2.49 ...
## $ p2price : num 2.29 2.49 2.99 3.19 2.59 2.49 3.19 2.29 2.29 2.99 ...
## $ p1prom : int 0 0 1 0 0 0 0 0 0 0 ...
## $ p2prom : int 0 0 0 0 1 0 0 0 0 0 ...
## $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...
#********************************
# Q2b. Write R code to convert the data types of the columns {Year, p1prom, p2prom, country} into factor variables.
store.df$Year <- as.factor(store.df$Year)
store.df$p1prom <- as.factor(store.df$p1prom)
store.df$p2prom <- as.factor(store.df$p2prom)
store.df$country <- as.factor(store.df$country)
#***********************************************************
#Q2c. Write R code to verify the conversions done in Q2b.
# Answer 1
str(store.df)
## 'data.frame': 2080 obs. of 10 variables:
## $ storeNum: int 101 101 101 101 101 101 101 101 101 101 ...
## $ Year : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ p1sales : int 127 137 156 117 138 115 116 106 116 145 ...
## $ p2sales : int 106 105 97 106 100 127 90 126 94 91 ...
## $ p1price : num 2.29 2.49 2.99 2.99 2.49 2.79 2.99 2.99 2.29 2.49 ...
## $ p2price : num 2.29 2.49 2.99 3.19 2.59 2.49 3.19 2.29 2.29 2.99 ...
## $ p1prom : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
## $ p2prom : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...
# Answer 2
is.factor(store.df$Year)
## [1] TRUE
is.factor(store.df$p1prom)
## [1] TRUE
is.factor(store.df$p2prom)
## [1] TRUE
is.factor(store.df$country)
## [1] TRUE
# QUESTION 3
# Q3a. Write R code to generate the summary statistics of the different variables present in the data frame store.df
summary(store.df)
## storeNum Year Week p1sales p2sales
## Min. :101.0 1:1040 Min. : 1.00 Min. : 73 Min. : 51.0
## 1st Qu.:105.8 2:1040 1st Qu.:13.75 1st Qu.:113 1st Qu.: 84.0
## Median :110.5 Median :26.50 Median :129 Median : 96.0
## Mean :110.5 Mean :26.50 Mean :133 Mean :100.2
## 3rd Qu.:115.2 3rd Qu.:39.25 3rd Qu.:150 3rd Qu.:113.0
## Max. :120.0 Max. :52.00 Max. :263 Max. :225.0
##
## p1price p2price p1prom p2prom country
## Min. :2.190 Min. :2.29 0:1872 0:1792 AU:104
## 1st Qu.:2.290 1st Qu.:2.49 1: 208 1: 288 BR:208
## Median :2.490 Median :2.59 CN:208
## Mean :2.544 Mean :2.70 DE:520
## 3rd Qu.:2.790 3rd Qu.:2.99 GB:312
## Max. :2.990 Max. :3.19 JP:416
## US:312
#*******************************
# Bonus: Q3b. Write R code to generate the following table using the describe() function from the psych package in R.
psych :: describe(store.df)
## vars n mean sd median trimmed mad min max range
## storeNum 1 2080 110.50 5.77 110.50 110.50 7.41 101.00 120.00 19.0
## Year* 2 2080 1.50 0.50 1.50 1.50 0.74 1.00 2.00 1.0
## Week 3 2080 26.50 15.01 26.50 26.50 19.27 1.00 52.00 51.0
## p1sales 4 2080 133.05 28.37 129.00 131.08 26.69 73.00 263.00 190.0
## p2sales 5 2080 100.16 24.42 96.00 98.05 22.24 51.00 225.00 174.0
## p1price 6 2080 2.54 0.29 2.49 2.53 0.44 2.19 2.99 0.8
## p2price 7 2080 2.70 0.33 2.59 2.69 0.44 2.29 3.19 0.9
## p1prom* 8 2080 1.10 0.30 1.00 1.00 0.00 1.00 2.00 1.0
## p2prom* 9 2080 1.14 0.35 1.00 1.05 0.00 1.00 2.00 1.0
## country* 10 2080 4.55 1.72 4.50 4.62 2.22 1.00 7.00 6.0
## skew kurtosis se
## storeNum 0.00 -1.21 0.13
## Year* 0.00 -2.00 0.01
## Week 0.00 -1.20 0.33
## p1sales 0.74 0.66 0.62
## p2sales 0.99 1.51 0.54
## p1price 0.28 -1.44 0.01
## p2price 0.32 -1.40 0.01
## p1prom* 2.66 5.10 0.01
## p2prom* 2.09 2.38 0.01
## country* -0.29 -0.81 0.04
# QUESTION 4
# Q4a. Break-up store.df by country. Write R code to generate the following break-ups by country.
mytable <- with(store.df, table(country))
print(mytable)
## country
## AU BR CN DE GB JP US
## 104 208 208 520 312 416 312
#***************************
# Q4b. Write R code to express Q4a numbers in terms of percentages.
prop.table(mytable)*100
## country
## AU BR CN DE GB JP US
## 5 10 10 25 15 20 15
# Q5a. The dataset has 2080 rows of weekly data for 20 unique store branches for two different years. For product 1 (Coke) and product 2 (Pepsi), write R code to count the number of weeks,
# 1. When both products were under promotion?
# 2. When product 1 (Coke) was under promotion but product 2 (Pepsi) was not under promotion?
# 3.When product 1 (Coke) was not under promotion but product 2 (Pepsi) was under promotion?
# 4. When neither product was under promotion?
mytable <- xtabs(~ p1prom+p2prom, data=store.df)
print(mytable) # frequencies
## p2prom
## p1prom 0 1
## 0 1616 256
## 1 176 32
# ****************************************
#Q5b. Express Q5a as in percentages.
prop.table(mytable)*100
## p2prom
## p1prom 0 1
## 0 77.692308 12.307692
## 1 8.461538 1.538462
# ******************************
# Q5c. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when both were under promotion.
mean(store.df$p1price[ which( store.df$p1prom ==1 & store.df$p2prom ==1 )])
## [1] 2.568125
mean(store.df$p2price[ which( store.df$p1prom ==1 & store.df$p2prom ==1 )])
## [1] 2.67125
# *************************************************************************************
# Q5d. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when neither of them was under promotion.
mean(store.df$p1price[ which( store.df$p1prom ==0 & store.df$p2prom ==0 )])
## [1] 2.543342
mean(store.df$p2price[ which( store.df$p1prom ==0 & store.df$p2prom ==0 )])
## [1] 2.700891
# *************************************************************************************
# Q5e. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when only one of them was under promotion.
# Part 1 : When Product 1 was under promotion and product 2 is not
mean(store.df$p1price[ which( store.df$p1prom ==1 & store.df$p2prom ==0 )])
## [1] 2.524659
mean(store.df$p2price[ which( store.df$p1prom ==1 & store.df$p2prom ==0 )])
## [1] 2.703068
# Part 2 : When Product 2 was under promotion and product 1 is not
mean(store.df$p1price[ which( store.df$p1prom ==0 & store.df$p2prom ==1 )])
## [1] 2.561484
mean(store.df$p2price[ which( store.df$p1prom ==0 & store.df$p2prom ==1 )])
## [1] 2.691953
# Q6. Analyze the above numbers. Think like a Senior Manager. Prepare a list of qualitative insights on the “Effect of Promotions on Sales” of Coke and Pepsi, to be shared in class.
mean(store.df$p1sales[ which( store.df$p1prom ==1)])
## [1] 168.8894
mean(store.df$p1sales[ which( store.df$p1prom ==0)])
## [1] 129.0662
mean(store.df$p2sales[ which( store.df$p2prom ==1)])
## [1] 134.2604
mean(store.df$p2sales[ which( store.df$p2prom ==0)])
## [1] 94.67578
# 1) Coke has better market power cuz t is lesser on promotion in comparison to Pepsi
# 2) Both of them are not on promotion 80% of the times means they sell even when not on promotion
# 3) Coke sells a lower price and that is how they are trying to gain market share by playing on the price
# 4) Even both were under promo, Pepsi dropped it's price by 30cents whereas Coke increased by 2cents. Point being that Pepsi tweaks it's prices more that Coke whereas Coke is stable and EDLP types