StoreDataAnalysis - Assignment 1

# Declare Libraries
library(psych)
library(gmodels)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Read StoreData.csv from DAM 2018 and store it in dataframe
setwd("~/Desktop/Study Material/DAM/Pre Session 1/2018 DAM/")
store.df <- read.csv("StoreData.csv", header=TRUE)
View(store.df)

# QUESTION 1
# Q1a. Write R code to show the total number of rows and columns present in the data frame store.df
Number_RC <- dim(store.df)
print(paste0("The Number of Rows in the data frame store.df is ", Number_RC[1], " and number of columns is ", Number_RC[2]))

## [1] "The Number of Rows in the data frame store.df is 2080 and number of columns is 10"

# ******************************************

# Q1b. Write R code to list the names of the column present in the data frame store.df.
Column_Names <- colnames(store.df)
print(Column_Names)

##  [1] "storeNum" "Year"     "Week"     "p1sales"  "p2sales"  "p1price" 
##  [7] "p2price"  "p1prom"   "p2prom"   "country"

# QUESTION 2
# Q2a. Write R code to output the data types of the different columns.
str(store.df)

## 'data.frame':    2080 obs. of  10 variables:
##  $ storeNum: int  101 101 101 101 101 101 101 101 101 101 ...
##  $ Year    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ p1sales : int  127 137 156 117 138 115 116 106 116 145 ...
##  $ p2sales : int  106 105 97 106 100 127 90 126 94 91 ...
##  $ p1price : num  2.29 2.49 2.99 2.99 2.49 2.79 2.99 2.99 2.29 2.49 ...
##  $ p2price : num  2.29 2.49 2.99 3.19 2.59 2.49 3.19 2.29 2.29 2.99 ...
##  $ p1prom  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ p2prom  : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...

#********************************

# Q2b. Write R code to convert the data types of the columns {Year, p1prom, p2prom, country} into factor variables.
store.df$Year <- as.factor(store.df$Year)
store.df$p1prom <- as.factor(store.df$p1prom)
store.df$p2prom <- as.factor(store.df$p2prom)
store.df$country <- as.factor(store.df$country)

#***********************************************************

#Q2c. Write R code to verify the conversions done in Q2b.
# Answer 1
str(store.df)

## 'data.frame':    2080 obs. of  10 variables:
##  $ storeNum: int  101 101 101 101 101 101 101 101 101 101 ...
##  $ Year    : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Week    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ p1sales : int  127 137 156 117 138 115 116 106 116 145 ...
##  $ p2sales : int  106 105 97 106 100 127 90 126 94 91 ...
##  $ p1price : num  2.29 2.49 2.99 2.99 2.49 2.79 2.99 2.99 2.29 2.49 ...
##  $ p2price : num  2.29 2.49 2.99 3.19 2.59 2.49 3.19 2.29 2.29 2.99 ...
##  $ p1prom  : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
##  $ p2prom  : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...

# Answer 2
is.factor(store.df$Year)

## [1] TRUE

is.factor(store.df$p1prom)

## [1] TRUE

is.factor(store.df$p2prom)

## [1] TRUE

is.factor(store.df$country)

## [1] TRUE

# QUESTION 3
# Q3a. Write R code to generate the summary statistics of the different variables present in the data frame  store.df
summary(store.df)

##     storeNum     Year          Week          p1sales       p2sales     
##  Min.   :101.0   1:1040   Min.   : 1.00   Min.   : 73   Min.   : 51.0  
##  1st Qu.:105.8   2:1040   1st Qu.:13.75   1st Qu.:113   1st Qu.: 84.0  
##  Median :110.5            Median :26.50   Median :129   Median : 96.0  
##  Mean   :110.5            Mean   :26.50   Mean   :133   Mean   :100.2  
##  3rd Qu.:115.2            3rd Qu.:39.25   3rd Qu.:150   3rd Qu.:113.0  
##  Max.   :120.0            Max.   :52.00   Max.   :263   Max.   :225.0  
##                                                                        
##     p1price         p2price     p1prom   p2prom   country 
##  Min.   :2.190   Min.   :2.29   0:1872   0:1792   AU:104  
##  1st Qu.:2.290   1st Qu.:2.49   1: 208   1: 288   BR:208  
##  Median :2.490   Median :2.59                     CN:208  
##  Mean   :2.544   Mean   :2.70                     DE:520  
##  3rd Qu.:2.790   3rd Qu.:2.99                     GB:312  
##  Max.   :2.990   Max.   :3.19                     JP:416  
##                                                   US:312

#*******************************

# Bonus: Q3b. Write R code to generate the following table using the describe() function from the psych package in R.
psych :: describe(store.df)

##          vars    n   mean    sd median trimmed   mad    min    max range
## storeNum    1 2080 110.50  5.77 110.50  110.50  7.41 101.00 120.00  19.0
## Year*       2 2080   1.50  0.50   1.50    1.50  0.74   1.00   2.00   1.0
## Week        3 2080  26.50 15.01  26.50   26.50 19.27   1.00  52.00  51.0
## p1sales     4 2080 133.05 28.37 129.00  131.08 26.69  73.00 263.00 190.0
## p2sales     5 2080 100.16 24.42  96.00   98.05 22.24  51.00 225.00 174.0
## p1price     6 2080   2.54  0.29   2.49    2.53  0.44   2.19   2.99   0.8
## p2price     7 2080   2.70  0.33   2.59    2.69  0.44   2.29   3.19   0.9
## p1prom*     8 2080   1.10  0.30   1.00    1.00  0.00   1.00   2.00   1.0
## p2prom*     9 2080   1.14  0.35   1.00    1.05  0.00   1.00   2.00   1.0
## country*   10 2080   4.55  1.72   4.50    4.62  2.22   1.00   7.00   6.0
##           skew kurtosis   se
## storeNum  0.00    -1.21 0.13
## Year*     0.00    -2.00 0.01
## Week      0.00    -1.20 0.33
## p1sales   0.74     0.66 0.62
## p2sales   0.99     1.51 0.54
## p1price   0.28    -1.44 0.01
## p2price   0.32    -1.40 0.01
## p1prom*   2.66     5.10 0.01
## p2prom*   2.09     2.38 0.01
## country* -0.29    -0.81 0.04

# QUESTION 4
# Q4a. Break-up store.df by country. Write R code to generate the following break-ups by country.
mytable <- with(store.df, table(country)) 
print(mytable)

## country
##  AU  BR  CN  DE  GB  JP  US 
## 104 208 208 520 312 416 312

#***************************

# Q4b. Write R code to express Q4a numbers in terms of percentages.
prop.table(mytable)*100

## country
## AU BR CN DE GB JP US 
##  5 10 10 25 15 20 15

# Q5a. The dataset has 2080 rows of weekly data for 20 unique store branches for two different years. For product 1 (Coke) and product 2 (Pepsi), write R code to count the number of weeks,
# 1. When both products were under promotion?
# 2. When product 1 (Coke) was under promotion but product 2 (Pepsi) was not under promotion?
# 3.When product 1 (Coke) was not under promotion but product 2 (Pepsi) was under promotion?
# 4. When neither product was under promotion?
mytable <- xtabs(~ p1prom+p2prom, data=store.df)
print(mytable) # frequencies

##       p2prom
## p1prom    0    1
##      0 1616  256
##      1  176   32

# ****************************************

#Q5b. Express Q5a as in percentages.
prop.table(mytable)*100

##       p2prom
## p1prom         0         1
##      0 77.692308 12.307692
##      1  8.461538  1.538462

# ******************************

# Q5c. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when both were under promotion.
mean(store.df$p1price[ which( store.df$p1prom ==1 & store.df$p2prom ==1 )])

## [1] 2.568125

mean(store.df$p2price[ which( store.df$p1prom ==1 & store.df$p2prom ==1 )])

## [1] 2.67125

# *************************************************************************************

# Q5d. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when neither of them was under promotion.
mean(store.df$p1price[ which( store.df$p1prom ==0 & store.df$p2prom ==0 )])

## [1] 2.543342

mean(store.df$p2price[ which( store.df$p1prom ==0 & store.df$p2prom ==0 )])

## [1] 2.700891

# *************************************************************************************

# Q5e. Write R code to calculate the average price of product 1 (Coke) and product 2 (Pepsi) when only one of them was under promotion.
# Part 1 : When Product 1 was under promotion and product 2 is not
mean(store.df$p1price[ which( store.df$p1prom ==1 & store.df$p2prom ==0 )])

## [1] 2.524659

mean(store.df$p2price[ which( store.df$p1prom ==1 & store.df$p2prom ==0 )])

## [1] 2.703068

# Part 2 : When Product 2 was under promotion and product 1 is not
mean(store.df$p1price[ which( store.df$p1prom ==0 & store.df$p2prom ==1 )])

## [1] 2.561484

mean(store.df$p2price[ which( store.df$p1prom ==0 & store.df$p2prom ==1 )])

## [1] 2.691953

# Q6. Analyze the above numbers. Think like a Senior Manager. Prepare a list of qualitative insights on the “Effect of Promotions on Sales” of Coke and Pepsi, to be shared in class.
mean(store.df$p1sales[ which( store.df$p1prom ==1)])

## [1] 168.8894

mean(store.df$p1sales[ which( store.df$p1prom ==0)])

## [1] 129.0662

mean(store.df$p2sales[ which( store.df$p2prom ==1)])

## [1] 134.2604

mean(store.df$p2sales[ which( store.df$p2prom ==0)])

## [1] 94.67578

# 1) Coke has better market power cuz t is lesser on promotion in comparison to Pepsi 
# 2) Both of them are not on promotion 80% of the times means they sell even when not on promotion 
# 3) Coke sells a lower price and that is how they are trying to gain market share by playing on the price 
# 4) Even both were under promo, Pepsi dropped it's price by 30cents whereas Coke increased by 2cents. Point being that Pepsi tweaks it's prices more that Coke whereas Coke is stable and EDLP types

StoreDataAnalysis - Assignment 1_Group 5

Mudit Gupta

9/20/2018