library(data.table)
library(ISLR)
# Copy the orange-juice data set shipped with ISLR into the workspace
# and preview its first rows.
OJ <- ISLR::OJ
head(OJ, n = 6L)
## Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1 CH 237 1 1.75 1.99 0.00 0.0 0
## 2 CH 239 1 1.75 1.99 0.00 0.3 0
## 3 CH 245 1 1.86 2.09 0.17 0.0 0
## 4 MM 227 1 1.69 1.69 0.00 0.0 0
## 5 CH 228 7 1.69 1.69 0.00 0.0 0
## 6 CH 230 7 1.69 1.99 0.00 0.0 0
## SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1 0 0.500000 1.99 1.75 0.24 No 0.000000
## 2 1 0.600000 1.69 1.75 -0.06 No 0.150754
## 3 0 0.680000 2.09 1.69 0.40 No 0.000000
## 4 0 0.400000 1.69 1.69 0.00 No 0.000000
## 5 0 0.956535 1.69 1.69 0.00 Yes 0.000000
## 6 1 0.965228 1.99 1.69 0.30 Yes 0.000000
## PctDiscCH ListPriceDiff STORE
## 1 0.000000 0.24 1
## 2 0.000000 0.24 1
## 3 0.091398 0.23 1
## 4 0.000000 0.00 1
## 5 0.000000 0.00 0
## 6 0.000000 0.30 0
# Compact overview of column types and leading values.
str(object = OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
pacman::p_load(pacman,dplyr,GGally,ggplot2,ggvis,rio,
shiny,tidyr,stringr,httr,lubridate,
plotly,rmarkdown,ggthemes,psych,tidyverse)
#Our data frame consists of 1070 observations of 18 variables.
#Each row records whether the customer purchased Citrus Hill (CH for short)
# or Minute Maid (MM) orange juice; the variables describe a number of characteristics of the customer and the product.
###QUESTION STATEMENTS###
# We are trying to see what influences the sale price of these two juices,
#and also how their prices compare and why certain prices occur in certain stores.
# Open the data set's help page for the variable definitions.
help("OJ")
## starting httpd help server ... done
#1 Scatter plot of price charged for Citrus Hill by week of purchase
# Citrus Hill list price per purchase week; point colour encodes LoyalCH.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceCH, colour = LoyalCH)
) +
  geom_point()

#It is evident that the price charged for Citrus Hill rises gradually every couple of weeks.
#In weeks 230 to 235 Citrus Hill sold for about $1.70 and customer brand loyalty was between 0.50 and 0.75.
#Five weeks later, at week 240, the price of Citrus Hill had risen by about $0.05, from $1.70 to $1.75, where customer brand loyalty was around 0.75,
#but where average customer brand loyalty was between 0.25 and 0.5 the price rose by noticeably more than $0.05 over the same span.
#In that spike the price of Citrus Hill in week 240 was around $1.80, which is $0.10 more than it sold for in week 230.
#The price seems to follow a pattern and rises as the weeks pass:
#in week 230 Citrus Hill was $1.70 and by week 280 the price is around $2.10, a $0.40 increase over a span of 50 weeks.
#2 Scatter plot of price charged for Minute Maid by week of purchase
# Minute Maid list price per purchase week; point colour encodes LoyalCH.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceMM, colour = LoyalCH)
) +
  geom_point()

#Prices for Minute Maid in week 230 vary, which we attribute mainly to customer brand loyalty levels:
#we observe lower prices where brand loyalty is between 0.25 and 0.50 and, conversely, higher prices
# where brand loyalty is high, in the range 0.5 to 0.75.
# As the week-230 prices for Minute Maid show, there were about 3 price points for the juice in question,
# listed by loyalty strength with their respective prices:
# $1.7, $1.8 and $2.0 respectively.
#Similarly, prices increase with each passing week, and prices within the same week vary with
# differences in customer brand loyalty rating.
#3 Scatter plot of price charged for Citrus Hill by week per Store
# Citrus Hill list price per purchase week, coloured by store.
# STORE is stored as a number but is a store code, so it is converted to a
# factor: this gives one distinct colour per store instead of a continuous
# gradient on which individual stores cannot be told apart.
ggplot(
  OJ,
  aes(x = WeekofPurchase, y = PriceCH, colour = factor(STORE))
) +
  geom_point() +
  # Keep the legend title as the plain column name rather than "factor(STORE)".
  labs(colour = "STORE")

#From this plot it becomes clear that the price charged for Citrus Hill varied per store.
#We can read off the graph that stores 0, 1 and 2 charged around $1.70 during week 230,
# whereas stores 3 and 4 charged noticeably more than the other three, at around $1.80,
# i.e. $0.10 higher than the others, mainly due to the customer brand loyalty levels of the two stores, since we saw that the higher-priced observations had a higher loyalty level.
#4 Scatter plot of price charged for Minute Maid by week per Store
# Minute Maid list price per purchase week, coloured by store.
# STORE is a numeric store code, so it is mapped as a factor to obtain one
# discrete colour per store rather than a continuous gradient.
ggplot(
  OJ,
  aes(x = WeekofPurchase, y = PriceMM, colour = factor(STORE))
) +
  geom_point() +
  labs(
    x = "Week of Purchase",
    y = "Price Charged Minute Maid",
    title = "Scatter plot of price charged for Minute Maid by week per Store",
    # Keep the legend title as the plain column name.
    colour = "STORE"
  )

# As expected, there are price variations between stores 0, 1, 2 and stores 3 and 4.
#Minute Maid was also priced noticeably higher at stores 3 and 4 than at stores 0, 1 and 2;
#we suspect this is mainly due to customer brand loyalty differences between the two groups of stores.
#In week 230 Minute Maid was priced at around $1.7 at stores 0, 1 and 2 but at around
# $1.8 at stores 3 and 4.
#This leads us to suggest that loyalty levels affect the level of price sensitivity at different stores.
#5 Scatter plot of price charged for Citrus Hill per week by its Sale price
# Citrus Hill list price per purchase week; colour encodes the CH sale price.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceCH, colour = SalePriceCH)
) +
  geom_point()

#This shows us that on average in week 230 the sale price was around $1.8-$2.0,
#and as the weeks continue that remained the average until around week 265, where we see the price drop to around $1.4-$1.6.
# So, in conclusion, the sale price of Citrus Hill (the brand plotted here) was set at around $2 from week 230 to 280, with slight variations mainly due to
# customer brand loyalty levels.
# Stack the two price histograms in a single figure. par() returns the
# previous settings, which are kept so the layout can be restored exactly
# afterwards instead of being reset to a hard-coded value.
old_par <- par(mfrow = c(2, 1))
# Histogram of prices charged for Citrus Hill
hist(
  OJ$PriceCH,
  col = "red",
  main = "Hist of Price Charged for Citrus Hill",
  xlab = "Prices:in Dollars"
)
# Histogram of prices charged for Minute Maid
hist(
  OJ$PriceMM,
  col = "blue",
  main = "Hist of Price Charged for Minute Maid",
  xlab = "Prices:in Dollars"
)

#From the two graphs we can see that Minute Maid is actually priced a bit higher than Citrus Hill,
#with Citrus Hill's minimum charged price at $1.69 and its maximum at $2.09, while the Minute Maid charged prices range from
# $1.69 to $2.29, 20 cents higher than Citrus Hill's maximum charged price.
# Restore the plotting layout that was active before the histograms.
par(old_par)
# 6 boxplots
# Horizontal box plots of the list prices of each brand.
# The former `fill` argument is dropped: it is a ggplot2 aesthetic, not a
# documented boxplot() argument, and the box colour is already set by `col`.
# `TRUE` replaces the reassignable shorthand `T`.
boxplot(
  OJ$PriceCH,
  col = "green",
  horizontal = TRUE,
  xlab = "Prices :in Dollars",
  main = "Boxplot of Prices Charged for Citrus Hill"
)

boxplot(
  OJ$PriceMM,
  col = "yellow",
  horizontal = TRUE,
  xlab = "Prices :in Dollars",
  main = "Boxplot of Prices Charged for Minute Maid"
)

# The box plots suggest that the median price charged for Minute Maid (the centre line of the box) is around $2.10,
#with a lower price limit of $1.8 and an outlier at around $1.69,
# while the median charged price for Citrus Hill is around $1.85,
#with a lower price limit of $1.7.
#7 Scatter plot of price charged for Citrus Hill by week of purchase with discount
# Citrus Hill list price per purchase week; colour encodes the CH discount.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceCH, colour = DiscCH)
) +
  geom_point()

#Basically, we can say there is little if any discount offered for Citrus Hill:
#the typical discount on Citrus Hill is 0.0, with a few instances where the discount is around 0.5 during
#weeks 263-265 and 0.3 during weeks 251-256.
#The mean discount for Citrus Hill over the entire time frame is 0.05185981.
#8 Scatter plot of price charged for Minute Maid by week of purchase with discount
# Minute Maid list price per purchase week; colour encodes the MM discount.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceMM, colour = DiscMM)
) +
  geom_point()

#For Minute Maid there was some discount offered and the average discount offered for Minute Maid is 0.1233645
#9 Scatter plot of price charged for Citrus Hill by week of purchase with store 7 as selector
# Citrus Hill list price per purchase week; colour flags the Store7 factor.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceCH, colour = Store7)
) +
  geom_point()

# Count the observations in each level ("No"/"Yes") of Store7.
summary(OJ[["Store7"]])
## No Yes
## 714 356
# In the 50-week period most purchases were not made at Store 7.
#There were 356 sales at Store 7 out of 1070 sales in total, so about 33% of the sales happened at Store 7
# and about 66.7% happened elsewhere. Note that these counts cover purchases of both brands, not only Citrus Hill.
# We can say that a recorded purchase has about a 33% chance of having been made at Store 7 in the 50-week period,
#with exceptions during weeks 250-265, where sales were concentrated at Store 7.
#10 Scatter plot of price charged for Minute Maid by week of purchase with store 7 as selector
# Minute Maid list price per purchase week; colour flags the Store7 factor.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = PriceMM, colour = Store7)
) +
  geom_point()

# similar to that of Citrus Hill, Minute Maid was on average not sold at Store 7 with exceptions during weeks
#260-275 where most sold at Store 7
#but compared to Citrus Hill, store 7 sold quite a bit of Minute Maid during this 50 week period
# 11 MODELING#
# model 1 for price of Citrus Hill with all the variables except list price diff
# Linear model for the Citrus Hill sale price on the CH-related predictors
# and the store identifiers.
# NOTE(review): the printed summary shows coefficients of 1 for PriceCH and
# -1 for DiscCH with all other terms near zero, which suggests SalePriceCH is
# defined as PriceCH - DiscCH -- confirm before interpreting the fit.
# NOTE(review): STORE is reported as "not defined because of singularities",
# i.e. it is linearly dependent on the other predictors.
MD1 <- lm(
  SalePriceCH ~ WeekofPurchase + StoreID + PriceCH + DiscCH + SpecialCH +
    LoyalCH + PriceDiff + Store7 + PctDiscCH + STORE,
  data = OJ
)
summary(MD1)
##
## Call:
## lm(formula = SalePriceCH ~ WeekofPurchase + StoreID + PriceCH +
## DiscCH + SpecialCH + LoyalCH + PriceDiff + Store7 + PctDiscCH +
## STORE, data = OJ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.111e-13 -5.900e-17 8.200e-17 2.410e-16 4.872e-15
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.958e-15 2.337e-15 -1.266e+00 0.206
## WeekofPurchase 4.003e-18 1.235e-17 3.240e-01 0.746
## StoreID 2.704e-16 1.667e-16 1.622e+00 0.105
## PriceCH 1.000e+00 2.104e-15 4.752e+14 <2e-16 ***
## DiscCH -1.000e+00 2.332e-14 -4.288e+13 <2e-16 ***
## SpecialCH -5.263e-17 4.019e-16 -1.310e-01 0.896
## LoyalCH -1.002e-16 3.600e-16 -2.780e-01 0.781
## PriceDiff -2.412e-16 4.382e-16 -5.500e-01 0.582
## Store7Yes -1.047e-15 8.419e-16 -1.243e+00 0.214
## PctDiscCH 1.052e-14 4.439e-14 2.370e-01 0.813
## STORE NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.426e-15 on 1060 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.08e+29 on 9 and 1060 DF, p-value: < 2.2e-16
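#Adjusted R-squared: 1. The model reproduces the data exactly, but this is not evidence of predictive power:
# the coefficients are 1 for PriceCH and -1 for DiscCH (all others are ~0), i.e. the model has simply recovered
# the identity SalePriceCH = PriceCH - DiscCH. It is very rare to get an adjusted R^2 of exactly 1 unless the
# response is a deterministic function of the predictors, as it is here.
# STORE has an NA coefficient because it is a linear combination of the other predictors.
# Since the fit for the Citrus Hill sale price is exact, no ANOVA tests are necessary.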
# Model 2, for the sale price of Minute Maid, with all the variables except list price diff and
# loyalty, since the data set has no customer loyalty variable for Minute Maid
# Linear model for the Minute Maid sale price on the MM-related predictors
# and the store identifiers.
# NOTE(review): the printed summary shows coefficients of 1 for PriceMM and
# -1 for DiscMM with all other terms near zero, which suggests SalePriceMM is
# defined as PriceMM - DiscMM -- confirm before interpreting the fit.
MD2 <- lm(
  SalePriceMM ~ WeekofPurchase + StoreID + PriceMM + DiscMM + SpecialMM +
    PriceDiff + Store7 + PctDiscMM,
  data = OJ
)
summary(MD2)
##
## Call:
## lm(formula = SalePriceMM ~ WeekofPurchase + StoreID + PriceMM +
## DiscMM + SpecialMM + PriceDiff + Store7 + PctDiscMM, data = OJ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.047e-13 -8.500e-17 7.600e-17 4.510e-16 5.539e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.063e-14 4.253e-15 -2.499e+00 0.0126 *
## WeekofPurchase 9.761e-20 1.638e-17 6.000e-03 0.9952
## StoreID 4.831e-16 2.510e-16 1.925e+00 0.0545 .
## PriceMM 1.000e+00 2.149e-15 4.653e+14 <2e-16 ***
## DiscMM -1.000e+00 2.105e-14 -4.750e+13 <2e-16 ***
## SpecialMM 4.222e-16 6.127e-16 6.890e-01 0.4909
## PriceDiff 1.376e-16 1.692e-15 8.100e-02 0.9352
## Store7Yes -1.955e-15 1.288e-15 -1.517e+00 0.1295
## PctDiscMM 3.751e-14 4.421e-14 8.480e-01 0.3964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.307e-15 on 1061 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.145e+29 on 8 and 1061 DF, p-value: < 2.2e-16
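# These two models fit the sale prices of both Minute Maid and Citrus Hill exactly
# (adjusted R-squared: 1), because in each case the sale price equals the list price minus the discount,
# and both of those variables are among the predictors.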
#12 SCATTER PLOT OF FITTED VALUES MINUTE MAID
# Fitted Minute Maid sale prices from MD2 against purchase week
# (solid yellow points).
plot(
  x = OJ$WeekofPurchase,
  y = fitted(MD2),
  pch = 19,
  col = "yellow",
  xlab = "Week of Purchase",
  ylab = "Md2 fitted Prices : in Dollars",
  main = "Fitted Sales price for Minute Maid"
)

# SCATTER PLOT OF FITTED VALUES CITRUS HILL
# Fitted Citrus Hill sale prices from MD1 against purchase week
# (solid green points).
plot(
  x = OJ$WeekofPurchase,
  y = fitted(MD1),
  pch = 19,
  col = "green",
  xlab = "Week of Purchase",
  ylab = "Md1 fitted Prices : in Dollars",
  main = "Fitted Sales price for Citrus Hill"
)

# COMPARING THE PRICES FITTED AND ACTUAL for Citrus Hill
# MD1 fitted values per purchase week; colour encodes the observed
# Citrus Hill sale price so the two can be compared.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = fitted(MD1), colour = SalePriceCH)
) +
  geom_point() +
  labs(
    title = "Sales price for CH per week vs predicted",
    x = "Week of Purchase",
    y = "Sale price for Citrus Hill"
  )

# As you can see, since the model fits exactly, the predicted values and the actual ones
# align perfectly on the scatter plot, with no deviation in the sale price for Citrus Hill.
# The same can be expected for Minute Maid, since both models fit exactly.
# Now for Minute Maid:
# MD2 fitted values per purchase week; colour encodes the observed
# Minute Maid sale price so the two can be compared.
ggplot(
  data = OJ,
  mapping = aes(x = WeekofPurchase, y = fitted(MD2), colour = SalePriceMM)
) +
  geom_point() +
  labs(
    title = "Sales price for MM per week vs predicted",
    x = "Week of Purchase",
    y = "Sale price for Minute Maid"
  )

# As expected, the points align.