library(data.table)
library(ISLR)
# Copy the OJ data set shipped with ISLR into a workspace-level data frame,
# then preview its first rows.
OJ <- ISLR::OJ
head(OJ)
##   Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1       CH            237       1    1.75    1.99   0.00    0.0         0
## 2       CH            239       1    1.75    1.99   0.00    0.3         0
## 3       CH            245       1    1.86    2.09   0.17    0.0         0
## 4       MM            227       1    1.69    1.69   0.00    0.0         0
## 5       CH            228       7    1.69    1.69   0.00    0.0         0
## 6       CH            230       7    1.69    1.99   0.00    0.0         0
##   SpecialMM  LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1         0 0.500000        1.99        1.75      0.24     No  0.000000
## 2         1 0.600000        1.69        1.75     -0.06     No  0.150754
## 3         0 0.680000        2.09        1.69      0.40     No  0.000000
## 4         0 0.400000        1.69        1.69      0.00     No  0.000000
## 5         0 0.956535        1.69        1.69      0.00    Yes  0.000000
## 6         1 0.965228        1.99        1.69      0.30    Yes  0.000000
##   PctDiscCH ListPriceDiff STORE
## 1  0.000000          0.24     1
## 2  0.000000          0.24     1
## 3  0.091398          0.23     1
## 4  0.000000          0.00     1
## 5  0.000000          0.00     0
## 6  0.000000          0.30     0
# Compact overview of every column: its type and first few values.
str(object = OJ)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
pacman::p_load(pacman,dplyr,GGally,ggplot2,ggvis,rio,
               shiny,tidyr,stringr,httr,lubridate,
               plotly,rmarkdown,ggthemes,psych,tidyverse)

# Our data frame consists of 1070 observations of 18 variables.
# Each row records whether the customer purchased Citrus Hill (CH for short)
# or Minute Maid (MM) orange juice. The variables describe a number of
# characteristics of the customer and the product.

### QUESTION STATEMENTS ###
# We are trying to see what influences the sale price of these two juices,
# how their prices compare, and why certain prices occur in certain stores.

# Open the help page that documents the OJ data set.
help("OJ")
## starting httpd help server ... done
# 1 Scatter plot of the price charged for Citrus Hill by week of purchase,
# with points coloured by LoyalCH (customer brand loyalty).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceCH, colour = LoyalCH)) +
  geom_point()

# It is evident that the price charged for Citrus Hill gradually rises every couple of weeks.
# In weeks 230 to 235 Citrus Hill sold for about $1.70, and customer brand loyalty was between 0.50 and 0.75.
# Five weeks later, at week 240, the price of Citrus Hill had risen by about $0.05, from $1.70 to $1.75, where customer brand loyalty was around 0.75.
# But where customer brand loyalty averaged between 0.25 and 0.50, the price rose by noticeably more than $0.05 over the same period.
# In that spike the price of Citrus Hill in week 240 was around $1.80, which is $0.10 more than in week 230.
# The price seems to follow a pattern and rises as the weeks pass:
# in week 230 Citrus Hill was $1.70 and by week 280 the price is around $2.10, a $0.40 increase over a span of 50 weeks.

# 2 Scatter plot of the price charged for Minute Maid by week of purchase,
# with points coloured by LoyalCH (customer brand loyalty).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceMM, colour = LoyalCH)) +
  geom_point()

# Prices for Minute Maid in week 230 vary mainly with customer brand loyalty levels.
# We observe lower prices where brand loyalty is between 0.25 and 0.50 and, conversely, higher prices
# where brand loyalty is high, in the range of 0.50 to 0.75.
# As depicted by the prices for Minute Maid in week 230, there were about three price points for the juice in question,
# listed by loyalty strength and respective price range:
# prices were $1.70, $1.80 and $2.00 respectively.
# Similarly, prices increase with each passing week, and prices differ within the same week due to
# differences in customer brand loyalty rating.


# 3 Scatter plot of the price charged for Citrus Hill by week, per store.
# STORE is a numeric column (see the str(OJ) output), so it is wrapped in
# factor() to give each store its own discrete colour instead of a position
# on a continuous gradient. labs() keeps the legend title as "STORE".
ggplot(OJ, aes(x = WeekofPurchase,
               y = PriceCH,
               colour = factor(STORE))) +
  geom_point() +
  labs(colour = "STORE")

# From this plot it becomes clear that the price charged for Citrus Hill varied per store.
# We can read from the graph that stores 0, 1 and 2 charged around $1.70 during week 230,
# whereas stores 3 and 4 charged significantly more, at around $1.80:
# $0.10 higher than the others, mainly due to the customer brand loyalty levels of those two stores, since we saw that the stores that priced high had a higher loyalty level.

# 4 Scatter plot of the price charged for Minute Maid by week, per store.
# STORE is a numeric column (see the str(OJ) output), so it is wrapped in
# factor() to give each store its own discrete colour instead of a position
# on a continuous gradient. The x-axis label typo ("Purchace") is corrected.
ggplot(OJ, aes(x = WeekofPurchase,
               y = PriceMM,
               colour = factor(STORE))) +
  geom_point() +
  labs(x = "Week of Purchase",
       y = "Price Charged Minute Maid",
       colour = "STORE",
       title = "Scatter plot of price charged for Minute Maid by week per Store")

# As expected there are price variations between stores 0, 1 and 2 and stores 3 and 4.
# Minute Maid was also priced significantly higher at stores 3 and 4 than at stores 0, 1 and 2.
# We suspect this is mainly due to customer brand loyalty differences between the two groups of stores.
# In week 230 Minute Maid was priced at around $1.70 at stores 0, 1 and 2 but at around
# $1.80 at stores 3 and 4.
# This leads us to suggest that loyalty levels affect the level of price sensitivity at different stores.



# 5 Scatter plot of the price charged for Citrus Hill per week, with points
# coloured by the Citrus Hill sale price (SalePriceCH).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceCH, colour = SalePriceCH)) +
  geom_point()

# This shows us that on average in week 230 the sale price was around $1.80-$2.00,
# and as the weeks continue that remained the average until around week 265, where we see the price drop to around $1.40-$1.60.
# So in conclusion the sale price of Citrus Hill was set at around $2 from week 230 until 280, with slight variations mainly due to
# customer brand loyalty levels.

# Stack the two price histograms in one figure (2 rows, 1 column). par()
# returns the settings it replaces; they are kept so the layout that was
# active before can be restored afterwards instead of assuming it was 1 x 1.
old_par <- par(mfrow = c(2, 1))

# Histogram of prices charged for Citrus Hill
hist(OJ$PriceCH,
     col = "red",
     main = "Hist of Price Charged for Citrus Hill",
     xlab = "Prices:in Dollars")

# Histogram of prices charged for Minute Maid
hist(OJ$PriceMM,
     col = "blue",
     main = "Hist of Price Charged for Minute Maid",
     xlab = "Prices:in Dollars")

# From the two graphs we can see that Minute Maid is actually charged a bit
# more than Citrus Hill: Citrus Hill's charged price ranges from $1.69 to
# $2.09, while Minute Maid's ranges from $1.69 to $2.29, i.e. 20 cents higher
# than the Citrus Hill maximum.

# Restore the plotting layout that was in effect before the histograms.
par(old_par)

# 6 Boxplots
# Horizontal boxplot of the prices charged for each brand. The box colour is
# set through `col`; the former `fill` argument was dropped because it is not
# an argument of boxplot(), and TRUE is spelled out instead of the
# reassignable shorthand T.
boxplot(OJ$PriceCH,
        col = "green",
        horizontal = TRUE,
        xlab = "Prices :in Dollars",
        main = "Boxplot of Prices Charged for Citrus Hill")

boxplot(OJ$PriceMM,
        col = "yellow",
        horizontal = TRUE,
        xlab = "Prices :in Dollars",
        main = "Boxplot of Prices Charged for Minute Maid")

# The box plot suggests that the median price charged for Minute Maid is around $2.10
# (the centre line of a box plot marks the median, not the mean),
# with a lower price limit of $1.80 and an outlier at around $1.69.

# On the other hand, it suggests that the median price charged for Citrus Hill is around $1.85,
# with a lower price limit of $1.70.


# 7 Scatter plot of the price charged for Citrus Hill by week of purchase,
# with points coloured by the Citrus Hill discount (DiscCH).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceCH, colour = DiscCH)) +
  geom_point()

# Basically we can say there is little, if any, discount offered for Citrus Hill.
# The typical discount on Citrus Hill is 0.0, with a few instances where the discount is around 0.5 during
# weeks 263-265 and 0.3 during weeks 251-256.
# The mean discount over the entire time frame for Citrus Hill is 0.05185981.


# 8 Scatter plot of the price charged for Minute Maid by week of purchase,
# with points coloured by the Minute Maid discount (DiscMM).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceMM, colour = DiscMM)) +
  geom_point()

# For Minute Maid some discount was offered; the average discount offered for Minute Maid is 0.1233645.


# 9 Scatter plot of the price charged for Citrus Hill by week of purchase,
# with points coloured by the Store7 factor (No / Yes).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceCH, colour = Store7)) +
  geom_point()

# Count how many rows fall in each Store7 level.
summary(OJ[["Store7"]])
##  No Yes 
## 714 356
# In the 50 week period most sales did not happen at Store 7.
# There were 356 sales at Store 7 out of 1070 sales in total, so about 33% of the sales happened at Store 7
# and about 67% happened elsewhere.
# Note that these counts cover every purchase in the data (Citrus Hill and Minute Maid alike), so we can say
# a purchase had about a 33% chance of being made at Store 7 in the 50 week period,
# with a concentration of Store 7 sales of Citrus Hill during weeks 250-265.


# 10 Scatter plot of the price charged for Minute Maid by week of purchase,
# with points coloured by the Store7 factor (No / Yes).
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase, y = PriceMM, colour = Store7)) +
  geom_point()

# Similar to Citrus Hill, Minute Maid was on average not sold at Store 7, with exceptions during weeks
# 260-275, where most sales happened at Store 7.
# But compared to Citrus Hill, Store 7 sold quite a bit of Minute Maid during this 50 week period.


# 11 MODELING #
# Model 1: linear regression of the Citrus Hill sale price (SalePriceCH) on
# the week, store and Citrus Hill price/discount/loyalty variables listed in
# the formula; ListPriceDiff is left out.
MD1 <- lm(
  formula = SalePriceCH ~ WeekofPurchase + StoreID + PriceCH + DiscCH +
    SpecialCH + LoyalCH + PriceDiff + Store7 + PctDiscCH + STORE,
  data = OJ
)

# Print the coefficient table and fit statistics for model 1.
summary(MD1)
## 
## Call:
## lm(formula = SalePriceCH ~ WeekofPurchase + StoreID + PriceCH + 
##     DiscCH + SpecialCH + LoyalCH + PriceDiff + Store7 + PctDiscCH + 
##     STORE, data = OJ)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.111e-13 -5.900e-17  8.200e-17  2.410e-16  4.872e-15 
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)    -2.958e-15  2.337e-15 -1.266e+00    0.206    
## WeekofPurchase  4.003e-18  1.235e-17  3.240e-01    0.746    
## StoreID         2.704e-16  1.667e-16  1.622e+00    0.105    
## PriceCH         1.000e+00  2.104e-15  4.752e+14   <2e-16 ***
## DiscCH         -1.000e+00  2.332e-14 -4.288e+13   <2e-16 ***
## SpecialCH      -5.263e-17  4.019e-16 -1.310e-01    0.896    
## LoyalCH        -1.002e-16  3.600e-16 -2.780e-01    0.781    
## PriceDiff      -2.412e-16  4.382e-16 -5.500e-01    0.582    
## Store7Yes      -1.047e-15  8.419e-16 -1.243e+00    0.214    
## PctDiscCH       1.052e-14  4.439e-14  2.370e-01    0.813    
## STORE                  NA         NA         NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.426e-15 on 1060 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.08e+29 on 9 and 1060 DF,  p-value: < 2.2e-16
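# Adjusted R-squared: 1. A perfect fit like this is very rare, and here it has a simple explanation:
# the coefficients are exactly 1 for PriceCH and -1 for DiscCH while every other estimate is ~0,
# i.e. the model has recovered the identity SalePriceCH = PriceCH - DiscCH rather than learned anything new from the data.
# Since the fit for the Citrus Hill sale price is exact, no ANOVA tests are necessary.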

# Model 2: linear regression of the Minute Maid sale price (SalePriceMM) on
# the week, store and Minute Maid price/discount variables listed in the
# formula. ListPriceDiff is left out, and so is loyalty, since the data has
# no customer loyalty variable for Minute Maid.
MD2 <- lm(
  formula = SalePriceMM ~ WeekofPurchase + StoreID + PriceMM + DiscMM +
    SpecialMM + PriceDiff + Store7 + PctDiscMM,
  data = OJ
)

# Print the coefficient table and fit statistics for model 2.
summary(MD2)
## 
## Call:
## lm(formula = SalePriceMM ~ WeekofPurchase + StoreID + PriceMM + 
##     DiscMM + SpecialMM + PriceDiff + Store7 + PctDiscMM, data = OJ)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.047e-13 -8.500e-17  7.600e-17  4.510e-16  5.539e-15 
## 
## Coefficients:
##                  Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)    -1.063e-14  4.253e-15 -2.499e+00   0.0126 *  
## WeekofPurchase  9.761e-20  1.638e-17  6.000e-03   0.9952    
## StoreID         4.831e-16  2.510e-16  1.925e+00   0.0545 .  
## PriceMM         1.000e+00  2.149e-15  4.653e+14   <2e-16 ***
## DiscMM         -1.000e+00  2.105e-14 -4.750e+13   <2e-16 ***
## SpecialMM       4.222e-16  6.127e-16  6.890e-01   0.4909    
## PriceDiff       1.376e-16  1.692e-15  8.100e-02   0.9352    
## Store7Yes      -1.955e-15  1.288e-15 -1.517e+00   0.1295    
## PctDiscMM       3.751e-14  4.421e-14  8.480e-01   0.3964    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.307e-15 on 1061 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.145e+29 on 8 and 1061 DF,  p-value: < 2.2e-16
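# These two models are a perfect fit for predicting the sale price of both Minute Maid and Citrus Hill
# (Adjusted R-squared: 1). For Minute Maid the coefficients of 1 for PriceMM and -1 for DiscMM show that
# the fit reduces to the identity SalePriceMM = PriceMM - DiscMM.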

# 12 Scatter plot of the fitted values of model 2 (Minute Maid) against the
# week of purchase, drawn with filled circles (pch = 19).
plot(x = OJ$WeekofPurchase,
     y = fitted(MD2),
     pch = 19,
     col = "yellow",
     main = "Fitted Sales price for Minute Maid",
     xlab = "Week of Purchase",
     ylab = "Md2 fitted Prices : in Dollars")

# Scatter plot of the fitted values of model 1 (Citrus Hill) against the
# week of purchase, drawn the same way.
plot(x = OJ$WeekofPurchase,
     y = fitted(MD1),
     pch = 19,
     col = "green",
     main = "Fitted Sales price for Citrus Hill",
     xlab = "Week of Purchase",
     ylab = "Md1 fitted Prices : in Dollars")

# Compare fitted and actual prices for Citrus Hill: the fitted values of
# model 1 go on the y-axis and the actual sale price (SalePriceCH) sets the
# point colour.
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase,
                     y = fitted(MD1),
                     colour = SalePriceCH)) +
  geom_point() +
  labs(title = "Sales price for CH per week vs predicted",
       x = "Week of Purchase",
       y = "Sale price for Citrus Hill")

# As you can see, since the model is a perfect fit, the predicted values and the actual ones
# align perfectly on the scatter plot, with no deviation from the actual sale price for Citrus Hill.
# The same can be expected for Minute Maid since both models are a perfect fit.

# Same comparison for Minute Maid: the fitted values of model 2 go on the
# y-axis and the actual sale price (SalePriceMM) sets the point colour.
ggplot(data = OJ,
       mapping = aes(x = WeekofPurchase,
                     y = fitted(MD2),
                     colour = SalePriceMM)) +
  geom_point() +
  labs(title = "Sales price for MM per week vs predicted",
       x = "Week of Purchase",
       y = "Sale price for Minute Maid")

# As expected, the points align.