#loading library for market basket analysis
#loading MBA dataset
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# Read the comma-separated basket file into an arules `transactions`
# object and keep it in data3.
data3 <- read.transactions(file = "MBA.csv", sep = ",")
## Warning in asMethod(object): removing duplicated items in transactions
# Summary of the dataset: lists the most frequent items (mineral water, eggs,
# spaghetti, ...) and the basket-size distribution, e.g. 1754 transactions
# contain a single item and only 1 transaction contains 20 items.
summary(data3)
## transactions as itemMatrix in sparse format with
##  7501 rows (elements/itemsets/transactions) and
##  119 columns (items) and a density of 0.03288973 
## 
## most frequent items:
## mineral water          eggs     spaghetti  french fries     chocolate 
##          1788          1348          1306          1282          1229 
##       (Other) 
##         22405 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17 
##   16   18   19   20 
##    4    1    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.914   5.000  20.000 
## 
## includes extended item information - examples:
##              labels
## 1           almonds
## 2 antioxydant juice
## 3         asparagus
# Missing-value check. is.na() is not meaningful on an S4 `transactions`
# object (it only warns and returns a single FALSE), so instead look for
# blank or "NA" item labels and count baskets that contain no items.
any(itemLabels(data3) %in% c("", "NA"))
sum(size(data3) == 0)
# Exploratory analysis: print the first two transactions to see which
# items each basket contains.
inspect(data3[1:2])
##     items              
## [1] {almonds,          
##      antioxydant juice,
##      avocado,          
##      cottage cheese,   
##      energy drink,     
##      frozen smoothie,  
##      green grapes,     
##      green tea,        
##      honey,            
##      low fat yogurt,   
##      mineral water,    
##      olive oil,        
##      salad,            
##      salmon,           
##      shrimp,           
##      spinach,          
##      tomato juice,     
##      vegetables mix,   
##      whole weat flour, 
##      yams}             
## [2] {burgers,          
##      eggs,             
##      meatballs}
# Support of the first six items, i.e. the fraction of transactions in
# which each item appears.
itemFrequency(data3[, 1:6], type = "relative")
##           almonds antioxydant juice         asparagus           avocado 
##       0.020397280       0.008932142       0.004799360       0.033328889 
##       babies food             bacon 
##       0.004532729       0.008665511
# Item-frequency plots at two minimum-support cut-offs. At 0.20 only
# mineral water qualifies (bought in more than 20% of baskets); at 0.10 the
# plot adds eggs, french fries, spaghetti and others, down to milk (~12%).
for (min_support in c(0.20, 0.10)) {
  itemFrequencyPlot(data3, support = min_support)
}

# The 5 items with the highest support
itemFrequencyPlot(data3, topN = 5)

# Problem statement: use association rule mining to find associations
# between items. What are customers likely to buy if they purchase
# mineral water?

# Mine rules with "mineral water" fixed as the antecedent (lhs) and any
# other item allowed as the consequent (rhs). Thresholds: minimum support
# 0.001, minimum confidence 0.15 and at least 2 items per rule, which
# yields a set of 5 rules.
rules <- apriori(
  data3,
  parameter = list(support = 0.001, confidence = 0.15, minlen = 2),
  appearance = list(default = "rhs", lhs = "mineral water"),
  control = list(verbose = FALSE)
)
# Order by confidence (the conditional probability of rhs given lhs),
# strongest rule first.
rules <- sort(rules, by = "confidence", decreasing = TRUE)
summary(rules)
## set of 5 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 
## 5 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support          confidence          lift           count      
##  Min.   :0.04093   Min.   :0.1717   Min.   :1.189   Min.   :307.0  
##  1st Qu.:0.04799   1st Qu.:0.2013   1st Qu.:1.348   1st Qu.:360.0  
##  Median :0.05093   Median :0.2136   Median :1.439   Median :382.0  
##  Mean   :0.05045   Mean   :0.2116   Mean   :1.456   Mean   :378.4  
##  3rd Qu.:0.05266   3rd Qu.:0.2209   3rd Qu.:1.554   3rd Qu.:395.0  
##  Max.   :0.05973   Max.   :0.2506   Max.   :1.748   Max.   :448.0  
## 
## mining info:
##   data ntransactions support confidence
##  data3          7501   0.001       0.15
# Inspect the rules in their current order. Given mineral water in the
# basket, the customer also buys spaghetti with about 25% confidence,
# chocolate with about 22% and eggs with about 21%.
# head() is used rather than rules[1:5] so the call still works when fewer
# than five rules are mined.
inspect(head(rules, n = 5)) ## Look at the first 5 generated rules
##     lhs                rhs           support    confidence lift     count
## [1] {mineral water} => {spaghetti}   0.05972537 0.2505593  1.439085 448  
## [2] {mineral water} => {chocolate}   0.05265965 0.2209172  1.348332 395  
## [3] {mineral water} => {eggs}        0.05092654 0.2136465  1.188845 382  
## [4] {mineral water} => {milk}        0.04799360 0.2013423  1.553774 360  
## [5] {mineral water} => {ground beef} 0.04092788 0.1717002  1.747522 307
# Visualise the generated rules. (Assignment prompt: explore the predictors
# found among the generated rules using suitable plots -- scatter plots,
# box plots, etc. -- and look for multivariate outliers, non-linear
# relationships, etc.) Here the rules are drawn as a graph.
plot(rules,method="graph")

# Follow-up question: with mineral water, do customers also buy energy drink?
# Re-read the same file as a plain data frame and drop the unwanted columns.
# NOTE(review): read.csv() treats the first line as a header, so the first
# basket supplies the column names (7500 rows here vs 7501 transactions in
# data3). Each column then appears to hold whatever item sits in that
# position of a basket rather than purchases of the named item -- see the
# factor levels printed by str(). Confirm this is intended before relying on
# the models fitted from this data.
data4 <- read.csv("MBA.csv")
unwanted_cols <- c(
  "olive.oil", "shrimp", "spinach", "almonds", "avocado",
  "vegetables.mix", "green.grapes", "whole.weat.flour", "yams",
  "cottage.cheese", "salad", "honey", "salmon"
)
data4 <- data4[setdiff(names(data4), unwanted_cols)]
# Structure of the reduced dataset
str(data4)
## 'data.frame':    7500 obs. of  7 variables:
##  $ energy.drink     : Factor w/ 89 levels "","almonds","antioxydant juice",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tomato.juice     : Factor w/ 81 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ low.fat.yogurt   : Factor w/ 67 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ green.tea        : Factor w/ 51 levels "","blueberries",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ mineral.water    : Factor w/ 19 levels "","candy bars",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ antioxydant.juice: Factor w/ 3 levels "","french fries",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ frozen.smoothie  : Factor w/ 3 levels "","protein bar",..: 1 1 1 1 1 1 1 1 1 1 ...
# Replace each remaining column by its integer factor-level code. The first
# level of every column is the empty string, so code 1 means "no value".
factor_cols <- c(
  "energy.drink", "tomato.juice", "low.fat.yogurt", "green.tea",
  "mineral.water", "antioxydant.juice", "frozen.smoothie"
)
data4[factor_cols] <- lapply(
  data4[factor_cols],
  function(column) as.integer(as.factor(column))
)
# Keep only the variables of interest, with mineral.water first
data5 <- data4[, c("mineral.water", "frozen.smoothie", "antioxydant.juice",
                   "green.tea", "tomato.juice", "energy.drink")]
# Summary of the resulting dataset
summary(data5)
##  mineral.water   frozen.smoothie antioxydant.juice   green.tea     
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000     Min.   : 1.000  
##  1st Qu.: 1.00   1st Qu.:1.000   1st Qu.:1.000     1st Qu.: 1.000  
##  Median : 1.00   Median :1.000   Median :1.000     Median : 1.000  
##  Mean   : 1.03   Mean   :1.001   Mean   :1.001     Mean   : 1.524  
##  3rd Qu.: 1.00   3rd Qu.:1.000   3rd Qu.:1.000     3rd Qu.: 1.000  
##  Max.   :19.00   Max.   :3.000   Max.   :3.000     Max.   :51.000  
##   tomato.juice     energy.drink   
##  Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 1.000   Median : 1.000  
##  Mean   : 3.067   Mean   : 4.902  
##  3rd Qu.: 1.000   3rd Qu.: 1.000  
##  Max.   :81.000   Max.   :89.000
# Pairwise correlations between mineral.water and the other columns of
# data5. frozen.smoothie (0.34), antioxydant.juice (0.29) and green.tea
# (0.27) are comparatively more correlated with mineral.water, whereas
# tomato.juice (0.14) and energy.drink (0.15) are less so.
cor(data5, method = "pearson")
##                   mineral.water frozen.smoothie antioxydant.juice
## mineral.water         1.0000000      0.34250453        0.29250456
## frozen.smoothie       0.3425045      1.00000000        0.95256220
## antioxydant.juice     0.2925046      0.95256220        1.00000000
## green.tea             0.2738993      0.11236689        0.09507322
## tomato.juice          0.1388897      0.10326421        0.10835472
## energy.drink          0.1497351      0.03813707        0.03034197
##                    green.tea tomato.juice energy.drink
## mineral.water     0.27389930    0.1388897   0.14973506
## frozen.smoothie   0.11236689    0.1032642   0.03813707
## antioxydant.juice 0.09507322    0.1083547   0.03034197
## green.tea         1.00000000    0.4441522   0.38096480
## tomato.juice      0.44415222    1.0000000   0.58608582
## energy.drink      0.38096480    0.5860858   1.00000000
# Plot every pair of columns in data5. The mineral water vs frozen smoothie
# and vs antioxydant juice panels are dense on the left-hand side, whereas
# mineral water vs green tea, tomato juice and energy drink is scattered.
plot(data5)

# Fit a simple linear regression of mineral.water on energy.drink
drink.mod <- lm(formula = mineral.water ~ energy.drink, data = data5)
# Print coefficients, residual summary and fit statistics
summary(drink.mod)
## 
## Call:
## lm(formula = mineral.water ~ energy.drink, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5601 -0.0053 -0.0053 -0.0053 17.6921 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.9989568  0.0071404  139.90   <2e-16 ***
## energy.drink 0.0063050  0.0004808   13.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5837 on 7498 degrees of freedom
## Multiple R-squared:  0.02242,    Adjusted R-squared:  0.02229 
## F-statistic:   172 on 1 and 7498 DF,  p-value: < 2.2e-16
#Is there a relationship between the predictor and the response?
#The p-values for the regression coefficients are nearly zero. 
#This implies statistical significance, which in turn mean that there is a relationship.

#How strong is the relationship between the predictor and the response?
#The R^{2} value indicates that about 2.2% of the variation in the 
#response variable (mineral.water) is due to the predictor 
#variable (energy.drink)

#Is the relationship between the predictor and the response positive or negative?
#The regression coefficient for ‘energy drink’ is positive.
#Hence, the relationship is positive.

# What are the associated 95% confidence and prediction intervals?
# 95% confidence interval for the fitted mean at energy.drink = 85
predict(drink.mod, newdata = data.frame(energy.drink = 85),
        interval = "confidence", level = 0.95)
##        fit      lwr      upr
## 1 1.534886 1.458246 1.611526
# 95% prediction interval for a single new observation at energy.drink = 85
predict(drink.mod, newdata = data.frame(energy.drink = 85),
        interval = "prediction", level = 0.95)
##        fit       lwr      upr
## 1 1.534886 0.3880825 2.681689
#As expected the prediction interval is wider than the confidence interval.

#Write out the model in equation form
#The equation has the form Y= a + bX, where Y is the dependent variable 
#(that's the variable that goes on the Y axis), X is the independent variable 
#(i.e. it is plotted on the X axis), b is the slope of the line and a is the y-intercept.
#mineral.water = a + b(energy.drink), i.e. mineral.water = 0.9990 + 0.0063 * energy.drink

# Plot the response (mineral.water, y-axis) against the predictor
# (energy.drink, x-axis) and overlay the least squares regression line from
# drink.mod. The axis labels follow the formula: y ~ x.
plot(mineral.water ~ energy.drink, data = data5,
     main = "MineralWater Vs EnergyDrink ",
     xlab = "EDrink", ylab = "MWater")
abline(drink.mod, col = "red")

# Produce diagnostic plots of the least squares regression fit, laid out
# in a 2 x 2 grid. par() returns the previous settings, which are restored
# afterwards so later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(drink.mod)
par(old_par)

#The first plot shows a pattern (straight line) between the residuals
#and the fitted values. This indicates a linear relationship 
#between the predictor and response variables. The second plot 
#shows that the residuals are not normally distributed. The third plot 
#shows that the variance of the errors is constant. Finally, the 
#fourth plot indicates that there are leverage points in the data.

# Problem statement for multiple linear regression:
# with mineral water, what other drinks do customers buy?
# Model 2 adds frozen.smoothie as a second predictor next to energy.drink.
drink.mod2 <- lm(
  formula = mineral.water ~ energy.drink + frozen.smoothie,
  data = data5
)
# Summary of the two-predictor model
summary(drink.mod2)
## 
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie, 
##     data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0497 -0.0036 -0.0036 -0.0036 17.7197 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6.0424823  0.2243802  -26.93   <2e-16 ***
## energy.drink     0.0057634  0.0004524   12.74   <2e-16 ***
## frozen.smoothie  7.0403396  0.2242447   31.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5488 on 7497 degrees of freedom
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1358 
## F-statistic: 590.1 on 2 and 7497 DF,  p-value: < 2.2e-16
# Model 3 adds green.tea as a third predictor.
drink.mod3 <- lm(
  formula = mineral.water ~ energy.drink + frozen.smoothie + green.tea,
  data = data5
)
# Summary of the three-predictor model
summary(drink.mod3)
## 
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie + 
##     green.tea, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7260 -0.0006 -0.0006 -0.0006 17.6014 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.6290809  0.2202096 -25.562  < 2e-16 ***
## energy.drink     0.0023084  0.0004775   4.834 1.36e-06 ***
## frozen.smoothie  6.5954883  0.2202465  29.946  < 2e-16 ***
## green.tea        0.0319061  0.0016728  19.074  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.536 on 7496 degrees of freedom
## Multiple R-squared:  0.176,  Adjusted R-squared:  0.1757 
## F-statistic: 533.7 on 3 and 7496 DF,  p-value: < 2.2e-16
#The p-values for the regression coefficients for the variables are nearly zero. 
#This implies statistical significance, which in turn mean that 
#there is a relationship.

# Compare the nested models with an ANOVA F-test: does adding
# frozen.smoothie (model 2) improve on the energy.drink-only model (model 1)?
anova(drink.mod,drink.mod2)
## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7498 2554.7                                 
## 2   7497 2257.9  1    296.86 985.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 1 (indicating that the more 
#complex model has one additional parameter), and a very small p-value (< .001). 
#This means that adding the frozen smoothie to the model did lead to a significantly 
#improved fit over the model 1
# Same comparison for model 2 against model 3, which adds green.tea.
anova(drink.mod2,drink.mod3)
## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink + frozen.smoothie
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7497 2257.9                                 
## 2   7496 2153.3  1    104.51 363.8 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 1 (indicating that the more complex 
#model has one additional parameter), and a very small p-value (< .001). 
# Model 1 against model 3: tests frozen.smoothie and green.tea jointly.
anova(drink.mod,drink.mod3)
## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7498 2554.7                                 
## 2   7496 2153.3  2    401.37 698.6 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 2 (indicating that the more 
#complex model has two additional parameters), and a very small p-value (< .001). 
#This means that adding frozen.smoothie and green.tea to the model did lead to a 
#significantly improved fit over the model 1

# v. Check if there are any interactions between predictors in determining
# the outcome. If there are, use the * and : symbols to fit linear
# regression models with interaction effects.
# Do any interactions appear statistically significant?
#
# Every column of data5 except energy.drink enters as a main effect
# (`. - energy.drink`), plus the frozen.smoothie:antioxydant.juice
# interaction.
# NOTE(review): this reuses the name drink.mod3 and so replaces the
# three-predictor model fitted earlier.
drink.mod3 <- lm(
  formula = mineral.water ~ . - energy.drink +
    frozen.smoothie:antioxydant.juice,
  data = data5
)
summary(drink.mod3)
## 
## Call:
## lm(formula = mineral.water ~ . - energy.drink + frozen.smoothie:antioxydant.juice, 
##     data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6543 -0.0078 -0.0078 -0.0078 17.6704 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       -2.033e+01  1.692e+00 -12.018  < 2e-16
## frozen.smoothie                    2.607e+01  1.715e+00  15.197  < 2e-16
## antioxydant.juice                  1.415e-01  9.209e-01   0.154    0.878
## green.tea                          3.243e-02  1.719e-03  18.866  < 2e-16
## tomato.juice                       7.691e-04  6.829e-04   1.126    0.260
## frozen.smoothie:antioxydant.juice -4.908e+00  5.974e-01  -8.216 2.46e-16
##                                      
## (Intercept)                       ***
## frozen.smoothie                   ***
## antioxydant.juice                    
## green.tea                         ***
## tomato.juice                         
## frozen.smoothie:antioxydant.juice ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5311 on 7494 degrees of freedom
## Multiple R-squared:  0.1911, Adjusted R-squared:  0.1905 
## F-statistic: 354.1 on 5 and 7494 DF,  p-value: < 2.2e-16
# Same main effects as the previous interaction model, with a second
# interaction term green.tea:tomato.juice added. The trailing `+ green.tea`
# repeats a term already covered by `.`; green.tea appears only once in the
# coefficient table.
drink.mod4 <- lm(
  formula = mineral.water ~ . - energy.drink +
    frozen.smoothie:antioxydant.juice +
    green.tea:tomato.juice + green.tea,
  data = data5
)
summary(drink.mod4)
## 
## Call:
## lm(formula = mineral.water ~ . - energy.drink + frozen.smoothie:antioxydant.juice + 
##     green.tea:tomato.juice + green.tea, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7174 -0.0033 -0.0033 -0.0033 17.6086 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       -1.759e+01  1.684e+00 -10.442  < 2e-16
## frozen.smoothie                    2.372e+01  1.705e+00  13.915  < 2e-16
## antioxydant.juice                 -1.332e+00  9.169e-01  -1.453    0.146
## green.tea                          6.452e-02  2.945e-03  21.913  < 2e-16
## tomato.juice                       4.875e-03  7.418e-04   6.572 5.31e-11
## frozen.smoothie:antioxydant.juice -3.864e+00  5.956e-01  -6.487 9.31e-11
## green.tea:tomato.juice            -9.591e-04  7.187e-05 -13.345  < 2e-16
##                                      
## (Intercept)                       ***
## frozen.smoothie                   ***
## antioxydant.juice                    
## green.tea                         ***
## tomato.juice                      ***
## frozen.smoothie:antioxydant.juice ***
## green.tea:tomato.juice            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5249 on 7493 degrees of freedom
## Multiple R-squared:  0.2099, Adjusted R-squared:  0.2092 
## F-statistic: 331.7 on 6 and 7493 DF,  p-value: < 2.2e-16
# Third interaction model: drops the frozen.smoothie main effect and adds a
# green.tea:antioxydant.juice interaction. antioxydant.juice is removed and
# then added back in the formula, so it still enters as a main effect.
# NOTE(review): the printed residual standard error and R-squared match
# drink.mod4 exactly, so this looks like an equivalent fit rather than an
# improvement -- confirm before preferring it.
drink.mod5 <- lm(
  formula = mineral.water ~ . - energy.drink - frozen.smoothie -
    antioxydant.juice + green.tea:antioxydant.juice +
    green.tea:tomato.juice + antioxydant.juice:frozen.smoothie +
    antioxydant.juice + green.tea,
  data = data5
)
summary(drink.mod5)
## 
## Call:
## lm(formula = mineral.water ~ . - energy.drink - frozen.smoothie - 
##     antioxydant.juice + green.tea:antioxydant.juice + green.tea:tomato.juice + 
##     antioxydant.juice:frozen.smoothie + antioxydant.juice + green.tea, 
##     data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7174 -0.0033 -0.0033 -0.0033 17.6086 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        2.774e+00  5.880e-01   4.718 2.43e-06
## green.tea                         -1.592e-01  1.680e-02  -9.477  < 2e-16
## tomato.juice                       4.875e-03  7.418e-04   6.572 5.31e-11
## antioxydant.juice                 -2.898e+00  8.710e-01  -3.328 0.000880
## antioxydant.juice:green.tea        2.238e-01  1.608e-02  13.915  < 2e-16
## green.tea:tomato.juice            -9.591e-04  7.187e-05 -13.345  < 2e-16
## frozen.smoothie:antioxydant.juice  1.059e+00  3.106e-01   3.411 0.000651
##                                      
## (Intercept)                       ***
## green.tea                         ***
## tomato.juice                      ***
## antioxydant.juice                 ***
## antioxydant.juice:green.tea       ***
## green.tea:tomato.juice            ***
## frozen.smoothie:antioxydant.juice ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5249 on 7493 degrees of freedom
## Multiple R-squared:  0.2099, Adjusted R-squared:  0.2092 
## F-statistic: 331.7 on 6 and 7493 DF,  p-value: < 2.2e-16
#Of the 3 interaction models, the last model is the only one with all variables being significant. 
#And, based on results from a few trials not shown here, it is very likely that it is the best
#combination of predictors and interaction terms. The R-squared statistic estimates that about 21% 
#of the variation in the response can be explained by this particular set of predictors 
#(single and interaction). A higher value was not obtained from the trials.

# If there are any non-linear relationships, try a few different
# transformations of the variables (log(x), sqrt(x), x^2) to see if they
# yield a better fitting model, and comment on the findings.
# A log transform of each predictor is used here.
# The energy.drink fit is kept in `a` and printed as well, so that it is
# reported alongside the other log models instead of being silently dropped.
a <- lm(formula = mineral.water ~ log(energy.drink), data = data5)
a
# mineral.water regressed on log(green.tea)
lm(formula = mineral.water ~ log(green.tea), data = data5)
## 
## Call:
## lm(formula = mineral.water ~ log(green.tea), data = data5)
## 
## Coefficients:
##    (Intercept)  log(green.tea)  
##         1.0025          0.4272
# mineral.water regressed on log(frozen.smoothie)
lm(formula = mineral.water ~ log(frozen.smoothie), data = data5)
## 
## Call:
## lm(formula = mineral.water ~ log(frozen.smoothie), data = data5)
## 
## Coefficients:
##          (Intercept)  log(frozen.smoothie)  
##                1.026                11.861
# mineral.water regressed on log(antioxydant.juice)
lm(formula = mineral.water ~ log(antioxydant.juice), data = data5)
## 
## Call:
## lm(formula = mineral.water ~ log(antioxydant.juice), data = data5)
## 
## Coefficients:
##            (Intercept)  log(antioxydant.juice)  
##                  1.026                   9.160
# mineral.water regressed on log(tomato.juice)
lm(formula = mineral.water ~ log(tomato.juice), data = data5)
## 
## Call:
## lm(formula = mineral.water ~ log(tomato.juice), data = data5)
## 
## Coefficients:
##       (Intercept)  log(tomato.juice)  
##            1.0043             0.1385
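#Of the log-transformed fits printed above, log(frozen.smoothie) has the largest slope
#(11.861, with an intercept of 1.026). This is a regression coefficient, not a percentage,
#and its size alone does not show a better fit; compare R-squared from summary() for that.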