# Load the libraries used for market basket analysis (arules, arulesViz)
# and then load the MBA dataset.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Read MBA.csv as market-basket transactions (one comma-separated basket per
# line) and keep the result in data3 for the arules steps that follow.
data3 <- read.transactions(
  file = "MBA.csv",
  format = "basket",
  sep = ","
)
## Warning in asMethod(object): removing duplicated items in transactions
# Summary of the dataset. The most frequent items are mineral water, eggs,
# spaghetti, french fries and chocolate. In the length distribution, 1754
# transactions contain a single item and only 1 transaction contains 20 items.
summary(data3)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17
## 16 18 19 20
## 4 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# Missing-value check.
# is.na() has no method for the S4 'transactions' class: it only raises the
# "applied to non-(list or vector) of type 'S4'" warning and returns a single
# FALSE, which says nothing about the data. Check the two things that can
# actually be missing in basket data instead.
# The summary of data3 reports a minimum transaction length of 1, so the
# second check is expected to be FALSE for this file.
anyNA(itemLabels(data3)) # TRUE if any item label is NA
any(size(data3) == 0)    # TRUE if any transaction is empty
# Exploratory analysis to better understand the data.
# Print the first two transactions (baskets) together with their items.
inspect(data3[1:2])
## items
## [1] {almonds,
## antioxydant juice,
## avocado,
## cottage cheese,
## energy drink,
## frozen smoothie,
## green grapes,
## green tea,
## honey,
## low fat yogurt,
## mineral water,
## olive oil,
## salad,
## salmon,
## shrimp,
## spinach,
## tomato juice,
## vegetables mix,
## whole weat flour,
## yams}
## [2] {burgers,
## eggs,
## meatballs}
# Support of individual items: the fraction of all transactions in which an
# item appears. Shown here for the first six items (columns) only.
itemFrequency(data3[, 1:6], type = "relative")
## almonds antioxydant juice asparagus avocado
## 0.020397280 0.008932142 0.004799360 0.033328889
## babies food bacon
## 0.004532729 0.008665511
# Item-frequency bar charts at two minimum-support cut-offs. At 0.20 only
# mineral water qualifies (1788 of 7501 transactions); at 0.10 the chart also
# shows eggs, french fries and spaghetti, with milk (about 12% in the original
# run) among the least frequent items displayed.
invisible(lapply(
  c(0.20, 0.10),
  function(min_support) itemFrequencyPlot(data3, support = min_support)
))

# The five items with the highest support.
itemFrequencyPlot(data3, topN = 5)

# Problem statement: use association rule mining to find associations between
# items. Question: what are customers likely to buy if they purchase mineral
# water?
# Mine rules whose left-hand side is fixed to {mineral water}, with minimum
# support 0.001, minimum confidence 0.15 (confidence is the conditional
# probability of the right-hand side given the left-hand side) and at least
# two items per rule. For this data the result is a set of 5 rules.
rules <- apriori(
  data3,
  parameter = list(support = 0.001, confidence = 0.15, minlen = 2),
  appearance = list(default = "rhs", lhs = "mineral water"),
  control = list(verbose = FALSE) # FALSE, not F: F is a reassignable variable
)
# Order the rules from highest to lowest confidence before reporting them.
rules <- sort(rules, by = "confidence", decreasing = TRUE)
summary(rules)
## set of 5 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 5
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.04093 Min. :0.1717 Min. :1.189 Min. :307.0
## 1st Qu.:0.04799 1st Qu.:0.2013 1st Qu.:1.348 1st Qu.:360.0
## Median :0.05093 Median :0.2136 Median :1.439 Median :382.0
## Mean :0.05045 Mean :0.2116 Mean :1.456 Mean :378.4
## 3rd Qu.:0.05266 3rd Qu.:0.2209 3rd Qu.:1.554 3rd Qu.:395.0
## Max. :0.05973 Max. :0.2506 Max. :1.748 Max. :448.0
##
## mining info:
## data ntransactions support confidence
## data3 7501 0.001 0.15
# Inspect the strongest rules (already sorted by confidence): a customer who
# buys mineral water also buys spaghetti with about 25% confidence, chocolate
# with about 22% and eggs with about 21%.
# head() is used instead of a fixed 1:5 index so the call does not depend on
# the thresholds above yielding at least five rules.
inspect(head(rules, n = 5))
## lhs rhs support confidence lift count
## [1] {mineral water} => {spaghetti} 0.05972537 0.2505593 1.439085 448
## [2] {mineral water} => {chocolate} 0.05265965 0.2209172 1.348332 395
## [3] {mineral water} => {eggs} 0.05092654 0.2136465 1.188845 382
## [4] {mineral water} => {milk} 0.04799360 0.2013423 1.553774 360
## [5] {mineral water} => {ground beef} 0.04092788 0.1717002 1.747522 307
# Visualise the generated rules with a suitable plot; here the rules are drawn
# as a graph. (Task prompt: explore the predictors found among the generated
# rules and look for multivariate outliers, non-linear relationships, etc.)
plot(rules,method="graph")

# Question: with mineral water, do customers also buy energy drinks?
# Re-read the same file as a plain CSV data frame and drop the columns that
# are not of interest.
# NOTE(review): read.csv() uses the first line of MBA.csv (the first basket)
# as the header, so the column names are that basket's items and each column
# holds whatever item sits at that position in a basket (see the factor levels
# in the str() output), not a per-item purchase indicator. Confirm this is
# intended before relying on the models built from data4.
data4 <- read.csv("MBA.csv")
data4 <- data4[setdiff(
  names(data4),
  c(
    "olive.oil", "shrimp", "spinach", "almonds", "avocado",
    "vegetables.mix", "green.grapes", "whole.weat.flour", "yams",
    "cottage.cheese", "salad", "honey", "salmon"
  )
)]
# Show the structure of the reduced data frame (column types and levels).
str(data4)
## 'data.frame': 7500 obs. of 7 variables:
## $ energy.drink : Factor w/ 89 levels "","almonds","antioxydant juice",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ tomato.juice : Factor w/ 81 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ low.fat.yogurt : Factor w/ 67 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ green.tea : Factor w/ 51 levels "","blueberries",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ mineral.water : Factor w/ 19 levels "","candy bars",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ antioxydant.juice: Factor w/ 3 levels "","french fries",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ frozen.smoothie : Factor w/ 3 levels "","protein bar",..: 1 1 1 1 1 1 1 1 1 1 ...
# Replace each remaining column by the integer codes of its factor levels.
# NOTE(review): the codes are level indices (code 1 is the empty string, per
# the str() output), not quantities or 0/1 purchase flags - confirm that
# treating them as numeric predictors is intended.
data4 <- local({
  coded <- data4
  for (column in c(
    "energy.drink", "tomato.juice", "low.fat.yogurt", "green.tea",
    "mineral.water", "antioxydant.juice", "frozen.smoothie"
  )) {
    coded[[column]] <- as.integer(as.factor(coded[[column]]))
  }
  coded
})
# Keep only the columns of interest for the regression analysis. Note that
# low.fat.yogurt is not carried over into data5.
data5 <- data4[
  ,
  c(
    "mineral.water", "frozen.smoothie", "antioxydant.juice",
    "green.tea", "tomato.juice", "energy.drink"
  ),
  drop = FALSE
]
# Summary statistics (min, quartiles, mean, max) of each column in data5.
summary(data5)
## mineral.water frozen.smoothie antioxydant.juice green.tea
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. : 1.000
## 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 1.000
## Median : 1.00 Median :1.000 Median :1.000 Median : 1.000
## Mean : 1.03 Mean :1.001 Mean :1.001 Mean : 1.524
## 3rd Qu.: 1.00 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.: 1.000
## Max. :19.00 Max. :3.000 Max. :3.000 Max. :51.000
## tomato.juice energy.drink
## Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 1.000 Median : 1.000
## Mean : 3.067 Mean : 4.902
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :81.000 Max. :89.000
# Pairwise Pearson correlations between mineral water and the other drinks.
# frozen.smoothie, antioxydant.juice and green.tea show the larger
# correlations with mineral.water (about 0.27 to 0.34), whereas tomato.juice
# and energy.drink show smaller ones (about 0.14 to 0.15).
cor(data5, method = "pearson")
## mineral.water frozen.smoothie antioxydant.juice
## mineral.water 1.0000000 0.34250453 0.29250456
## frozen.smoothie 0.3425045 1.00000000 0.95256220
## antioxydant.juice 0.2925046 0.95256220 1.00000000
## green.tea 0.2738993 0.11236689 0.09507322
## tomato.juice 0.1388897 0.10326421 0.10835472
## energy.drink 0.1497351 0.03813707 0.03034197
## green.tea tomato.juice energy.drink
## mineral.water 0.27389930 0.1388897 0.14973506
## frozen.smoothie 0.11236689 0.1032642 0.03813707
## antioxydant.juice 0.09507322 0.1083547 0.03034197
## green.tea 1.00000000 0.4441522 0.38096480
## tomato.juice 0.44415222 1.0000000 0.58608582
## energy.drink 0.38096480 0.5860858 1.00000000
# Scatter-plot matrix of data5. The panels of mineral water vs frozen smoothie
# and antioxydant juice are dense on the left-hand side, whereas mineral water
# vs green tea, tomato juice and energy drink are more scattered.
plot(data5)

# Fit a simple linear regression with mineral.water as the response and
# energy.drink as the only predictor.
drink.mod <- lm(formula = mineral.water ~ energy.drink, data = data5)
# Print the coefficient table and the fit statistics.
summary(drink.mod)
##
## Call:
## lm(formula = mineral.water ~ energy.drink, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5601 -0.0053 -0.0053 -0.0053 17.6921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9989568 0.0071404 139.90 <2e-16 ***
## energy.drink 0.0063050 0.0004808 13.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5837 on 7498 degrees of freedom
## Multiple R-squared: 0.02242, Adjusted R-squared: 0.02229
## F-statistic: 172 on 1 and 7498 DF, p-value: < 2.2e-16
#Is there a relationship between the predictor and the response?
#The p-values for the regression coefficients are nearly zero.
#This implies statistical significance, which in turn mean that there is a relationship.
#How strong is the relationship between the predictor and the response?
#The R^{2} value indicates that about 2.2% of the variation in the
#response variable (mineral.water) is due to the predictor
#variable (energy.drink)
#Is the relationship between the predictor and the response positive or negative?
#The regression coefficient for ‘energy drink’ is positive.
#Hence, the relationship is positive.
# What are the associated 95% confidence and prediction intervals?
# 95% confidence interval for the mean response at energy.drink = 85
predict(
  drink.mod,
  newdata = data.frame(energy.drink = 85),
  interval = "confidence",
  level = 0.95
)
## fit lwr upr
## 1 1.534886 1.458246 1.611526
# 95% prediction interval for a single new observation at energy.drink = 85
predict(
  drink.mod,
  newdata = data.frame(energy.drink = 85),
  interval = "prediction",
  level = 0.95
)
## fit lwr upr
## 1 1.534886 0.3880825 2.681689
#As expected the prediction interval is wider than the confidence interval.
#Write out the model in equation form
#The equation has the form Y = a + bX, where Y is the dependent variable
#(the variable that goes on the Y axis), X is the independent variable
#(i.e. it is plotted on the X axis), b is the slope of the line and a is the y-intercept.
#mineral.water = a + b(energy.drink), i.e. mineral.water = 0.9990 + 0.0063 * energy.drink
# Plot the response against the predictor and overlay the least-squares
# regression line with abline().
# The formula puts energy.drink on the x axis and mineral.water on the y axis,
# so xlab must name the energy drink and ylab the mineral water.
plot(
  mineral.water ~ energy.drink,
  data = data5,
  main = "MineralWater Vs EnergyDrink ",
  xlab = "EDrink",
  ylab = "MWater"
)
abline(coef = coef(drink.mod), col = "red")

# Produce the diagnostic plots of the least-squares fit in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots are not drawn into the same 2 x 2 layout. local() keeps the
# temporary variable out of the global workspace.
local({
  old_par <- par(mfrow = c(2, 2))
  plot(drink.mod)
  par(old_par)
})

#The first plot shows a pattern (straight line) between the residuals
#and the fitted values. This indicates a linear relationship
#between the predictor and response variables. The second plot
#shows that the residuals are not normally distributed. The third plot
#shows that the variance of the errors is constant. Finally, the
#fourth plot indicates that there are leverage points in the data.
# Problem statement for multiple linear regression:
# with mineral water, what other drinks do customers buy?
# Second model: add frozen.smoothie to energy.drink as a predictor.
drink.mod2 <- lm(
  formula = mineral.water ~ energy.drink + frozen.smoothie,
  data = data5
)
# Summary of the multiple linear model
summary(drink.mod2)
##
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie,
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0497 -0.0036 -0.0036 -0.0036 17.7197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.0424823 0.2243802 -26.93 <2e-16 ***
## energy.drink 0.0057634 0.0004524 12.74 <2e-16 ***
## frozen.smoothie 7.0403396 0.2242447 31.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5488 on 7497 degrees of freedom
## Multiple R-squared: 0.136, Adjusted R-squared: 0.1358
## F-statistic: 590.1 on 2 and 7497 DF, p-value: < 2.2e-16
# Third model: add green.tea to energy.drink and frozen.smoothie.
drink.mod3 <- lm(
  formula = mineral.water ~ energy.drink + frozen.smoothie + green.tea,
  data = data5
)
# Summary of the multiple linear model
summary(drink.mod3)
##
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie +
## green.tea, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7260 -0.0006 -0.0006 -0.0006 17.6014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.6290809 0.2202096 -25.562 < 2e-16 ***
## energy.drink 0.0023084 0.0004775 4.834 1.36e-06 ***
## frozen.smoothie 6.5954883 0.2202465 29.946 < 2e-16 ***
## green.tea 0.0319061 0.0016728 19.074 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.536 on 7496 degrees of freedom
## Multiple R-squared: 0.176, Adjusted R-squared: 0.1757
## F-statistic: 533.7 on 3 and 7496 DF, p-value: < 2.2e-16
#The p-values for the regression coefficients for the variables are nearly zero.
#This implies statistical significance, which in turn mean that
#there is a relationship.
# This can be checked by comparing the nested models with an ANOVA F-test,
# starting with the one-predictor model against the model that adds
# frozen.smoothie.
anova(drink.mod,drink.mod2)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7498 2554.7
## 2 7497 2257.9 1 296.86 985.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The result above shows a Df of 1 (the more complex model has one additional
# parameter) and a very small p-value (< .001). Adding frozen.smoothie to the
# model therefore led to a significantly improved fit over model 1.
# Next comparison: model 2 against the model that also includes green.tea.
anova(drink.mod2,drink.mod3)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink + frozen.smoothie
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7497 2257.9
## 2 7496 2153.3 1 104.51 363.8 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The result above shows a Df of 1 (the more complex model has one additional
# parameter) and a very small p-value (< .001).
# Final comparison: model 1 against the three-predictor model.
anova(drink.mod,drink.mod3)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7498 2554.7
## 2 7496 2153.3 2 401.37 698.6 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 2 (indicating that the more
#complex model has two additional parameters), and a very small p-value (< .001).
#This means that adding frozen.smoothie and green.tea to the model did lead to a
#significantly improved fit over the model 1
# v. Check whether there are interactions between predictors in determining
# the outcome. Interaction effects are fitted with the * and : formula
# operators. Do any interactions appear statistically significant?
# This model uses every column of data5 except energy.drink as a main effect
# and adds the frozen.smoothie x antioxydant.juice interaction.
# NOTE(review): this assignment replaces the drink.mod3 fitted earlier.
drink.mod3 <- lm(
  formula = mineral.water ~ . - energy.drink +
    frozen.smoothie:antioxydant.juice,
  data = data5
)
summary(drink.mod3)
##
## Call:
## lm(formula = mineral.water ~ . - energy.drink + frozen.smoothie:antioxydant.juice,
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6543 -0.0078 -0.0078 -0.0078 17.6704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.033e+01 1.692e+00 -12.018 < 2e-16
## frozen.smoothie 2.607e+01 1.715e+00 15.197 < 2e-16
## antioxydant.juice 1.415e-01 9.209e-01 0.154 0.878
## green.tea 3.243e-02 1.719e-03 18.866 < 2e-16
## tomato.juice 7.691e-04 6.829e-04 1.126 0.260
## frozen.smoothie:antioxydant.juice -4.908e+00 5.974e-01 -8.216 2.46e-16
##
## (Intercept) ***
## frozen.smoothie ***
## antioxydant.juice
## green.tea ***
## tomato.juice
## frozen.smoothie:antioxydant.juice ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5311 on 7494 degrees of freedom
## Multiple R-squared: 0.1911, Adjusted R-squared: 0.1905
## F-statistic: 354.1 on 5 and 7494 DF, p-value: < 2.2e-16
# Extend the interaction model with the green.tea x tomato.juice interaction.
# The trailing "+ green.tea" repeats a main effect already supplied by ".";
# the coefficient table lists green.tea only once.
drink.mod4 <- lm(
  formula = mineral.water ~ . - energy.drink +
    frozen.smoothie:antioxydant.juice +
    green.tea:tomato.juice +
    green.tea,
  data = data5
)
summary(drink.mod4)
##
## Call:
## lm(formula = mineral.water ~ . - energy.drink + frozen.smoothie:antioxydant.juice +
## green.tea:tomato.juice + green.tea, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7174 -0.0033 -0.0033 -0.0033 17.6086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.759e+01 1.684e+00 -10.442 < 2e-16
## frozen.smoothie 2.372e+01 1.705e+00 13.915 < 2e-16
## antioxydant.juice -1.332e+00 9.169e-01 -1.453 0.146
## green.tea 6.452e-02 2.945e-03 21.913 < 2e-16
## tomato.juice 4.875e-03 7.418e-04 6.572 5.31e-11
## frozen.smoothie:antioxydant.juice -3.864e+00 5.956e-01 -6.487 9.31e-11
## green.tea:tomato.juice -9.591e-04 7.187e-05 -13.345 < 2e-16
##
## (Intercept) ***
## frozen.smoothie ***
## antioxydant.juice
## green.tea ***
## tomato.juice ***
## frozen.smoothie:antioxydant.juice ***
## green.tea:tomato.juice ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5249 on 7493 degrees of freedom
## Multiple R-squared: 0.2099, Adjusted R-squared: 0.2092
## F-statistic: 331.7 on 6 and 7493 DF, p-value: < 2.2e-16
# Alternative parameterisation: drop the frozen.smoothie main effect and add
# the green.tea x antioxydant.juice interaction. antioxydant.juice is removed
# and then re-added in the formula, so it stays in the model as a main effect.
# Assigned with <- for consistency with every other model in this script.
drink.mod5 <- lm(
  formula = mineral.water ~ . - energy.drink - frozen.smoothie -
    antioxydant.juice + green.tea:antioxydant.juice +
    green.tea:tomato.juice + antioxydant.juice:frozen.smoothie +
    antioxydant.juice + green.tea,
  data = data5
)
summary(drink.mod5)
##
## Call:
## lm(formula = mineral.water ~ . - energy.drink - frozen.smoothie -
## antioxydant.juice + green.tea:antioxydant.juice + green.tea:tomato.juice +
## antioxydant.juice:frozen.smoothie + antioxydant.juice + green.tea,
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7174 -0.0033 -0.0033 -0.0033 17.6086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.774e+00 5.880e-01 4.718 2.43e-06
## green.tea -1.592e-01 1.680e-02 -9.477 < 2e-16
## tomato.juice 4.875e-03 7.418e-04 6.572 5.31e-11
## antioxydant.juice -2.898e+00 8.710e-01 -3.328 0.000880
## antioxydant.juice:green.tea 2.238e-01 1.608e-02 13.915 < 2e-16
## green.tea:tomato.juice -9.591e-04 7.187e-05 -13.345 < 2e-16
## frozen.smoothie:antioxydant.juice 1.059e+00 3.106e-01 3.411 0.000651
##
## (Intercept) ***
## green.tea ***
## tomato.juice ***
## antioxydant.juice ***
## antioxydant.juice:green.tea ***
## green.tea:tomato.juice ***
## frozen.smoothie:antioxydant.juice ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5249 on 7493 degrees of freedom
## Multiple R-squared: 0.2099, Adjusted R-squared: 0.2092
## F-statistic: 331.7 on 6 and 7493 DF, p-value: < 2.2e-16
#Of the three interaction models, the last model is the only one with all variables being significant.
#Based on results from a few trials not shown here, it is very likely the best
#combination of predictors and interaction terms. The R-squared statistic estimates that about 21%
#of the variation in the response can be explained by this particular set of predictors
#(single and interaction). A higher value was not obtained from the trials.
#Note that its residual standard error (0.5249) and R-squared (0.2099) are identical to those
#of drink.mod4, so it does not fit the data any better; only the parameterisation differs.
# If there are non-linear relationships, try transformations of the variables
# (log(x), sqrt(x), x^2) to see whether they give a better-fitting model and
# comment on the findings. Only the log transformation is tried here.
# The energy.drink fit is stored in `a` and therefore not printed; the
# green.tea fit is left unassigned so its coefficients are printed.
a <- lm(formula = mineral.water ~ log(energy.drink), data = data5)
lm(formula = mineral.water ~ log(green.tea), data = data5)
##
## Call:
## lm(formula = mineral.water ~ log(green.tea), data = data5)
##
## Coefficients:
## (Intercept) log(green.tea)
## 1.0025 0.4272
# Same log-transformed single-predictor fit for the remaining drinks; each
# call is left unassigned so the fitted coefficients are printed.
lm(formula = mineral.water ~ log(frozen.smoothie), data = data5)
##
## Call:
## lm(formula = mineral.water ~ log(frozen.smoothie), data = data5)
##
## Coefficients:
## (Intercept) log(frozen.smoothie)
## 1.026 11.861
lm(formula = mineral.water ~ log(antioxydant.juice), data = data5)
##
## Call:
## lm(formula = mineral.water ~ log(antioxydant.juice), data = data5)
##
## Coefficients:
## (Intercept) log(antioxydant.juice)
## 1.026 9.160
lm(formula = mineral.water ~ log(tomato.juice), data = data5)
##
## Call:
## lm(formula = mineral.water ~ log(tomato.juice), data = data5)
##
## Coefficients:
## (Intercept) log(tomato.juice)
## 1.0043 0.1385
#Among the log-transformed fits, log(frozen.smoothie) has the largest slope coefficient
#(11.861, with an intercept of 1.026); this is a regression coefficient, not a percentage.