#loading library for market basket analysis
#loading MBA datset
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
#loading data and saving in data3 variable
data3 <- read.transactions('MBA.csv',sep = ',')
## Warning in asMethod(object): removing duplicated items in transactions
#summary of the dataset, showing frequent items mineral water, eggs,
#spagetti etc. 1 trasaction is 1754 then 20 transactions is 1.
summary(data3)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17
## 16 18 19 20
## 4 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# missing values in the dataset, showing false
is.na(data3)
## Warning in is.na(data3): is.na() applied to non-(list or vector) of type
## 'S4'
## [1] FALSE
#exploratory analysis of the data to better understand the data
#inspecting the data into two brackets
inspect(data3[1:2])
## items
## [1] {almonds,
## antioxydant juice,
## avocado,
## cottage cheese,
## energy drink,
## frozen smoothie,
## green grapes,
## green tea,
## honey,
## low fat yogurt,
## mineral water,
## olive oil,
## salad,
## salmon,
## shrimp,
## spinach,
## tomato juice,
## vegetables mix,
## whole weat flour,
## yams}
## [2] {burgers,
## eggs,
## meatballs}
#what pecentage a particular item will showup, showing support
itemFrequency(data3[,1:6])
## almonds antioxydant juice asparagus avocado
## 0.020397280 0.008932142 0.004799360 0.033328889
## babies food bacon
## 0.004532729 0.008665511
#plotting frequency wth support .20 and .10, showing .20 support mineral water purchased
#more then 20% and when the support is .10 its showing mineral water, eggs,frech fries,
#spagetti less is milk(12%)
itemFrequencyPlot(data3,support=.20)

itemFrequencyPlot(data3,support=.10)

#5 items showing highest support
itemFrequencyPlot(data3,topN=5)

#problem statement using association rule mining to find
#associations between interesting variables
#Do people buy three or two items togther, if yes what are they?
#now showing confidence - conditional probability,
# Get the rules, support minimums taken .007 and minimum confidence is
#.25 giving good 200 rules and we kept two items minimum(minlen) in the rule
m1 <- apriori(data3, parameter = list(supp = 0.007, conf = 0.25,minlen=2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.007 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 52
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [91 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [200 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
#summary of the model, rule length showing 2nd is telling 81 items showing
#if you bought item 1 you will but item 2 as well, 3rd is telling 119 items
#shoing if you are vuying 2 items you will buy 3rd item as well.
summary(m1)
## set of 200 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 81 119
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.595 3.000 3.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.007066 Min. :0.2500 Min. :1.146 Min. : 53.0
## 1st Qu.:0.007999 1st Qu.:0.2844 1st Qu.:1.539 1st Qu.: 60.0
## Median :0.009732 Median :0.3229 Median :1.754 Median : 73.0
## Mean :0.013292 Mean :0.3425 Mean :1.834 Mean : 99.7
## 3rd Qu.:0.015331 3rd Qu.:0.3899 3rd Qu.:2.010 3rd Qu.:115.0
## Max. :0.059725 Max. :0.5614 Max. :4.122 Max. :448.0
##
## mining info:
## data ntransactions support confidence
## data3 7501 0.007 0.25
#inspecting rule by sorting by high support upto 4 items, showing if buying
#spagetti 6% you will buy mineral water same for vise versa. If
#buying chocolate buying mineral water 5.3% and if buying eggs showing
#mineral water 5.1%
inspect(sort(m1, by='support')[1:4])
## lhs rhs support confidence lift
## [1] {spaghetti} => {mineral water} 0.05972537 0.3430322 1.439085
## [2] {mineral water} => {spaghetti} 0.05972537 0.2505593 1.439085
## [3] {chocolate} => {mineral water} 0.05265965 0.3213995 1.348332
## [4] {eggs} => {mineral water} 0.05092654 0.2833828 1.188845
## count
## [1] 448
## [2] 448
## [3] 395
## [4] 382
#inspecting rule by sorting by high confidence upto 4 items, showing if buying
#milk and soup 56% you will buy mineral water,if buying
#frozen vegetables and ground beef 54% you will buy mineral water. If
#buying soup,spaghetti you will buy mineral water 52% and if buying chicken,chocolate
#showing mineral water 52%
inspect(sort(m1, by='confidence')[1:4])
## lhs rhs support
## [1] {milk,soup} => {mineral water} 0.008532196
## [2] {frozen vegetables,ground beef} => {mineral water} 0.009198773
## [3] {soup,spaghetti} => {mineral water} 0.007465671
## [4] {chicken,chocolate} => {mineral water} 0.007598987
## confidence lift count
## [1] 0.5614035 2.355194 64
## [2] 0.5433071 2.279277 69
## [3] 0.5233645 2.195614 56
## [4] 0.5181818 2.173871 57
#inspecting rule by sorting by high lift upto 4 items, showing if buying
#whole wheat pasta 4.1(lift) you will likely buy olive oil,if buying
#herb & pepper 3.3(lift) you will buy ground beef. If
#buying mineral water,shrimp you will buy frozen vegetables 3.2(lift) and
#if buying frozen vegetables,spaghetti showing ground beef(3.2) likely to buy
#higher the support, confidence and lift higher is the chance of good rule
#good predictive power.
inspect(sort(m1, by='lift')[1:4])
## lhs rhs support
## [1] {whole wheat pasta} => {olive oil} 0.007998933
## [2] {herb & pepper} => {ground beef} 0.015997867
## [3] {mineral water,shrimp} => {frozen vegetables} 0.007199040
## [4] {frozen vegetables,spaghetti} => {ground beef} 0.008665511
## confidence lift count
## [1] 0.2714932 4.122410 60
## [2] 0.3234501 3.291994 120
## [3] 0.3050847 3.200616 54
## [4] 0.3110048 3.165328 65
#on the predictors you have found among the generated rules using suitable
#plots (scatter plots, box plots, etc).
#Look for multivariate outliers, non-linear relationships, etc
plot(m1[1:10],method="graph")

#With mineral water what other drinks customer buy?
#loading the dataset as csv file and then deleting the unwanted variables
data4 <- read.csv("MBA.csv")
data4$olive.oil <- NULL
data4$shrimp <- NULL
data4$spinach <- NULL
data4$almonds <- NULL
data4$avocado <- NULL
data4$vegetables.mix <- NULL
data4$green.grapes <- NULL
data4$whole.weat.flour <- NULL
data4$yams <- NULL
data4$cottage.cheese <- NULL
data4$salad <- NULL
data4$honey <- NULL
data4$salmon <- NULL
#seeing the structure of the updated dataset
str(data4)
## 'data.frame': 7500 obs. of 7 variables:
## $ energy.drink : Factor w/ 89 levels "","almonds","antioxydant juice",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ tomato.juice : Factor w/ 81 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ low.fat.yogurt : Factor w/ 67 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ green.tea : Factor w/ 51 levels "","blueberries",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ mineral.water : Factor w/ 19 levels "","candy bars",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ antioxydant.juice: Factor w/ 3 levels "","french fries",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ frozen.smoothie : Factor w/ 3 levels "","protein bar",..: 1 1 1 1 1 1 1 1 1 1 ...
#converting the variable type from factor to integer
data4$energy.drink <- as.integer(as.factor(data4$energy.drink))
data4$tomato.juice <- as.integer(as.factor(data4$tomato.juice))
data4$low.fat.yogurt <- as.integer(as.factor(data4$low.fat.yogurt))
data4$green.tea <- as.integer(as.factor(data4$green.tea))
data4$mineral.water <- as.integer(as.factor(data4$mineral.water))
data4$antioxydant.juice <- as.integer(as.factor(data4$antioxydant.juice))
data4$frozen.smoothie <- as.integer(as.factor(data4$frozen.smoothie))
#subseting the dataset to get the interesting variable
data5 <- subset(data4, select = c("mineral.water", "frozen.smoothie",
"antioxydant.juice","green.tea",
"tomato.juice","energy.drink"))
#seeing the summary of the dataset
summary(data5)
## mineral.water frozen.smoothie antioxydant.juice green.tea
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. : 1.000
## 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 1.000
## Median : 1.00 Median :1.000 Median :1.000 Median : 1.000
## Mean : 1.03 Mean :1.001 Mean :1.001 Mean : 1.524
## 3rd Qu.: 1.00 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.: 1.000
## Max. :19.00 Max. :3.000 Max. :3.000 Max. :51.000
## tomato.juice energy.drink
## Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 1.000 Median : 1.000
## Mean : 3.067 Mean : 4.902
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :81.000 Max. :89.000
#now we are seeing the corelation between mineral water and other drinks
#frozen smoothie, antioxydant.juice, green.tea, low.fat.yogurt are compartively
#related whereas tomato.juice and energy.drink are less related
cor(data5)
## mineral.water frozen.smoothie antioxydant.juice
## mineral.water 1.0000000 0.34250453 0.29250456
## frozen.smoothie 0.3425045 1.00000000 0.95256220
## antioxydant.juice 0.2925046 0.95256220 1.00000000
## green.tea 0.2738993 0.11236689 0.09507322
## tomato.juice 0.1388897 0.10326421 0.10835472
## energy.drink 0.1497351 0.03813707 0.03034197
## green.tea tomato.juice energy.drink
## mineral.water 0.27389930 0.1388897 0.14973506
## frozen.smoothie 0.11236689 0.1032642 0.03813707
## antioxydant.juice 0.09507322 0.1083547 0.03034197
## green.tea 1.00000000 0.4441522 0.38096480
## tomato.juice 0.44415222 1.0000000 0.58608582
## energy.drink 0.38096480 0.5860858 1.00000000
#plotting the data, we can see that mineral water vs frozen smothie
#and antioxydant.juice are dense left hand side whereas mineral water vs
#green tea, tamato juice and energy drink is scattered
plot(data5)

#seeting linear regression model
drink.mod <- lm(mineral.water ~ energy.drink,
data=data5)
# Summarize and print the results
summary(drink.mod)
##
## Call:
## lm(formula = mineral.water ~ energy.drink, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5601 -0.0053 -0.0053 -0.0053 17.6921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9989568 0.0071404 139.90 <2e-16 ***
## energy.drink 0.0063050 0.0004808 13.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5837 on 7498 degrees of freedom
## Multiple R-squared: 0.02242, Adjusted R-squared: 0.02229
## F-statistic: 172 on 1 and 7498 DF, p-value: < 2.2e-16
#Is there a relationship between the predictor and the response?
#The p-values for the regression coefficients are nearly zero.
#This implies statistical significance, which in turn mean that there is a relationship.
#How strong is the relationship between the predictor and the response?
#The R^{2} value indicates that about 2.2% of the variation in the
#response variable (mineral.water) is due to the predictor
#variable (energy.drink)
#Is the relationship between the predictor and the response positive or negative?
#The regression coefficient for ‘energy drink’ is positive.
#Hence, the relationship is positive.
#What are the associated 95 % confidence and prediction intervals?
#The confidence 95% interval
predict(drink.mod, data.frame(energy.drink = c(85)), interval ="confidence")
## fit lwr upr
## 1 1.534886 1.458246 1.611526
#And, the 95% prediction interval
predict(drink.mod, data.frame(energy.drink = c(85)), interval ="prediction")
## fit lwr upr
## 1 1.534886 0.3880825 2.681689
#As expected the prediction interval is wider than the confidence interval.
#Plot the response and the predictor. Use the
#abline() function to display the least squares regression line
plot(data5$mineral.water~data5$energy.drink, main ="MineralWater Vs EnergyDrink ", xlab = "MWater", ylab ="EDrink")
abline(coef = coef(drink.mod), col ="red")

#Produce diagnostic plots of the least squares regression fit.
par(mfrow=c(2,2))
plot(drink.mod)

#The first plot shows a pattern (straight line) between the residuals
#and the fitted values. This indicates a linear relationship
#between the predictor and response variables. The second plot
#shows that the residuals are non-normality distributed. The third plot
#shows that the variance of the errors is constant. Finally, the
#fourth plot indicates that there are leverage points in the data.
#Describe a problem statement using multiple linear regression
#With mineral water what other drinks customer buy?
# fit another model2, adding house and senate as predictors
drink.mod2 <- lm(mineral.water ~ energy.drink + frozen.smoothie,
data=data5)
#summary of the multi linear modle
summary(drink.mod2)
##
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie,
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0497 -0.0036 -0.0036 -0.0036 17.7197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.0424823 0.2243802 -26.93 <2e-16 ***
## energy.drink 0.0057634 0.0004524 12.74 <2e-16 ***
## frozen.smoothie 7.0403396 0.2242447 31.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5488 on 7497 degrees of freedom
## Multiple R-squared: 0.136, Adjusted R-squared: 0.1358
## F-statistic: 590.1 on 2 and 7497 DF, p-value: < 2.2e-16
# fit another model3, adding house and senate as predictors
drink.mod3 <- lm(mineral.water ~ energy.drink + frozen.smoothie + green.tea,
data=data5)
#summary of the multi linear modle
summary(drink.mod3)
##
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie +
## green.tea, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7260 -0.0006 -0.0006 -0.0006 17.6014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.6290809 0.2202096 -25.562 < 2e-16 ***
## energy.drink 0.0023084 0.0004775 4.834 1.36e-06 ***
## frozen.smoothie 6.5954883 0.2202465 29.946 < 2e-16 ***
## green.tea 0.0319061 0.0016728 19.074 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.536 on 7496 degrees of freedom
## Multiple R-squared: 0.176, Adjusted R-squared: 0.1757
## F-statistic: 533.7 on 3 and 7496 DF, p-value: < 2.2e-16
#The p-values for the regression coefficients for the variables are nearly zero.
#This implies statistical significance, which in turn mean that
#there is a relationship.
#This may be checked using ANOVA to check all the variables
anova(drink.mod,drink.mod2)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7498 2554.7
## 2 7497 2257.9 1 296.86 985.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 1 (indicating that the more
#complex model has one additional parameter), and a very small p-value (< .001).
#This means that adding the frozen smoothie to the model did lead to a significantly
#improved fit over the model 1
anova(drink.mod2,drink.mod3)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink + frozen.smoothie
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7497 2257.9
## 2 7496 2153.3 1 104.51 363.8 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 1 (indicating that the more complex
#model has one additional parameter), and a very small p-value (< .001).
anova(drink.mod,drink.mod3)
## Analysis of Variance Table
##
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7498 2554.7
## 2 7496 2153.3 2 401.37 698.6 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#As you can see, the result shows a Df of 1 (indicating that the more
#complex model has one additional parameter), and a very small p-value (< .001).
#This means that adding the green.tea to the model did lead to a significantly
#improved fit over the model 1