mini_project2.utf8.md

#loading library for market basket analysis
#loading MBA datset
library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

## Loading required package: grid

## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus

#loading data and saving in data3 variable
data3 <- read.transactions('MBA.csv',sep = ',')

## Warning in asMethod(object): removing duplicated items in transactions

#summary of the dataset, showing the most frequent items (mineral water, eggs,
#spaghetti, etc.). 1754 transactions contain a single item and 1 transaction contains 20 items.
summary(data3)

## transactions as itemMatrix in sparse format with
##  7501 rows (elements/itemsets/transactions) and
##  119 columns (items) and a density of 0.03288973 
## 
## most frequent items:
## mineral water          eggs     spaghetti  french fries     chocolate 
##          1788          1348          1306          1282          1229 
##       (Other) 
##         22405 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17 
##   16   18   19   20 
##    4    1    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.914   5.000  20.000 
## 
## includes extended item information - examples:
##              labels
## 1           almonds
## 2 antioxydant juice
## 3         asparagus

# missing values in the dataset: is.na() is not defined for the S4
# transactions object (it only warns and returns a single FALSE), so check
# the item labels for NA and the baskets for empty transactions instead;
# FALSE means neither was found
anyNA(itemLabels(data3)) || any(size(data3) == 0)

## [1] FALSE

#exploratory analysis to better understand the data:
#inspecting the first two transactions
inspect(data3[1:2])

##     items              
## [1] {almonds,          
##      antioxydant juice,
##      avocado,          
##      cottage cheese,   
##      energy drink,     
##      frozen smoothie,  
##      green grapes,     
##      green tea,        
##      honey,            
##      low fat yogurt,   
##      mineral water,    
##      olive oil,        
##      salad,            
##      salmon,           
##      shrimp,           
##      spinach,          
##      tomato juice,     
##      vegetables mix,   
##      whole weat flour, 
##      yams}             
## [2] {burgers,          
##      eggs,             
##      meatballs}

#what percentage of transactions contain a particular item (its support), shown for the first six items
itemFrequency(data3[,1:6])

##           almonds antioxydant juice         asparagus           avocado 
##       0.020397280       0.008932142       0.004799360       0.033328889 
##       babies food             bacon 
##       0.004532729       0.008665511

#plotting item frequency with a minimum support of .20 and of .10: at .20 only mineral water
#qualifies (purchased in more than 20% of transactions); at .10 the plot shows mineral water,
#eggs, french fries, spaghetti, etc., with milk the lowest (about 12%)
itemFrequencyPlot(data3,support=.20)

itemFrequencyPlot(data3,support=.10)

#5 items showing highest support 
itemFrequencyPlot(data3,topN=5)

#problem statement: use association rule mining to find
#associations between interesting variables.
#Do people buy two or three items together and, if so, which ones?
#Mine the rules with apriori: a minimum support of .007 and a minimum
#confidence (conditional probability) of .25 give a workable set of 200
#rules; minlen = 2 keeps only rules containing at least two items
m1 <- apriori(
  data3,
  parameter = list(support = 0.007, confidence = 0.25, minlen = 2)
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   0.007      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 52 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
## sorting and recoding items ... [91 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [200 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

#summary of the model: the rule length distribution shows 81 rules of length 2
#(if you buy item 1 you are likely to buy item 2 as well) and 119 rules of length 3
#(if you buy two items you are likely to buy a third item as well).
summary(m1)

## set of 200 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3 
##  81 119 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.595   3.000   3.000 
## 
## summary of quality measures:
##     support           confidence          lift           count      
##  Min.   :0.007066   Min.   :0.2500   Min.   :1.146   Min.   : 53.0  
##  1st Qu.:0.007999   1st Qu.:0.2844   1st Qu.:1.539   1st Qu.: 60.0  
##  Median :0.009732   Median :0.3229   Median :1.754   Median : 73.0  
##  Mean   :0.013292   Mean   :0.3425   Mean   :1.834   Mean   : 99.7  
##  3rd Qu.:0.015331   3rd Qu.:0.3899   3rd Qu.:2.010   3rd Qu.:115.0  
##  Max.   :0.059725   Max.   :0.5614   Max.   :4.122   Max.   :448.0  
## 
## mining info:
##   data ntransactions support confidence
##  data3          7501   0.007       0.25

#inspecting the 4 rules with the highest support. About 6% of all transactions
#contain both spaghetti and mineral water (the same support applies to the rule
#in either direction), 5.3% contain chocolate and mineral water, and 5.1%
#contain eggs and mineral water.
inspect(sort(m1, by='support')[1:4])

##     lhs                rhs             support    confidence lift    
## [1] {spaghetti}     => {mineral water} 0.05972537 0.3430322  1.439085
## [2] {mineral water} => {spaghetti}     0.05972537 0.2505593  1.439085
## [3] {chocolate}     => {mineral water} 0.05265965 0.3213995  1.348332
## [4] {eggs}          => {mineral water} 0.05092654 0.2833828  1.188845
##     count
## [1] 448  
## [2] 448  
## [3] 395  
## [4] 382

#inspecting the 4 rules with the highest confidence: of the baskets containing
#milk and soup, 56% also contain mineral water; for frozen vegetables and
#ground beef it is 54%; for soup and spaghetti it is 52%; and for chicken and
#chocolate it is 52%.
inspect(sort(m1, by='confidence')[1:4])

##     lhs                                rhs             support    
## [1] {milk,soup}                     => {mineral water} 0.008532196
## [2] {frozen vegetables,ground beef} => {mineral water} 0.009198773
## [3] {soup,spaghetti}                => {mineral water} 0.007465671
## [4] {chicken,chocolate}             => {mineral water} 0.007598987
##     confidence lift     count
## [1] 0.5614035  2.355194 64   
## [2] 0.5433071  2.279277 69   
## [3] 0.5233645  2.195614 56   
## [4] 0.5181818  2.173871 57

#inspecting the 4 rules with the highest lift: buyers of whole wheat pasta are
#4.1 times as likely to buy olive oil as shoppers in general; buyers of
#herb & pepper are 3.3 times as likely to buy ground beef; buyers of mineral
#water and shrimp are 3.2 times as likely to buy frozen vegetables; and buyers
#of frozen vegetables and spaghetti are 3.2 times as likely to buy ground beef.
#The higher a rule's support, confidence and lift, the better the rule and
#the stronger its predictive power.
inspect(sort(m1, by='lift')[1:4])

##     lhs                              rhs                 support    
## [1] {whole wheat pasta}           => {olive oil}         0.007998933
## [2] {herb & pepper}               => {ground beef}       0.015997867
## [3] {mineral water,shrimp}        => {frozen vegetables} 0.007199040
## [4] {frozen vegetables,spaghetti} => {ground beef}       0.008665511
##     confidence lift     count
## [1] 0.2714932  4.122410  60  
## [2] 0.3234501  3.291994 120  
## [3] 0.3050847  3.200616  54  
## [4] 0.3110048  3.165328  65

#on the predictors you have found among the generated rules using suitable
#plots (scatter plots, box plots, etc).
#Look for multivariate outliers, non-linear relationships, etc
plot(m1[1:10],method="graph")

#With mineral water, what other drinks do customers buy?
#Load the dataset as a plain csv file and drop the unwanted variables.
#NOTE(review): read.csv() takes the first line of MBA.csv as column names, so
#the columns are named after the items of the first basket and each column
#holds whatever items occupy that position in later baskets (str() lists item
#names such as "almonds" among the levels of energy.drink) - confirm that
#this layout is what the analysis below intends.
data4 <- read.csv("MBA.csv")
data4 <- data4[!(names(data4) %in% c(
  "olive.oil", "shrimp", "spinach", "almonds", "avocado",
  "vegetables.mix", "green.grapes", "whole.weat.flour", "yams",
  "cottage.cheese", "salad", "honey", "salmon"
))]
#inspect the structure of the reduced dataset
str(data4)

## 'data.frame':    7500 obs. of  7 variables:
##  $ energy.drink     : Factor w/ 89 levels "","almonds","antioxydant juice",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tomato.juice     : Factor w/ 81 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ low.fat.yogurt   : Factor w/ 67 levels "","asparagus",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ green.tea        : Factor w/ 51 levels "","blueberries",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ mineral.water    : Factor w/ 19 levels "","candy bars",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ antioxydant.juice: Factor w/ 3 levels "","french fries",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ frozen.smoothie  : Factor w/ 3 levels "","protein bar",..: 1 1 1 1 1 1 1 1 1 1 ...

#replace every remaining column (the seven drink columns listed by str())
#with the integer codes of its factor levels
data4[] <- lapply(data4, function(column) as.integer(as.factor(column)))
#keep only the variables of interest; low.fat.yogurt is left out of data5
data5 <- data4[c("mineral.water", "frozen.smoothie", "antioxydant.juice",
                 "green.tea", "tomato.juice", "energy.drink")]
#seeing the summary of the dataset
summary(data5)

##  mineral.water   frozen.smoothie antioxydant.juice   green.tea     
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000     Min.   : 1.000  
##  1st Qu.: 1.00   1st Qu.:1.000   1st Qu.:1.000     1st Qu.: 1.000  
##  Median : 1.00   Median :1.000   Median :1.000     Median : 1.000  
##  Mean   : 1.03   Mean   :1.001   Mean   :1.001     Mean   : 1.524  
##  3rd Qu.: 1.00   3rd Qu.:1.000   3rd Qu.:1.000     3rd Qu.: 1.000  
##  Max.   :19.00   Max.   :3.000   Max.   :3.000     Max.   :51.000  
##   tomato.juice     energy.drink   
##  Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 1.000   Median : 1.000  
##  Mean   : 3.067   Mean   : 4.902  
##  3rd Qu.: 1.000   3rd Qu.: 1.000  
##  Max.   :81.000   Max.   :89.000

#now we look at the correlation between mineral water and the other drinks:
#frozen.smoothie, antioxydant.juice and green.tea are comparatively more
#correlated with mineral.water, whereas tomato.juice and energy.drink are less
#correlated (low.fat.yogurt was not included in data5, so it does not appear)
cor(data5)

##                   mineral.water frozen.smoothie antioxydant.juice
## mineral.water         1.0000000      0.34250453        0.29250456
## frozen.smoothie       0.3425045      1.00000000        0.95256220
## antioxydant.juice     0.2925046      0.95256220        1.00000000
## green.tea             0.2738993      0.11236689        0.09507322
## tomato.juice          0.1388897      0.10326421        0.10835472
## energy.drink          0.1497351      0.03813707        0.03034197
##                    green.tea tomato.juice energy.drink
## mineral.water     0.27389930    0.1388897   0.14973506
## frozen.smoothie   0.11236689    0.1032642   0.03813707
## antioxydant.juice 0.09507322    0.1083547   0.03034197
## green.tea         1.00000000    0.4441522   0.38096480
## tomato.juice      0.44415222    1.0000000   0.58608582
## energy.drink      0.38096480    0.5860858   1.00000000

#plotting the data, we can see that mineral water vs frozen smoothie
#and antioxydant.juice are dense on the left-hand side whereas mineral water vs
#green tea, tomato juice and energy drink are scattered
plot(data5)

#fit a simple linear regression of mineral.water on energy.drink
drink.mod <- lm(mineral.water ~ energy.drink, data = data5)
#print the coefficient table and the goodness-of-fit statistics
summary(drink.mod)

## 
## Call:
## lm(formula = mineral.water ~ energy.drink, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5601 -0.0053 -0.0053 -0.0053 17.6921 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.9989568  0.0071404  139.90   <2e-16 ***
## energy.drink 0.0063050  0.0004808   13.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5837 on 7498 degrees of freedom
## Multiple R-squared:  0.02242,    Adjusted R-squared:  0.02229 
## F-statistic:   172 on 1 and 7498 DF,  p-value: < 2.2e-16

#Is there a relationship between the predictor and the response?
#The p-values for the regression coefficients are nearly zero. 
#This implies statistical significance, which in turn mean that there is a relationship.

#How strong is the relationship between the predictor and the response?
#The R^{2} value indicates that about 2.2% of the variation in the 
#response variable (mineral.water) is due to the predictor 
#variable (energy.drink)

#Is the relationship between the predictor and the response positive or negative?
#The regression coefficient for ‘energy drink’ is positive.
#Hence, the relationship is positive.

#What are the associated 95 % confidence and prediction intervals?
#The confidence 95% interval
predict(drink.mod, data.frame(energy.drink = c(85)), interval ="confidence")

##        fit      lwr      upr
## 1 1.534886 1.458246 1.611526

#And, the 95% prediction interval
predict(drink.mod, data.frame(energy.drink = c(85)), interval ="prediction")

##        fit       lwr      upr
## 1 1.534886 0.3880825 2.681689

#As expected the prediction interval is wider than the confidence interval.

#Plot the response (mineral.water, y-axis) against the predictor
#(energy.drink, x-axis) and use abline() to overlay the least squares
#regression line. In a formula y ~ x the left-hand side is drawn on the
#y-axis, so xlab must label energy.drink and ylab must label mineral.water.
plot(mineral.water ~ energy.drink, data = data5,
     main = "MineralWater Vs EnergyDrink ", xlab = "EDrink", ylab = "MWater")
abline(coef = coef(drink.mod), col = "red")

#Produce diagnostic plots of the least squares regression fit in a 2 x 2
#grid. par() returns the previous settings, which are restored afterwards so
#that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(drink.mod)
par(old_par)

#The first plot shows a pattern (straight line) between the residuals
#and the fitted values. This indicates a linear relationship 
#between the predictor and response variables. The second plot 
#shows that the residuals are not normally distributed. The third plot 
#shows that the variance of the errors is constant. Finally, the 
#fourth plot indicates that there are leverage points in the data.

#Problem statement for multiple linear regression:
#With mineral water, what other drinks do customers buy?
#Model 2 extends drink.mod with frozen.smoothie as a second predictor;
#update() refits the same lm() call with the enlarged formula
drink.mod2 <- update(drink.mod, . ~ . + frozen.smoothie)
#summary of the multiple linear model
summary(drink.mod2)

## 
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie, 
##     data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0497 -0.0036 -0.0036 -0.0036 17.7197 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6.0424823  0.2243802  -26.93   <2e-16 ***
## energy.drink     0.0057634  0.0004524   12.74   <2e-16 ***
## frozen.smoothie  7.0403396  0.2242447   31.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5488 on 7497 degrees of freedom
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1358 
## F-statistic: 590.1 on 2 and 7497 DF,  p-value: < 2.2e-16

#Model 3 extends drink.mod2 with green.tea as a third predictor;
#update() refits the same lm() call with the enlarged formula
drink.mod3 <- update(drink.mod2, . ~ . + green.tea)
#summary of the multiple linear model
summary(drink.mod3)

## 
## Call:
## lm(formula = mineral.water ~ energy.drink + frozen.smoothie + 
##     green.tea, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7260 -0.0006 -0.0006 -0.0006 17.6014 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.6290809  0.2202096 -25.562  < 2e-16 ***
## energy.drink     0.0023084  0.0004775   4.834 1.36e-06 ***
## frozen.smoothie  6.5954883  0.2202465  29.946  < 2e-16 ***
## green.tea        0.0319061  0.0016728  19.074  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.536 on 7496 degrees of freedom
## Multiple R-squared:  0.176,  Adjusted R-squared:  0.1757 
## F-statistic: 533.7 on 3 and 7496 DF,  p-value: < 2.2e-16

#The p-values for the regression coefficients for the variables are nearly zero. 
#This implies statistical significance, which in turn mean that 
#there is a relationship.

#This may be checked using ANOVA to check all the variables 
anova(drink.mod,drink.mod2)

## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7498 2554.7                                 
## 2   7497 2257.9  1    296.86 985.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#As you can see, the result shows a Df of 1 (indicating that the more 
#complex model has one additional parameter), and a very small p-value (< .001). 
#This means that adding the frozen smoothie to the model did lead to a significantly 
#improved fit over the model 1
anova(drink.mod2,drink.mod3)

## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink + frozen.smoothie
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7497 2257.9                                 
## 2   7496 2153.3  1    104.51 363.8 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#As you can see, the result shows a Df of 1 (indicating that the more complex 
#model has one additional parameter), and a very small p-value (< .001). 
anova(drink.mod,drink.mod3)

## Analysis of Variance Table
## 
## Model 1: mineral.water ~ energy.drink
## Model 2: mineral.water ~ energy.drink + frozen.smoothie + green.tea
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   7498 2554.7                                 
## 2   7496 2153.3  2    401.37 698.6 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#As you can see, the result shows a Df of 2 (indicating that the more 
#complex model has two additional parameters), and a very small p-value (< .001). 
#This means that adding frozen.smoothie and green.tea to the model did lead to a 
#significantly improved fit over model 1

mini_project2.R

arnabchakraboty

2020-04-09