# Exploration of data
s <- read.csv("StarbucksPrepaid.csv")
str(s)
## 'data.frame': 25 obs. of 5 variables:
## $ Amount: int 5 25 10 5 15 50 10 15 5 5 ...
## $ Age : int 25 30 27 42 29 25 50 45 32 23 ...
## $ Days : int 4 12 10 8 11 12 8 6 16 10 ...
## $ Cups : int 1 5 4 5 8 5 3 5 7 1 ...
## $ Income: int 20 35 30 30 25 60 30 35 25 20 ...
attach(s)
cor(s)
## Amount Age Days Cups Income
## Amount 1.0000000 0.21514123 0.40686371 0.2862270 0.8500323
## Age 0.2151412 1.00000000 0.03745681 0.2682888 0.1778494
## Days 0.4068637 0.03745681 1.00000000 0.5876009 0.3054375
## Cups 0.2862270 0.26828876 0.58760095 1.0000000 0.1594511
## Income 0.8500323 0.17784941 0.30543755 0.1594511 1.0000000
plot(s)
# Using backward elimination
fit0 <- lm(Amount ~ ., data = s)
summary(fit0)
##
## Call:
## lm(formula = Amount ~ ., data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.874 -9.947 -2.351 7.332 56.872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -83.8257 22.4944 -3.727 0.00133 **
## Age 0.2369 0.5759 0.411 0.68515
## Days 1.1897 1.4739 0.807 0.42909
## Cups 1.4216 2.6310 0.540 0.59494
## Income 2.4065 0.3597 6.690 1.64e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.15 on 20 degrees of freedom
## Multiple R-squared: 0.7546, Adjusted R-squared: 0.7056
## F-statistic: 15.38 on 4 and 20 DF, p-value: 6.758e-06
# Income looks good from the correlation & Significance, lets remove Cups
fit1 <- lm(Amount ~ .-Cups, data = s)
summary(fit1)
##
## Call:
## lm(formula = Amount ~ . - Cups, data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.531 -8.484 -2.505 10.454 55.653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85.2168 21.9667 -3.879 0.000867 ***
## Age 0.3347 0.5375 0.623 0.540212
## Days 1.6645 1.1631 1.431 0.167113
## Income 2.3901 0.3523 6.784 1.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.77 on 21 degrees of freedom
## Multiple R-squared: 0.7511, Adjusted R-squared: 0.7155
## F-statistic: 21.12 on 3 and 21 DF, p-value: 1.517e-06
# R^2 and adjusted R^2 looks relatively constant.
# Still keeping income, remove Days
fit2 <- lm(Amount ~ Age + Income, data = s)
summary(fit2)
##
## Call:
## lm(formula = Amount ~ Age + Income, data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.128 -12.283 -0.679 7.113 56.193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -72.4912 20.5593 -3.526 0.0019 **
## Age 0.3208 0.5500 0.583 0.5656
## Income 2.5433 0.3436 7.402 2.09e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.28 on 22 degrees of freedom
## Multiple R-squared: 0.7268, Adjusted R-squared: 0.7019
## F-statistic: 29.26 on 2 and 22 DF, p-value: 6.333e-07
# R^2 and adjusted R^2 looks relatively constant.
# Only Income
fit3 <- lm(Amount ~ Income, data = s)
summary(fit3)
##
## Call:
## lm(formula = Amount ~ Income, data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.432 -9.063 0.937 3.832 56.989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -63.3057 13.0255 -4.860 6.60e-05 ***
## Income 2.5790 0.3332 7.739 7.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.96 on 23 degrees of freedom
## Multiple R-squared: 0.7226, Adjusted R-squared: 0.7105
## F-statistic: 59.9 on 1 and 23 DF, p-value: 7.534e-08
# R^2 and adjusted R^2 looks relatively constant.
# Let's try removing Income from the fit0 for experiment sake.
fit4 <- lm(Amount ~ . -Income, data = s)
summary(fit4)
##
## Call:
## lm(formula = Amount ~ . - Income, data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.234 -14.231 -6.114 8.314 144.015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.99529 38.09239 -1.155 0.261
## Age 0.97661 0.99250 0.984 0.336
## Days 4.08304 2.47435 1.650 0.114
## Cups -0.06594 4.60369 -0.014 0.989
##
## Residual standard error: 38.89 on 21 degrees of freedom
## Multiple R-squared: 0.2056, Adjusted R-squared: 0.09207
## F-statistic: 1.811 on 3 and 21 DF, p-value: 0.176
# R^2 and adjusted R^2 drops drastically.
# Sales (Amount) is evident from customer's income (Income).
par(mfcol = c(2,2))
plot(fit3)
# apart from observation row 15 (extreme outlier) and
# row 20 (potential outlier), the residual analysis look good.
detach(s)
# let's remove row 15 & 20
new.s <- s[-c(15,20),]
attach(new.s)
newfit3 <- lm(Amount ~ Income, data = new.s)
summary(newfit3)
##
## Call:
## lm(formula = Amount ~ Income, data = new.s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6605 -5.0698 -0.6605 3.2955 18.1581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.4276 5.1879 -4.130 0.000476 ***
## Income 1.2363 0.1442 8.574 2.66e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.909 on 21 degrees of freedom
## Multiple R-squared: 0.7778, Adjusted R-squared: 0.7672
## F-statistic: 73.51 on 1 and 21 DF, p-value: 2.665e-08
# R^2 and adjusted R^2 improved.
plot(newfit3)
detach(new.s)
Question 2:
attach(s)
fit5 <- lm(Days ~ . -Amount, data = s)
summary(fit5)
##
## Call:
## lm(formula = Days ~ . - Amount, data = s)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7442 -2.5016 -0.2539 2.4524 6.9865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.96836 3.06510 1.947 0.06501 .
## Age -0.07853 0.08352 -0.940 0.35777
## Cups 1.06441 0.31270 3.404 0.00267 **
## Income 0.07161 0.05091 1.407 0.17418
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.279 on 21 degrees of freedom
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.3324
## F-statistic: 4.984 on 3 and 21 DF, p-value: 0.009129
# low R^2 and adjusted R^2, model is not strong.
Question 3:
# Exploration of data
sG <- read.csv("StarbucksGrowth.csv")
sG
## Year Revenue Stores Drinks AveWeekEarnings
## 1 1 400 676 15 386
## 2 2 700 1015 15 394
## 3 3 1000 1412 18 407
## 4 4 1350 1886 22 425
## 5 5 1650 2135 27 442
## 6 6 2200 3300 27 457
## 7 7 2600 4709 30 474
cor(sG)
## Year Revenue Stores Drinks AveWeekEarnings
## Year 1.0000000 0.9945365 0.9509748 0.9762210 0.9957778
## Revenue 0.9945365 1.0000000 0.9749025 0.9639655 0.9956002
## Stores 0.9509748 0.9749025 1.0000000 0.9075303 0.9605431
## Drinks 0.9762210 0.9639655 0.9075303 1.0000000 0.9844740
## AveWeekEarnings 0.9957778 0.9956002 0.9605431 0.9844740 1.0000000
plot(sG)
attach(sG)
testfit0 <- lm(Stores~Drinks + AveWeekEarnings, data = sG)
library(car)
vif(testfit0)
## Drinks AveWeekEarnings
## 32.45607 32.45607
# stores is highly correlated with Drinks and AveWeekEarnings, VIF > 32
testfit1 <- lm(AveWeekEarnings ~ Stores + Drinks, data = sG)
vif(testfit1)
## Stores Drinks
## 5.669294 5.669294
# AveWeekEarnings is highly correlated with Drinks and AveWeekEarnings, VIF > 5
testfit2 <- lm(Drinks ~ AveWeekEarnings + Stores, data = sG)
vif(testfit2)
## AveWeekEarnings Stores
## 12.9271 12.9271
# Drinks is highly correlated with Drinks and AveWeekEarnings, VIF > 12
# using forward selection
fit0 <- lm(Revenue ~ AveWeekEarnings, data = sG)
summary(fit0)
##
## Call:
## lm(formula = Revenue ~ AveWeekEarnings, data = sG)
##
## Residuals:
## 1 2 3 4 5 6 7
## -42.32 65.35 52.81 -29.94 -138.65 50.73 42.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8837.698 432.614 -20.43 5.20e-06 ***
## AveWeekEarnings 24.042 1.012 23.76 2.46e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.62 on 5 degrees of freedom
## Multiple R-squared: 0.9912, Adjusted R-squared: 0.9895
## F-statistic: 564.5 on 1 and 5 DF, p-value: 2.46e-06
# R^2 and adjusted R^2 is very high ~0.99
fit1 <- lm(Revenue ~ AveWeekEarnings + Stores, data = sG)
summary(fit1)
##
## Call:
## lm(formula = Revenue ~ AveWeekEarnings + Stores, data = sG)
##
## Residuals:
## 1 2 3 4 5 6 7
## -66.4447 39.9068 46.0617 -0.5523 -48.2340 67.0008 -37.7382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.754e+03 1.079e+03 -6.261 0.00332 **
## AveWeekEarnings 1.847e+01 2.852e+00 6.477 0.00293 **
## Stores 1.354e-01 6.655e-02 2.035 0.11163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.97 on 4 degrees of freedom
## Multiple R-squared: 0.9957, Adjusted R-squared: 0.9935
## F-statistic: 461.5 on 2 and 4 DF, p-value: 1.862e-05
# R^2 and adjusted R^2 improved.
par(mfcol = c(2,2))
plot(fit1)
# residuals analysis indicates regression is poor.
fit2 <- lm(Revenue ~ AveWeekEarnings + Stores + Drinks, data = sG)
summary(fit2)
##
## Call:
## lm(formula = Revenue ~ AveWeekEarnings + Stores + Drinks, data = sG)
##
## Residuals:
## 1 2 3 4 5 6 7
## -3.6994 -6.6557 22.5877 -15.8762 3.9080 -0.1463 -0.1180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.350e+04 9.462e+02 -14.268 0.000746 ***
## AveWeekEarnings 3.899e+01 2.847e+00 13.696 0.000842 ***
## Stores -2.642e-02 2.777e-02 -0.952 0.411447
## Drinks -7.520e+01 1.007e+01 -7.468 0.004972 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.69 on 3 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9996
## F-statistic: 4539 on 3 and 3 DF, p-value: 5.549e-06
# R^2 and adjusted R^2 improved yet again.
plot(fit2)
# residuals analysis indicates regression is poor.
# perhaps lets remove AveWeekEarning as it is a form of revenue.
fit3 <- lm(Revenue ~ Stores + Drinks, data = sG)
summary(fit3)
##
## Call:
## lm(formula = Revenue ~ Stores + Drinks, data = sG)
##
## Residuals:
## 1 2 3 4 5 6 7
## -133.74 57.88 57.17 23.91 -45.33 132.19 -92.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -551.29356 258.05485 -2.136 0.0995 .
## Stores 0.31972 0.07936 4.029 0.0158 *
## Drinks 57.92642 18.16540 3.189 0.0333 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 115.2 on 4 degrees of freedom
## Multiple R-squared: 0.986, Adjusted R-squared: 0.979
## F-statistic: 140.9 on 2 and 4 DF, p-value: 0.0001958
plot(fit3)
# residuals analysis indicates regression is poor.
fit4 <- lm(Revenue ~ AveWeekEarnings + Drinks, data = sG)
summary(fit4)
##
## Call:
## lm(formula = Revenue ~ AveWeekEarnings + Drinks, data = sG)
##
## Residuals:
## 1 2 3 4 5 6 7
## -11.780 -3.966 24.408 -12.110 5.621 7.773 -9.946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12670.302 362.794 -34.92 4.01e-06 ***
## AveWeekEarnings 36.523 1.165 31.36 6.17e-06 ***
## Drinks -67.725 6.222 -10.88 0.000404 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.49 on 4 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9996
## F-statistic: 6973 on 2 and 4 DF, p-value: 8.223e-08
plot(fit4)
# residuals analysis indicates regression is poor.
par(mfcol = c(1,1))
detach(sG)