Introduction

Data preview

head(starbucks, 15) # Previews the first 15 rows (default is 6)
##                                item calories fat carb fiber protein   type
## 1                      8-Grain Roll      350   8   67     5      10 bakery
## 2                 Apple Bran Muffin      350   9   64     7       6 bakery
## 3                     Apple Fritter      420  20   59     0       5 bakery
## 4                   Banana Nut Loaf      490  19   75     4       7 bakery
## 5       Birthday Cake Mini Doughnut      130   6   17     0       0 bakery
## 6                 Blueberry Oat Bar      370  14   47     5       6 bakery
## 7                   Blueberry Scone      460  22   61     2       7 bakery
## 8        Bountiful Blueberry Muffin      370  14   55     0       6 bakery
## 9                 Butter Croissant       310  18   32     0       5 bakery
## 10                    Cheese Danish      420  25   39     0       7 bakery
## 11           Chocolate Chunk Cookie      380  17   51     2       4 bakery
## 12         Chocolate Cinnamon Bread      320  12   53     3       6 bakery
## 13              Chocolate Croissant      300  17   34     2       5 bakery
## 14 Chocolate Old-Fashioned Doughnut      420  21   57     2       5 bakery
## 15                     Chonga Bagel      310   5   52     3      12 bakery

Data structure

str(starbucks)
## 'data.frame':    77 obs. of  7 variables:
##  $ item    : chr  "8-Grain Roll" "Apple Bran Muffin" "Apple Fritter" "Banana Nut Loaf" ...
##  $ calories: int  350 350 420 490 130 370 460 370 310 420 ...
##  $ fat     : num  8 9 20 19 6 14 22 14 18 25 ...
##  $ carb    : int  67 64 59 75 17 47 61 55 32 39 ...
##  $ fiber   : int  5 7 0 4 0 5 2 0 0 0 ...
##  $ protein : int  10 6 5 7 0 6 7 6 5 7 ...
##  $ type    : chr  "bakery" "bakery" "bakery" "bakery" ...

Add new data for prediction

mcdonalds = data.frame(item=c('Big Mac','Cheeseburger','McDouble','Chocolate Chip Cookie'),
                     fat=c(28,11,17,7),
                     carb=c(45,33,34,22),
                     fiber=c(3,2,2,1),
                     protein=c(24,15,21,2),
                     type=c('sandwich','sandwich','sandwich','bakery'))
mcdonalds #print out the data to check
##                    item fat carb fiber protein     type
## 1               Big Mac  28   45     3      24 sandwich
## 2          Cheeseburger  11   33     2      15 sandwich
## 3              McDouble  17   34     2      21 sandwich
## 4 Chocolate Chip Cookie   7   22     1       2   bakery

Note: Make sure the var names in mcdonalds match the ones in the starbucks data

truth = c(520,290,370,160) 
truth #print out the data to check
## [1] 520 290 370 160

Fitting different models

mod = lm(calories~carb,data=starbucks) 
modlogy = lm(log(calories)~carb,data=starbucks) # log-transformed outcome variable 
modlogx = lm(calories~log(carb),data=starbucks) # log-transformed predictor variable 
modloglog= lm(log(calories)~log(carb), data=starbucks) # both predictor and outcome variables log-transformed 

Understanding output

names(mod) 
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
summary(mod)
## 
## Call:
## lm(formula = calories ~ carb, data = starbucks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -151.962  -70.556   -0.636   54.908  179.444 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 146.0204    25.9186   5.634 2.93e-07 ***
## carb          4.2971     0.5424   7.923 1.67e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.26 on 75 degrees of freedom
## Multiple R-squared:  0.4556, Adjusted R-squared:  0.4484 
## F-statistic: 62.77 on 1 and 75 DF,  p-value: 1.673e-11

Predicting using the models fitted

pred = predict(mod, newdata=mcdonalds, interval='prediction')
predlogy =exp(predict(modlogy, newdata=mcdonalds, interval='prediction'))
predlogx = predict(modlogx, newdata=mcdonalds, interval='prediction')
predloglog = exp(predict(modloglog, newdata=mcdonalds, interval='prediction'))

Understanding output

pred 
##        fit       lwr      upr
## 1 339.3892 182.47905 496.2994
## 2 287.8242 130.39085 445.2576
## 3 292.1213 134.77227 449.4703
## 4 240.5563  81.71246 399.4001

Comparing mean squared error (MSE) for each model

mean((pred[,1]-truth)^2)
## [1] 11294.85
mean((predlogy[,1]-truth)^2)
## [1] 13881.53
mean((predlogx[,1]-truth)^2)
## [1] 9023.895
mean((predloglog[,1]-truth)^2)
## [1] 11207.85