Data Cleaning

To Clean the data, I would check for missing values in the dataframe, in this example below, I notice there are missing values in the dataframe. So, what I did was use the mean value imputation for the “caffeine column”

#Checking for missing values in the dataframe
colSums(is.na(starbuck)) #Caffeine has a missing value so we will use mean imputation

##       Beverage_category                Beverage           Beverage_prep 
##                       0                       0                       0 
##                Calories           Total Fat (g)           Trans Fat (g) 
##                       0                       0                       0 
##       Saturated Fat (g)             Sodium (mg) Total Carbohydrates (g) 
##                       0                       0                       0 
##        Cholesterol (mg)       Dietary Fibre (g)              Sugars (g) 
##                       0                       0                       0 
##             Protein (g)        Vitamin A (% DV)        Vitamin C (% DV) 
##                       0                       0                       0 
##          Calcium (% DV)             Iron (% DV)           Caffeine (mg) 
##                       0                       0                       1

imputed_data <- impute(starbuck$`Caffeine (mg)`, fun=mean)

## Warning in mean.default(x[!m]): argument is not numeric or logical: returning
## NA

Notice that this column has “varies” in the vector. To clean that part up, I would just delete the rows since caffeine can vary. The drink could be low in caffeine or very high in caffeine

starbuck <- starbuck[!(starbuck$`Caffeine (mg)` == "varies"), ]
starbuck <- starbuck[!starbuck$`Caffeine (mg)` %in% c("Varies", "NA"), ]
starbuck$`Caffeine (mg)`

##   [1] "175" "260" "330" "410" "75"  "75"  "75"  "75"  "75"  "75"  "150" "150"
##  [13] "150" "150" "150" "150" "85"  "85"  "85"  "95"  "95"  "95"  "175" "175"
##  [25] "175" "180" "180" "180" "75"  "75"  "75"  "75"  "75"  "75"  "150" "150"
##  [37] "150" "150" "150" "150" "75"  "150" "225" "300" "75"  "75"  "75"  "75" 
##  [49] "75"  "75"  "150" "150" "150" "150" "150" "150" "75"  "150" "75"  "75" 
##  [61] "150" "150" "75"  "75"  "75"  "75"  "75"  "75"  "150" "150" "150" "150"
##  [73] "150" "150" "75"  "75"  "75"  "75"  "75"  "75"  "150" "150" "150" "150"
##  [85] "150" "150" "10"  "10"  "10"  "20"  "20"  "20"  "25"  "25"  "25"  "30" 
##  [97] "30"  "30"  "0"   "0"   "0"   "0"   "50"  "50"  "50"  "70"  "70"  "70" 
## [109] "95"  "95"  "95"  "120" "120" "120" "25"  "25"  "25"  "55"  "55"  "55" 
## [121] "80"  "80"  "80"  "110" "110" "110" "0"   "0"   "0"   "0"   "0"   "0"  
## [133] "0"   "0"   "0"   "0"   "0"   "0"   "120" "165" "235" "90"  NA    "90" 
## [145] "90"  "125" "125" "125" "170" "170" "170" "15"  "15"  "15"  "0"   "0"  
## [157] "0"   "0"   "0"   "0"   "70"  "70"  "70"  "95"  "95"  "95"  "130" "130"
## [169] "130" "70"  "70"  "110" "110" "110" "140" "140" "140" "70"  "70"  "70" 
## [181] "100" "100" "100" "130" "130" "130" "75"  "75"  "75"  "110" "110" "110"
## [193] "145" "145" "145" "70"  "95"  "120" "70"  "95"  "130" "65"  "90"  "120"
## [205] "70"  "105" "165" "0"   "0"   "0"   "0"   "0"   "0"   "0"   "0"   "0"  
## [217] "0"   "0"   "0"   "0"

starbuck$`Caffeine (mg)`[is.na(starbuck$`Caffeine (mg)`)] <- as.numeric(names(which.max(table(starbuck$`Caffeine (mg)`))))

Delete the Vitamin A, Vitamin C, Calcium, and Iron columns

drop<-c("Vitamin A (% DV)","Vitamin C (% DV)", "Calcium (% DV)", "Iron (% DV)")
starbuck = starbuck[,!(names(starbuck) %in% drop)]

Checking the type of variables, what category they fall into

lapply(starbuck, class)

## $Beverage_category
## [1] "character"
## 
## $Beverage
## [1] "character"
## 
## $Beverage_prep
## [1] "character"
## 
## $Calories
## [1] "numeric"
## 
## $`Total Fat (g)`
## [1] "numeric"
## 
## $`Trans Fat (g)`
## [1] "numeric"
## 
## $`Saturated Fat (g)`
## [1] "numeric"
## 
## $`Sodium (mg)`
## [1] "numeric"
## 
## $`Total Carbohydrates (g)`
## [1] "numeric"
## 
## $`Cholesterol (mg)`
## [1] "numeric"
## 
## $`Dietary Fibre (g)`
## [1] "numeric"
## 
## $`Sugars (g)`
## [1] "numeric"
## 
## $`Protein (g)`
## [1] "numeric"
## 
## $`Caffeine (mg)`
## [1] "character"

Since the caffeine column is character, i’ll convert to numeric

starbuck$`Caffeine (mg)`<-as.numeric(starbuck$`Caffeine (mg)`)

Replacing All NA’s using mean imputation

starbuck$Calories[is.na(starbuck$Calories)]<-mean(starbuck$Calories,na.rm=TRUE)
starbuck$`Total Fat (g)`[is.na(starbuck$`Total Fat (g)`)]<-mean(starbuck$`Total Fat (g)`,na.rm=TRUE)
starbuck$`Trans Fat (g)`[is.na(starbuck$`Trans Fat (g)`)]<-mean(starbuck$`Trans Fat (g)`,na.rm=TRUE)
starbuck$`Saturated Fat (g)`[is.na(starbuck$`Saturated Fat (g)`)]<-mean(starbuck$`Saturated Fat (g)`,na.rm=TRUE)
starbuck$`Sodium (mg)`[is.na(starbuck$`Sodium (mg)`)]<-mean(starbuck$`Sodium (mg)`,na.rm=TRUE)
starbuck$`Total Carbohydrates (g)`[is.na(starbuck$`Total Carbohydrates (g)`)]<-mean(starbuck$`Total Carbohydrates (g)`,na.rm=TRUE)
starbuck$`Cholesterol (mg)`[is.na(starbuck$`Cholesterol (mg)`)]<-mean(starbuck$`Cholesterol (mg)`,na.rm=TRUE)
starbuck$`Dietary Fibre (g)`[is.na(starbuck$`Dietary Fibre (g)`)]<-mean(starbuck$`Dietary Fibre (g)`,na.rm=TRUE)
starbuck$`Protein (g)`[is.na(starbuck$`Protein (g)`)]<-mean(starbuck$`Protein (g)`,na.rm=TRUE)
starbuck$`Sugars (g)`[is.na(starbuck$`Sugars (g)`)]<-mean(starbuck$`Sugars (g)`,na.rm=TRUE)

Now, there are no NA’s in the continous predictors. Caffeine has a missing value so we will use mean imputation

colSums(is.na(starbuck))

##       Beverage_category                Beverage           Beverage_prep 
##                       1                       1                       1 
##                Calories           Total Fat (g)           Trans Fat (g) 
##                       0                       0                       0 
##       Saturated Fat (g)             Sodium (mg) Total Carbohydrates (g) 
##                       0                       0                       0 
##        Cholesterol (mg)       Dietary Fibre (g)              Sugars (g) 
##                       0                       0                       0 
##             Protein (g)           Caffeine (mg) 
##                       0                       0

# Generate column plot
par(mfrow = c(1,2))

hist(starbuck$Calories,
     xlab = "Calories",
     main = "Frequency histogram")

hist(starbuck$'Saturated Fat (g)',
     xlab = "Weight",
     main = "Probability histogram",
     probability = T)

hist(starbuck$Calories, main='Histogram of Points', xlab='Points', col='steelblue', breaks=12)
hist(starbuck$`Total Fat (g)`, main='Histogram of Total Fat', xlab='Total Fat', col='steelblue', breaks=12)

as.numeric(as.character(starbuck$`Total Fat (g)`))

##   [1]  0.100000  0.100000  0.100000  0.100000  0.100000  3.500000  2.500000
##   [8]  0.200000  6.000000  4.500000  0.300000  7.000000  5.000000  0.400000
##  [15]  9.000000  7.000000  1.500000  4.000000  3.500000  2.000000  6.000000
##  [22]  5.000000  2.500000  8.000000  7.000000  3.000000 11.000000  9.000000
##  [29]  0.100000  3.500000  2.500000  0.200000  5.000000  4.000000  0.300000
##  [36]  6.000000  5.000000  0.300000  9.000000  7.000000  0.000000  0.000000
##  [43]  0.000000  0.000000  0.100000  3.000000  1.500000  0.100000  3.500000
##  [50]  3.000000  0.200000  4.000000  3.500000  0.200000  6.000000  4.500000
##  [57]  0.000000  0.000000  0.100000  0.200000  0.300000  0.300000  1.000000
##  [64]  4.000000  3.000000  1.000000  5.000000  4.500000  1.000000  7.000000
##  [71]  5.000000  1.000000  8.000000  7.000000  3.000000  6.000000  5.000000
##  [78]  4.500000  9.000000  8.000000  6.000000 11.000000 10.000000  7.000000
##  [85] 15.000000 13.000000  1.500000  4.500000  3.500000  2.000000  7.000000
##  [92]  6.000000  2.500000  9.000000  7.000000  3.000000 11.000000  9.000000
##  [99]  0.000000  0.000000  0.000000  0.000000  0.100000  2.000000  1.500000
## [106]  0.200000  3.500000  2.500000  0.200000  4.500000  3.500000  0.300000
## [113]  6.000000  4.500000  0.200000  4.000000  3.000000  0.400000  6.000000
## [120]  4.500000  0.500000  8.000000  6.000000  0.500000 10.000000  8.000000
## [127]  0.100000  2.000000  1.500000  0.100000  3.000000  2.500000  0.200000
## [134]  4.000000  3.500000  0.200000  5.000000  4.000000  0.000000  0.100000
## [141]  0.100000  0.100000  3.074886  1.000000  0.100000  1.500000  1.000000
## [148]  0.100000  2.000000  1.500000  0.000000  5.000000  4.500000  1.000000
## [155]  1.500000  1.500000  1.000000  2.000000  2.000000  0.100000  2.500000
## [162]  1.500000  0.100000  3.000000  1.500000  0.100000  5.000000  2.500000
## [169]  0.500000  3.000000  2.000000  1.000000  4.000000  2.500000  1.000000
## [176]  6.000000  3.000000  0.100000  2.500000  1.500000  0.100000  3.500000
## [183]  1.500000  0.100000  5.000000  2.500000  3.000000  5.000000  4.000000
## [190]  4.000000  7.000000  6.000000  5.000000 10.000000  8.000000  0.100000
## [197]  0.100000  0.100000  0.500000  1.000000  1.000000  0.100000  0.100000
## [204]  0.100000  3.000000  4.000000  5.000000  0.100000  3.000000  1.500000
## [211]  0.200000  4.000000  2.000000  0.200000  6.000000  3.200000  0.100000
## [218]  3.500000  1.500000  0.100000

# Trans fat is the bad column that won't plot


hist(starbuck$`Saturated Fat (g)`, main='Histogram of Saturated Fats', xlab='Saturated fat concentration', col='steelblue', breaks=12)
hist(starbuck$`Sodium (mg)`, main='Histogram of sodium', xlab='sodium', col='steelblue', breaks=12)

#hist(starbuck$`Total Carbohydrates (g) `, main='XXX', xlab='XXX', col='steelblue', breaks=12)
hist(starbuck$`Cholesterol (mg)`, main='Histogram of cholesterol', xlab='Cholesterol concentration', col='steelblue', breaks=12)
hist(starbuck$`Dietary Fibre (g)`, main='Histogram of fiber', xlab='fiber (g)', col='steelblue', breaks=12)

hist(starbuck$`Sugars (g)`, main='Histogram of sugar', xlab='sugar (g)', col='steelblue', breaks=12)
hist(starbuck$`Protein (g)`, main='Histogram of protein', xlab='protein (g)', col='steelblue', breaks=12)

hist(starbuck$`Caffeine (mg)`, main='Histogram of caffeine', xlab='Caffeine (mg) concentration', col='steelblue', breaks=12)

Which drinks have the most sugar, caffeine, and calories?

Calories

As we can see, The signature espreso drinks as well as the frappuchino drinks have the most calories. It is obvious as they put a lot of sugar and milk into such drinks. The highest is about 510 calories and that goes to the espresso mocha.

#Calories
top_cals<-top_n(starbuck,7,Calories)
top_cals

## # A tibble: 7 × 14
##   Beverage_category            Beverage   Beverage_prep Calories `Total Fat (g)`
##   <chr>                        <chr>      <chr>            <dbl>           <dbl>
## 1 Signature Espresso Drinks    White Cho… Venti Nonfat…      450               7
## 2 Signature Espresso Drinks    White Cho… 2% Milk            510              15
## 3 Signature Espresso Drinks    White Cho… Soymilk            460              13
## 4 TazoÂ® Tea Drinks            TazoÂ® Gr… 2% Milk            450              10
## 5 FrappuccinoÂ® Blended Coffee Java Chip… Venti Nonfat…      420               5
## 6 FrappuccinoÂ® Blended Coffee Java Chip… Whole Milk         460              10
## 7 FrappuccinoÂ® Blended Coffee Java Chip… Soymilk            430               8
## # ℹ 9 more variables: `Trans Fat (g)` <dbl>, `Saturated Fat (g)` <dbl>,
## #   `Sodium (mg)` <dbl>, `Total Carbohydrates (g)` <dbl>,
## #   `Cholesterol (mg)` <dbl>, `Dietary Fibre (g)` <dbl>, `Sugars (g)` <dbl>,
## #   `Protein (g)` <dbl>, `Caffeine (mg)` <dbl>

top_cals$Calories

## [1] 450 510 460 450 420 460 430

Caffeine

The regular coffee drinks (brewed coffee and/or espresso shot drinks) are the ones that have the most caffeine where 410 is the highest amount of caffeine out of any drink at starbucks.

top_caff <- starbuck %>% top_n(5, `Caffeine (mg)`)
top_caff

## # A tibble: 5 × 14
##   Beverage_category       Beverage        Beverage_prep Calories `Total Fat (g)`
##   <chr>                   <chr>           <chr>            <dbl>           <dbl>
## 1 Coffee                  Brewed Coffee   Tall                 4             0.1
## 2 Coffee                  Brewed Coffee   Grande               5             0.1
## 3 Coffee                  Brewed Coffee   Venti                5             0.1
## 4 Classic Espresso Drinks CaffÃ¨ America… Venti               25             0  
## 5 Shaken Iced Beverages   Iced Brewed Co… Venti              130             0.1
## # ℹ 9 more variables: `Trans Fat (g)` <dbl>, `Saturated Fat (g)` <dbl>,
## #   `Sodium (mg)` <dbl>, `Total Carbohydrates (g)` <dbl>,
## #   `Cholesterol (mg)` <dbl>, `Dietary Fibre (g)` <dbl>, `Sugars (g)` <dbl>,
## #   `Protein (g)` <dbl>, `Caffeine (mg)` <dbl>

top_caff$`Caffeine (mg)`

## [1] 260 330 410 300 235

Sugar

Similar to Calories, the obvious frappuccino and espresso drinks have the most stugar with the caramel frappuccino having the most sugar (about 77 to 80 grams)

top_sug<-top_n(starbuck,5,`Sugars (g)`)
top_sug

## # A tibble: 6 × 14
##   Beverage_category            Beverage   Beverage_prep Calories `Total Fat (g)`
##   <chr>                        <chr>      <chr>            <dbl>           <dbl>
## 1 Signature Espresso Drinks    Caramel A… Venti              360             0  
## 2 FrappuccinoÂ® Blended Coffee Caramel (… Venti Nonfat…      330             0.1
## 3 FrappuccinoÂ® Blended Coffee Caramel (… Whole Milk         370             5  
## 4 FrappuccinoÂ® Blended Coffee Java Chip… Venti Nonfat…      420             5  
## 5 FrappuccinoÂ® Blended Coffee Java Chip… Whole Milk         460            10  
## 6 FrappuccinoÂ® Blended Coffee Java Chip… Soymilk            430             8  
## # ℹ 9 more variables: `Trans Fat (g)` <dbl>, `Saturated Fat (g)` <dbl>,
## #   `Sodium (mg)` <dbl>, `Total Carbohydrates (g)` <dbl>,
## #   `Cholesterol (mg)` <dbl>, `Dietary Fibre (g)` <dbl>, `Sugars (g)` <dbl>,
## #   `Protein (g)` <dbl>, `Caffeine (mg)` <dbl>

top_sug$`Sugars (g)`

## [1] 83 77 77 84 84 80

GGPLOTS

You can also embed plots, for example:

##  [1] "Coffee"                             "Classic Espresso Drinks"           
##  [3] "Signature Espresso Drinks"          "TazoÂ® Tea Drinks"                 
##  [5] "Shaken Iced Beverages"              NA                                  
##  [7] "Smoothies"                          "FrappuccinoÂ® Blended Coffee"      
##  [9] "FrappuccinoÂ® Light Blended Coffee" "FrappuccinoÂ® Blended CrÃ¨me"

Implementing a decision tree

#Implementing a decision tree

#Split data into 80 training and 20 testing
set.seed(333333)
sample <- sample(c(TRUE, FALSE), nrow(starbuck), replace=TRUE, prob=c(0.8,0.2))
train<- starbuck[sample,]
test<- starbuck[!sample,]

#Fitting the full regression with RSS splitting
library(rpart)

regression_tree<- rpart(Calories~`Trans Fat (g)`+Beverage_category
                        +Beverage_prep+`Total Fat (g)`+`Trans Fat (g)`+`Saturated Fat (g)`
                        + `Sodium (mg)`+`Total Carbohydrates (g)`+`Cholesterol (mg)`
                        +`Dietary Fibre (g)`+`Sugars (g)`+`Protein (g)`
                        +`Caffeine (mg)`, data=train, method="anova",
            xval=10, cp=0) #xval=number of cross-validations
printcp(regression_tree)

## 
## Regression tree:
## rpart(formula = Calories ~ `Trans Fat (g)` + Beverage_category + 
##     Beverage_prep + `Total Fat (g)` + `Trans Fat (g)` + `Saturated Fat (g)` + 
##     `Sodium (mg)` + `Total Carbohydrates (g)` + `Cholesterol (mg)` + 
##     `Dietary Fibre (g)` + `Sugars (g)` + `Protein (g)` + `Caffeine (mg)`, 
##     data = train, method = "anova", xval = 10, cp = 0)
## 
## Variables actually used in tree construction:
## [1] Cholesterol (mg) Protein (g)      Total Fat (g)    Trans Fat (g)   
## 
## Root node error: 1921594/180 = 10676
## 
## n= 180 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.5954684      0  1.000000 1.00909 0.103525
## 2  0.1189639      1  0.404532 0.42958 0.045698
## 3  0.0980527      2  0.285568 0.32016 0.041311
## 4  0.0355660      3  0.187515 0.22911 0.025358
## 5  0.0331526      4  0.151949 0.20916 0.025635
## 6  0.0209616      5  0.118796 0.17739 0.021411
## 7  0.0166832      6  0.097835 0.15701 0.019080
## 8  0.0081391      7  0.081152 0.14347 0.018135
## 9  0.0077335      8  0.073012 0.14351 0.017251
## 10 0.0052884      9  0.065279 0.13448 0.016054
## 11 0.0038764     10  0.059990 0.12324 0.015786
## 12 0.0027848     11  0.056114 0.11667 0.015735
## 13 0.0000000     12  0.053329 0.11082 0.015583

#Fitting the regression tree with rss splitting and cost-complexity pruning
reg.tree.rss<-rpart(Calories~`Trans Fat (g)`+Beverage_category
            +Beverage_prep+`Total Fat (g)`+`Trans Fat (g)`+`Saturated Fat (g)`
            + `Sodium (mg)`+`Total Carbohydrates (g)`+`Cholesterol (mg)`
            +`Dietary Fibre (g)`+`Sugars (g)`+`Protein (g)`
            +`Caffeine (mg)`, data=train, method="anova",
            cp=0.026) #xval=number of cross-validations
rpart.plot(reg.tree.rss, type=2)

plotcp(reg.tree.rss)

p <- predict(reg.tree.rss, train)
sqrt(mean((train$Calories-p)^2))

## [1] 35.61198

(cor(train$Calories,p))^2 #Not bad

## [1] 0.8812036

Computing prediction accuracy for testing data

calories_value<-predict(reg.tree.rss,newdata=test)

#Accuracy within 10%
accuracy10<-ifelse(abs(test$Calories-calories_value)<0.10*test$Calories,1,0)
print(mean(accuracy10))

## [1] 0.4

#Accuracy within 15%
accuracy15<-ifelse(abs(test$Calories-calories_value)<0.15*test$Calories,1,0)
print(mean(accuracy15))

## [1] 0.45

#Accuracy within 20%
accuracy20<-ifelse(abs(test$Calories-calories_value)<0.20*test$Calories,1,0)
print(mean(accuracy20))

## [1] 0.575

Prediction of Calorie content: lets create a model that predicts the calories of the beverages based on the nutritional attributes

testing.lm.no.bev.category<-lm(Calories~ `Trans Fat (g)`
                     +`Total Fat (g)`+`Trans Fat (g)`+`Saturated Fat (g)`
                     + `Sodium (mg)`+`Total Carbohydrates (g)`+`Cholesterol (mg)`
                     +`Dietary Fibre (g)`+`Sugars (g)`+`Protein (g)`
                     +`Caffeine (mg)`,data=starbuck)
summary(testing.lm.no.bev.category)

## 
## Call:
## lm(formula = Calories ~ `Trans Fat (g)` + `Total Fat (g)` + `Trans Fat (g)` + 
##     `Saturated Fat (g)` + `Sodium (mg)` + `Total Carbohydrates (g)` + 
##     `Cholesterol (mg)` + `Dietary Fibre (g)` + `Sugars (g)` + 
##     `Protein (g)` + `Caffeine (mg)`, data = starbuck)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4205  -3.1591   0.1326   3.9047  22.2421 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.652942   1.380978   0.473    0.637    
## `Trans Fat (g)`            -4.010404   0.802342  -4.998 1.22e-06 ***
## `Total Fat (g)`            10.701136   0.424421  25.214  < 2e-16 ***
## `Saturated Fat (g)`       -17.564298  24.375484  -0.721    0.472    
## `Sodium (mg)`               0.084362   0.226174   0.373    0.710    
## `Total Carbohydrates (g)`   0.002590   0.010354   0.250    0.803    
## `Cholesterol (mg)`          1.459109   0.359711   4.056 7.04e-05 ***
## `Dietary Fibre (g)`         1.047705   0.905380   1.157    0.249    
## `Sugars (g)`                2.556707   0.365633   6.993 3.58e-11 ***
## `Protein (g)`               4.125523   0.214781  19.208  < 2e-16 ***
## `Caffeine (mg)`             0.009717   0.007750   1.254    0.211    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.822 on 209 degrees of freedom
## Multiple R-squared:  0.9957, Adjusted R-squared:  0.9955 
## F-statistic:  4886 on 10 and 209 DF,  p-value: < 2.2e-16

testing.lm.all.variables<-lm(Calories~.,data=starbuck)
summary(testing.lm.all.variables)

## 
## Call:
## lm(formula = Calories ~ ., data = starbuck)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4677 -2.5853 -0.1322  3.0578 22.3296 
## 
## Coefficients: (9 not defined because of singularities)
##                                                            Estimate Std. Error
## (Intercept)                                                6.567237   2.497415
## Beverage_categoryCoffee                                  -14.469504   7.230937
## Beverage_categoryFrappuccinoÂ® Blended Coffee            -14.103063   3.638734
## Beverage_categoryFrappuccinoÂ® Blended CrÃ¨me             -4.066706   3.895391
## Beverage_categoryFrappuccinoÂ® Light Blended Coffee      -12.709313   5.011721
## Beverage_categoryShaken Iced Beverages                   -11.515929   8.290322
## Beverage_categorySignature Espresso Drinks                -2.562555   3.680371
## Beverage_categorySmoothies                                16.886041  10.567195
## Beverage_categoryTazoÂ® Tea Drinks                         0.699914   2.559716
## BeverageBrewed Coffee                                            NA         NA
## BeverageCaffÃ¨ Americano                                  -4.650283   5.856240
## BeverageCaffÃ¨ Latte                                       2.421215   2.516571
## BeverageCaffÃ¨ Mocha (Without Whipped Cream)             -13.014020   3.180541
## BeverageCappuccino                                        -2.484173   2.165157
## BeverageCaramel                                            5.878569   4.486360
## BeverageCaramel (Without Whipped Cream)                    5.520258   2.454053
## BeverageCaramel Apple Spice (Without Whipped Cream)        3.381636   8.307537
## BeverageCaramel Macchiato                                  2.629817   3.391630
## BeverageCoffee                                             6.409524   2.541778
## BeverageEspresso                                          -7.016961   5.244654
## BeverageHot Chocolate (Without Whipped Cream)            -11.422214   4.153804
## BeverageIced Brewed Coffee (With Classic Syrup)            0.566131   5.639929
## BeverageIced Brewed Coffee (With Milk & Classic Syrup)     1.389709   6.838224
## BeverageJava Chip                                          1.071650   4.817180
## BeverageJava Chip (Without Whipped Cream)                  0.393592   3.166319
## BeverageMocha                                              4.202803   4.321297
## BeverageMocha (Without Whipped Cream)                            NA         NA
## BeverageOrange Mango Banana Smoothie                       3.990424   4.528928
## BeverageShaken Iced TazoÂ® Tea (With Classic Syrup)              NA         NA
## BeverageSkinny Latte (Any Flavour)                         1.883665   3.326735
## BeverageStrawberries & CrÃ¨me (Without Whipped Cream)     -1.085203   2.835004
## BeverageStrawberry Banana Smoothie                         6.119981   4.573143
## BeverageTazoÂ® Chai Tea Latte                             -6.954146   2.892406
## BeverageTazoÂ® Full-Leaf Red Tea Latte (Vanilla Rooibos)  -2.997075   2.359801
## BeverageTazoÂ® Green Tea Latte                                   NA         NA
## BeverageVanilla Bean (Without Whipped Cream)                     NA         NA
## BeverageVanilla Latte (Or Other Flavoured Latte)                 NA         NA
## BeverageWhite Chocolate Mocha (Without Whipped Cream)            NA         NA
## Beverage_prepDoppio                                       -1.122542   6.563212
## Beverage_prepGrande                                       -1.160197   3.475251
## Beverage_prepGrande Nonfat Milk                            1.128095   2.469494
## Beverage_prepShort                                        -1.537238   4.934492
## Beverage_prepShort Nonfat Milk                            -1.779256   2.275787
## Beverage_prepSolo                                                NA         NA
## Beverage_prepSoymilk                                      -0.250029   2.063162
## Beverage_prepTall                                         -2.050423   3.984348
## Beverage_prepTall Nonfat Milk                             -0.134932   2.334350
## Beverage_prepVenti                                               NA         NA
## Beverage_prepVenti Nonfat Milk                             3.114194   2.975403
## Beverage_prepWhole Milk                                    0.992856   1.848747
## `Total Fat (g)`                                           11.126722   0.655794
## `Trans Fat (g)`                                           -2.405652   1.257778
## `Saturated Fat (g)`                                       -8.197054  19.757156
## `Sodium (mg)`                                             -0.240048   0.197710
## `Total Carbohydrates (g)`                                 -0.008307   0.027808
## `Cholesterol (mg)`                                         1.555906   0.527579
## `Dietary Fibre (g)`                                       -0.322717   1.226310
## `Sugars (g)`                                               2.528153   0.538311
## `Protein (g)`                                              3.405624   0.328485
## `Caffeine (mg)`                                            0.033754   0.017445
##                                                          t value Pr(>|t|)    
## (Intercept)                                                2.630 0.009342 ** 
## Beverage_categoryCoffee                                   -2.001 0.046997 *  
## Beverage_categoryFrappuccinoÂ® Blended Coffee             -3.876 0.000152 ***
## Beverage_categoryFrappuccinoÂ® Blended CrÃ¨me             -1.044 0.297995    
## Beverage_categoryFrappuccinoÂ® Light Blended Coffee       -2.536 0.012127 *  
## Beverage_categoryShaken Iced Beverages                    -1.389 0.166647    
## Beverage_categorySignature Espresso Drinks                -0.696 0.487218    
## Beverage_categorySmoothies                                 1.598 0.111929    
## Beverage_categoryTazoÂ® Tea Drinks                         0.273 0.784855    
## BeverageBrewed Coffee                                         NA       NA    
## BeverageCaffÃ¨ Americano                                  -0.794 0.428273    
## BeverageCaffÃ¨ Latte                                       0.962 0.337378    
## BeverageCaffÃ¨ Mocha (Without Whipped Cream)              -4.092 6.63e-05 ***
## BeverageCappuccino                                        -1.147 0.252872    
## BeverageCaramel                                            1.310 0.191876    
## BeverageCaramel (Without Whipped Cream)                    2.249 0.025783 *  
## BeverageCaramel Apple Spice (Without Whipped Cream)        0.407 0.684485    
## BeverageCaramel Macchiato                                  0.775 0.439202    
## BeverageCoffee                                             2.522 0.012611 *  
## BeverageEspresso                                          -1.338 0.182729    
## BeverageHot Chocolate (Without Whipped Cream)             -2.750 0.006616 ** 
## BeverageIced Brewed Coffee (With Classic Syrup)            0.100 0.920163    
## BeverageIced Brewed Coffee (With Milk & Classic Syrup)     0.203 0.839204    
## BeverageJava Chip                                          0.222 0.824223    
## BeverageJava Chip (Without Whipped Cream)                  0.124 0.901222    
## BeverageMocha                                              0.973 0.332161    
## BeverageMocha (Without Whipped Cream)                         NA       NA    
## BeverageOrange Mango Banana Smoothie                       0.881 0.379524    
## BeverageShaken Iced TazoÂ® Tea (With Classic Syrup)           NA       NA    
## BeverageSkinny Latte (Any Flavour)                         0.566 0.572000    
## BeverageStrawberries & CrÃ¨me (Without Whipped Cream)     -0.383 0.702361    
## BeverageStrawberry Banana Smoothie                         1.338 0.182625    
## BeverageTazoÂ® Chai Tea Latte                             -2.404 0.017292 *  
## BeverageTazoÂ® Full-Leaf Red Tea Latte (Vanilla Rooibos)  -1.270 0.205821    
## BeverageTazoÂ® Green Tea Latte                                NA       NA    
## BeverageVanilla Bean (Without Whipped Cream)                  NA       NA    
## BeverageVanilla Latte (Or Other Flavoured Latte)              NA       NA    
## BeverageWhite Chocolate Mocha (Without Whipped Cream)         NA       NA    
## Beverage_prepDoppio                                       -0.171 0.864402    
## Beverage_prepGrande                                       -0.334 0.738912    
## Beverage_prepGrande Nonfat Milk                            0.457 0.648396    
## Beverage_prepShort                                        -0.312 0.755785    
## Beverage_prepShort Nonfat Milk                            -0.782 0.435421    
## Beverage_prepSolo                                             NA       NA    
## Beverage_prepSoymilk                                      -0.121 0.903688    
## Beverage_prepTall                                         -0.515 0.607495    
## Beverage_prepTall Nonfat Milk                             -0.058 0.953974    
## Beverage_prepVenti                                            NA       NA    
## Beverage_prepVenti Nonfat Milk                             1.047 0.296766    
## Beverage_prepWhole Milk                                    0.537 0.591949    
## `Total Fat (g)`                                           16.967  < 2e-16 ***
## `Trans Fat (g)`                                           -1.913 0.057498 .  
## `Saturated Fat (g)`                                       -0.415 0.678751    
## `Sodium (mg)`                                             -1.214 0.226397    
## `Total Carbohydrates (g)`                                 -0.299 0.765514    
## `Cholesterol (mg)`                                         2.949 0.003641 ** 
## `Dietary Fibre (g)`                                       -0.263 0.792749    
## `Sugars (g)`                                               4.696 5.48e-06 ***
## `Protein (g)`                                             10.368  < 2e-16 ***
## `Caffeine (mg)`                                            1.935 0.054682 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.553 on 168 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9985, Adjusted R-squared:  0.998 
## F-statistic:  2200 on 50 and 168 DF,  p-value: < 2.2e-16

Is there a correlation between size of beverage and its nutritional values, such as calorie of sugar

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.3.2

corr_data=cor(starbuck[, c('Calories','Saturated Fat (g)','Sugars (g)','Caffeine (mg)',
                           'Total Fat (g)','Sodium (mg)','Total Carbohydrates (g)','Protein (g)')])
ggcorrplot(corr_data)

# Argument colors
ggcorrplot(corr_data, hc.order = TRUE, type = "lower",
           outline.col = "white",
           ggtheme = ggplot2::theme_gray,
           colors = c("#6D9EC1", "white", "#E46726"))

Classifying if a drink is healthy or not

Note: This is somewhat subjective depending on the person but for my case

I would say a drink is unhealthy if it has over 200 calories or over 25 gram of sugar

#1 is healthy and 0 is unhealthy, for reference
starbuck$healthy <- ifelse(starbuck$Calories>200 | starbuck$`Sugars (g)`>25, 0, 1)
head(starbuck)

## # A tibble: 6 × 15
##   Beverage_category       Beverage      Beverage_prep   Calories `Total Fat (g)`
##   <chr>                   <chr>         <chr>              <dbl>           <dbl>
## 1 Coffee                  Brewed Coffee Short                  3             0.1
## 2 Coffee                  Brewed Coffee Tall                   4             0.1
## 3 Coffee                  Brewed Coffee Grande                 5             0.1
## 4 Coffee                  Brewed Coffee Venti                  5             0.1
## 5 Classic Espresso Drinks CaffÃ¨ Latte  Short Nonfat M…       70             0.1
## 6 Classic Espresso Drinks CaffÃ¨ Latte  2% Milk              100             3.5
## # ℹ 10 more variables: `Trans Fat (g)` <dbl>, `Saturated Fat (g)` <dbl>,
## #   `Sodium (mg)` <dbl>, `Total Carbohydrates (g)` <dbl>,
## #   `Cholesterol (mg)` <dbl>, `Dietary Fibre (g)` <dbl>, `Sugars (g)` <dbl>,
## #   `Protein (g)` <dbl>, `Caffeine (mg)` <dbl>, healthy <dbl>

#From the linear regression output, the very 5 significant predictors were
#Trans fat, total fat, cholesterol, sugars, and protein so we will analyze
#its distribution as a continous independent variable with respect to healthy
#or not healthy

Comparing the box and whisker distribution of Healthy and unhealthy patients on the 5 important predictors

par(mfrow = c(3,2))
boxplot(`Total Fat (g)`~healthy, ylab="Total Fat (g)", xlab= "Healthy", col="light blue",data = starbuck)
boxplot(`Trans Fat (g)`~healthy, ylab="Trans Fat (g)", xlab= "Healthy", col="light blue",data = starbuck)
boxplot(`Cholesterol (mg)`~healthy, ylab="Cholesterol (mg)", xlab= "Healthy", col="light blue",data = starbuck)
boxplot(`Sugars (g)`~healthy, ylab="Sugars (g)", xlab= "Healthy", col="light blue",data = starbuck)
boxplot(`Protein (g)`~healthy, ylab="Protein (g)", xlab= "Healthy", col="light blue",data = starbuck)

#SPLITTING DATA INTO 80% TRAINING AND 20% TESTING SETS
library(tidyverse)
library(keras)

## Warning: package 'keras' was built under R version 4.3.2

## 
## Attaching package: 'keras'

## The following object is masked from 'package:party':
## 
##     fit

## The following object is masked from 'package:modeltools':
## 
##     fit

library(xgboost)

## Warning: package 'xgboost' was built under R version 4.3.1

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

set.seed(5464654)
indexing <- sample(c(TRUE, FALSE), nrow(starbuck), replace=TRUE, prob=c(0.8,0.2))
training<- starbuck[indexing,]
testing<- starbuck[!indexing,]

#Delete the non-numeric columns
training <- training[, -(1:3)]
testing<-testing[, -(1:3)]

healthy1<-nrow(training)
healthy2<-nrow(testing)
X_train<-training[, -ncol(training)]
y_train<-training[,ncol(training)]

X_test<-testing[, -ncol(testing)]
y_test<-testing[,ncol(testing)]

#Converting x and y train/test to matrix to satisfy xgboost algorithm
X_train<-as.matrix(X_train)
y_train<-as.matrix(y_train)
X_test<-as.matrix(X_test)
y_test<-as.matrix(y_test)

# X_train <- training %>% 
#   select(-healthy) %>% 
#   scale()

# #Convert the training$healthy to factor
# factor_vector_train <- factor(training$healthy)
# y_train <- factor_vector_train
# 
# #Performing a transformation on the test set and test set labels
# X_test <- testing %>% 
#   select(-healthy) %>% 
#   scale()
# factor_vector_test <- factor(testing$healthy)
# y_test <- factor_vector_test

#FITTING EXTREME GRADIENT BOOSTED REGRESSION TREE
xgb.reg<- xgboost(data=X_train, label=y_train, max.depth=6, eta=0.01,
                  subsample=0.8, colsample_bytree=0.5, nrounds=100, objective="reg:linear")

## [17:41:38] WARNING: src/objective/regression_obj.cu:213: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:0.495336 
## [2]  train-rmse:0.490467 
## [3]  train-rmse:0.485648 
## [4]  train-rmse:0.482478 
## [5]  train-rmse:0.477739 
## [6]  train-rmse:0.473092 
## [7]  train-rmse:0.468727 
## [8]  train-rmse:0.464408 
## [9]  train-rmse:0.459848 
## [10] train-rmse:0.455860 
## [11] train-rmse:0.451677 
## [12] train-rmse:0.447553 
## [13] train-rmse:0.444567 
## [14] train-rmse:0.440405 
## [15] train-rmse:0.436079 
## [16] train-rmse:0.431799 
## [17] train-rmse:0.427603 
## [18] train-rmse:0.423615 
## [19] train-rmse:0.419461 
## [20] train-rmse:0.415383 
## [21] train-rmse:0.413481 
## [22] train-rmse:0.409678 
## [23] train-rmse:0.407962 
## [24] train-rmse:0.403957 
## [25] train-rmse:0.401357 
## [26] train-rmse:0.397655 
## [27] train-rmse:0.393758 
## [28] train-rmse:0.389896 
## [29] train-rmse:0.386110 
## [30] train-rmse:0.382319 
## [31] train-rmse:0.378574 
## [32] train-rmse:0.374852 
## [33] train-rmse:0.371176 
## [34] train-rmse:0.367533 
## [35] train-rmse:0.363934 
## [36] train-rmse:0.360517 
## [37] train-rmse:0.357096 
## [38] train-rmse:0.353764 
## [39] train-rmse:0.352232 
## [40] train-rmse:0.348980 
## [41] train-rmse:0.345555 
## [42] train-rmse:0.342174 
## [43] train-rmse:0.339241 
## [44] train-rmse:0.336012 
## [45] train-rmse:0.332719 
## [46] train-rmse:0.330280 
## [47] train-rmse:0.327203 
## [48] train-rmse:0.324124 
## [49] train-rmse:0.320942 
## [50] train-rmse:0.317802 
## [51] train-rmse:0.314916 
## [52] train-rmse:0.312056 
## [53] train-rmse:0.309000 
## [54] train-rmse:0.306146 
## [55] train-rmse:0.303145 
## [56] train-rmse:0.300965 
## [57] train-rmse:0.298020 
## [58] train-rmse:0.295145 
## [59] train-rmse:0.292491 
## [60] train-rmse:0.289984 
## [61] train-rmse:0.287465 
## [62] train-rmse:0.284759 
## [63] train-rmse:0.282080 
## [64] train-rmse:0.279484 
## [65] train-rmse:0.276756 
## [66] train-rmse:0.274052 
## [67] train-rmse:0.272105 
## [68] train-rmse:0.269571 
## [69] train-rmse:0.266943 
## [70] train-rmse:0.264335 
## [71] train-rmse:0.261917 
## [72] train-rmse:0.259475 
## [73] train-rmse:0.256950 
## [74] train-rmse:0.254480 
## [75] train-rmse:0.252008 
## [76] train-rmse:0.249851 
## [77] train-rmse:0.247587 
## [78] train-rmse:0.245171 
## [79] train-rmse:0.242807 
## [80] train-rmse:0.240453 
## [81] train-rmse:0.238108 
## [82] train-rmse:0.235800 
## [83] train-rmse:0.233528 
## [84] train-rmse:0.231270 
## [85] train-rmse:0.229246 
## [86] train-rmse:0.227162 
## [87] train-rmse:0.226059 
## [88] train-rmse:0.223979 
## [89] train-rmse:0.221808 
## [90] train-rmse:0.219652 
## [91] train-rmse:0.217669 
## [92] train-rmse:0.215563 
## [93] train-rmse:0.213624 
## [94] train-rmse:0.211550 
## [95] train-rmse:0.210330 
## [96] train-rmse:0.208669 
## [97] train-rmse:0.206816 
## [98] train-rmse:0.204811 
## [99] train-rmse:0.202835 
## [100]    train-rmse:0.201153

#eta=learning rate, colsample_bytree denes what percentage of features (columns)
# will be used for building each tree

#DISPLAYING FEATURE IMPORTANCE
print(xgb.importance(colnames(X_train), model=xgb.reg))

##                     Feature        Gain      Cover  Frequency
##  1:              Sugars (g) 0.512901628 0.23512479 0.08193980
##  2:        Cholesterol (mg) 0.240030685 0.18713799 0.10033445
##  3:                Calories 0.114895531 0.17797592 0.16053512
##  4: Total Carbohydrates (g) 0.049664692 0.12044090 0.14214047
##  5:           Total Fat (g) 0.025440894 0.06371327 0.14381271
##  6:             Protein (g) 0.017270739 0.05009302 0.08361204
##  7:           Trans Fat (g) 0.011239336 0.05132166 0.07357860
##  8:           Caffeine (mg) 0.010333830 0.05321726 0.08193980
##  9:       Dietary Fibre (g) 0.007736928 0.02667884 0.04515050
## 10:             Sodium (mg) 0.005726037 0.01786780 0.04515050
## 11:       Saturated Fat (g) 0.004759701 0.01642855 0.04180602

#Prediction accuracy for testing data
prediction.y<-predict(xgb.reg,X_test) #The numbers are not binary classification
#so we use a simple transformation shown at the code below

#accuracy within 10%
accuracy10<- ifelse(abs(y_test-prediction.y)<0.10*y_test,1,0)
print(sum(accuracy10)/length(accuracy10))

## [1] 0

#accuracy within 15%
accuracy15<- ifelse(abs(y_test-prediction.y)<0.15*y_test,1,0)
print(sum(accuracy15)/length(accuracy15))

## [1] 0

#accuracy within 20%
accuracy20<- ifelse(abs(y_test-prediction.y)<0.20*y_test,1,0)
print(sum(accuracy20)/length(accuracy20))

## [1] 0.125

#Transform this regression into a binary classification
prediction_binary<-as.numeric(prediction.y>0.5)
print(head(prediction_binary))

## [1] 1 1 1 0 0 1

#Model performance error
error <- mean(as.numeric(prediction.y > 0.5) != testing$healthy)
print(paste("test-error=", error))

## [1] "test-error= 0"

The test error is 0, it indicates that model did pretty well with the predictions. Meaning the model made great predictions on the test set in this context

cm<-table(testing$healthy, as.numeric(prediction.y > 0.5))
cm

##    
##      0  1
##   0 28  0
##   1  0 20

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Starbucks!

2024-01-16

Rmkdown