Vending Machine Project

Author

Nathan

1. Data Setup

1.1 Package Installation

# Packages needed by this report. visdat is included because section 1.3
# calls visdat::vis_dat(), which fails if the package is not installed.
packages <- c(
  "dplyr", "stargazer", "ggplot2", "Amelia", "ggcorrplot", "lmtest",
  "e1071", "MASS", "readxl", "visdat"
)

# Install each package only when it is missing, then attach it.
# requireNamespace() is a cheap per-package check (installed.packages() scans
# the whole library on every iteration), and the CRAN mirror is reached over
# HTTPS rather than plain HTTP.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org", dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
Loading required package: Rcpp
## 
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
## 
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
rm(packages, pkg)  # Drop the loop helpers; nothing below uses them

# Amelia provides missmap(). library() stops with an error when the package is
# unavailable, whereas require() only returns FALSE and lets the script run on.
library("Amelia")

# NOTE: the workspace is intentionally not wiped with rm(list = ls()); doing so
# would delete unrelated objects of whoever sources this file. The only objects
# created so far were removed explicitly above.
gc()            # Release unused memory
          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  896633 47.9    1703552   91  1373288 73.4
Vcells 1555747 11.9    8388608   64  2507084 19.2
graphics.off()  # Close every open graphics device
cat("\f")       # Emit a form feed; RStudio treats it as "clear the console"

1.1.2 Converting Data Type

# vending_machine <- as.numeric(vending_machine$Race)

1.2 Data Import

# Import the survey workbook (readxl reads the first sheet by default).
# NOTE: the absolute path is specific to the author's machine.
vending_machine <- readxl::read_xlsx(
  path = "C:/Users/User/Downloads/datasetvending.xlsx"
)

1.3 Base Data Visualization

# Overview plot of the raw data frame (column classes and missing cells)
visdat::vis_dat(vending_machine)

2. Data Preparation

2.1 Data Clean

# Draw the missingness map. The tibble is converted to a plain data frame
# first: missmap() probes its input with `$arguments` / `$imputations`, and a
# tibble answers those lookups with "Unknown or uninitialised column" warnings.
Amelia::missmap(as.data.frame(vending_machine))
Warning: Unknown or uninitialised column: `arguments`.
Unknown or uninitialised column: `arguments`.
Warning: Unknown or uninitialised column: `imputations`.

2.2 Clean Data Visualization

# vending_machine <- na.omit(vending_machine)
head(vending_machine, n = 6L)  # Preview the first six rows
# A tibble: 6 × 14
      M Name     Age Sex   Race  City  States Allergies Prediction Meals_Per_Day
  <dbl> <chr>  <dbl> <chr> <chr> <chr> <chr>  <chr>          <dbl>         <dbl>
1     1 Caily…    17 F     White Bost… MA     No               800           1.5
2     2 Aarav     15 M     Asian Bost… MA     No              1300           3  
3     3 Joy       17 F     Asian Shan… Taiyu… No               600           2  
4     4 Levon     16 M     Euro… Chic… IL     No               750           1  
5     5 Hannah    17 F     Amer… Bost… MA     No               800           1.5
6     6 Zirui…    16 F     Asian Wenz… Zheji… No               200           3  
# ℹ 4 more variables: Meal_Card_Balance <dbl>, Likes_food <chr>, Jobs <dbl>,
#   checkouts <dbl>
# Per-column summary statistics. The former `type = "text"` argument belongs
# to stargazer(), not summary(); summary() silently ignored it, so dropping it
# leaves the printed table unchanged.
summary(vending_machine, digits = 3)
       M            Name                Age           Sex           
 Min.   : 1.0   Length:38          Min.   :15.0   Length:38         
 1st Qu.:10.2   Class :character   1st Qu.:16.0   Class :character  
 Median :19.5   Mode  :character   Median :16.0   Mode  :character  
 Mean   :19.8                      Mean   :16.4                     
 3rd Qu.:28.8                      3rd Qu.:17.0                     
 Max.   :42.0                      Max.   :19.0                     
     Race               City              States           Allergies        
 Length:38          Length:38          Length:38          Length:38         
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
   Prediction   Meals_Per_Day  Meal_Card_Balance  Likes_food       
 Min.   : 100   Min.   :0.00   Min.   :  0       Length:38         
 1st Qu.: 300   1st Qu.:1.50   1st Qu.:250       Class :character  
 Median : 500   Median :2.00   Median :300       Mode  :character  
 Mean   : 697   Mean   :1.92   Mean   :308                         
 3rd Qu.: 788   3rd Qu.:2.50   3rd Qu.:362                         
 Max.   :3500   Max.   :3.00   Max.   :599                         
      Jobs         checkouts    
 Min.   :0.000   Min.   : 0.00  
 1st Qu.:0.000   1st Qu.: 4.00  
 Median :0.000   Median : 5.00  
 Mean   :0.579   Mean   : 6.32  
 3rd Qu.:1.000   3rd Qu.: 9.75  
 Max.   :2.000   Max.   :20.00  

3. Data Analysis

3.1 Linear Regression for Meal_Card_Balance based on Likes_food, Sex, & checkouts

# Fit a linear model of Meal_Card_Balance on checkouts, Likes_food and Sex,
# then print its coefficient table and fit statistics.
vending_machine_pos_model <- lm(Meal_Card_Balance ~ checkouts + Likes_food + Sex, data = vending_machine)
summary(vending_machine_pos_model) # linear regression predicting Meal_Card_Balance from checkouts, Likes_food and Sex

Call:
lm(formula = Meal_Card_Balance ~ checkouts + Likes_food + Sex, 
    data = vending_machine)

Residuals:
     Min       1Q   Median       3Q      Max 
-249.617  -53.771    0.126   71.776  245.461 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      249.62      33.57   7.435 1.27e-08 ***
checkouts         10.39       4.07   2.553   0.0153 *  
Likes_foodTRUE   -47.02      38.18  -1.232   0.2266    
SexM              27.20      38.06   0.715   0.4797    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 102.6 on 34 degrees of freedom
Multiple R-squared:  0.1888,    Adjusted R-squared:  0.1172 
F-statistic: 2.638 on 3 and 34 DF,  p-value: 0.06533
plot(vending_machine_pos_model) # draw the regression diagnostic plots for the fitted model

3.2 Comparing Actual vs Predicted

# List the column names available for modelling
colnames(vending_machine)
 [1] "M"                 "Name"              "Age"              
 [4] "Sex"               "Race"              "City"             
 [7] "States"            "Allergies"         "Prediction"       
[10] "Meals_Per_Day"     "Meal_Card_Balance" "Likes_food"       
[13] "Jobs"              "checkouts"        
# Add the first model's predictions to every row and keep only the columns
# needed to compare actual and predicted balances. select() is namespaced
# because MASS masks dplyr's select() once both packages are attached.
vending_machine_pos_predictions <- vending_machine %>%
  mutate(predictions = predict(vending_machine_pos_model, newdata = vending_machine)) %>%
  dplyr::select(M, Meal_Card_Balance, checkouts, predictions) # compare actual Meal_Card_Balance with predictions
plot(vending_machine_pos_predictions) # scatterplot matrix of the selected columns

3.3 Graph Meal_Card_Balance Prediction Correlation

# Actual (dark red) vs model-predicted (dark blue) meal card balance against
# the number of checkouts. Points are drawn instead of lines: rows can share a
# checkout count, and the predictions also depend on Likes_food and Sex, so
# geom_line() would join unrelated observations with vertical zig-zags.
ggplot(vending_machine_pos_predictions, aes(x = checkouts)) +
  geom_point(aes(y = Meal_Card_Balance), color = "darkred") +
  geom_point(aes(y = predictions), color = "darkblue") +
  ggtitle("Actual vs Predicted Meal Card Balance") +
  xlab("Number of Checkouts") +
  ylab("Meal Card Balance")

3.4 Linear Regression for Meal_Card_Balance based on Jobs & checkouts

# Second model: Meal_Card_Balance regressed on Jobs and checkouts.
vending_machine_neg_model <- lm(Meal_Card_Balance ~ Jobs + checkouts, data = vending_machine)
summary(vending_machine_neg_model) # coefficient table and fit statistics

Call:
lm(formula = Meal_Card_Balance ~ Jobs + checkouts, data = vending_machine)

Residuals:
     Min       1Q   Median       3Q      Max 
-256.798  -56.485   -6.782   56.419  262.935 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  256.798     35.154   7.305 1.55e-08 ***
Jobs         -18.137     24.919  -0.728   0.4716    
checkouts      9.740      4.048   2.406   0.0215 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 102.6 on 35 degrees of freedom
Multiple R-squared:  0.1646,    Adjusted R-squared:  0.1169 
F-statistic: 3.449 on 2 and 35 DF,  p-value: 0.04293
# Draw the regression diagnostic plots for the Jobs + checkouts model
plot(vending_machine_neg_model)

3.5 Comparing Actual vs Predicted

# Add the second model's predictions to every row and keep only the columns
# needed to compare actual and predicted balances. select() is namespaced
# because MASS masks dplyr's select() once both packages are attached.
vending_machine_neg_predictions <- vending_machine %>%
  mutate(predictions = predict(vending_machine_neg_model, newdata = vending_machine)) %>%
  dplyr::select(M, Meal_Card_Balance, Jobs, checkouts, predictions) # predicted Meal_Card_Balance from Jobs and checkouts, next to the actual values

Graph Meal_Card_Balance

# Actual (dark red) vs model-predicted (dark blue) meal card balance against
# the number of jobs. Jobs only ranges from 0 to 2, so many rows share an x
# value; points are used because geom_line() would connect them into
# meaningless vertical segments.
ggplot(vending_machine_neg_predictions, aes(x = Jobs)) +
  geom_point(aes(y = Meal_Card_Balance), color = "darkred") +
  geom_point(aes(y = predictions), color = "darkblue") +
  ggtitle("Actual vs Predicted Meal Card Balance") +
  xlab("# of Jobs") +
  ylab("Meal Card Balance")

3.6 Correlation Matrix

# Correlation matrix of the numeric variables. Columns are picked by name
# (formerly positions 3, 9, 10, 11, 13 and 14) so the call keeps selecting the
# same variables if the spreadsheet's column order changes. The row index M
# and all character columns are excluded.
vending_machine_cor <- cor(
  vending_machine[, c("Age", "Prediction", "Meals_Per_Day",
                      "Meal_Card_Balance", "Jobs", "checkouts")]
)

# Axis text is controlled through tl.cex / tl.srt: ggcorrplot applies its own
# axis.text settings from those arguments after adding `ggtheme`, so
# axis.text overrides placed inside `ggtheme` do not take effect.
ggcorrplot(
  vending_machine_cor,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 2,   # size of the coefficient labels inside the tiles
  method = "square",
  colors = c("tomato2", "white", "blue"),
  title = "Correlation Matrix",
  tl.cex = 12,    # size of the variable names on both axes
  tl.srt = 90,    # rotate the x-axis variable names
  ggtheme = ggplot2::theme_minimal() +
    theme(plot.margin = unit(c(1, 1, 1, 1), "cm"))  # widen the plot margins
)

3.7 Histogram of Meal Card Balance

# Distribution of meal card balances. An explicit bin width replaces the
# default of 30 bins (which is too fine for 38 observations and triggers the
# stat_bin() message); bins start at 0, the minimum balance.
ggplot(vending_machine, mapping = aes(x = Meal_Card_Balance)) +
  geom_histogram(binwidth = 50, boundary = 0)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.