# Packages required by this analysis.
packages <- c(
  "dplyr", "stargazer", "ggplot2", "Amelia", "ggcorrplot",
  "lmtest", "e1071", "MASS", "readxl"
)

# Install any package that is not available, then attach each one in order.
# requireNamespace() is used for the availability check because
# installed.packages() reads metadata for every installed package and is
# documented as slow and unsuitable for testing a single package.
# The CRAN mirror is contacted over HTTPS rather than plain HTTP.
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(
      pkg,
      repos = "https://cran.rstudio.com/",
      dependencies = TRUE
    )
  }
  library(pkg, character.only = TRUE)
}
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
# Drop the helper variables left behind by the package-loading loop.
rm(packages, pkg)

# Amelia is a hard dependency: library() stops with an error when the package
# is missing, whereas require() only warns and returns FALSE.
library("Amelia")

# Remove every remaining object from the workspace.
# NOTE(review): this wipes anything created earlier in the session; confirm
# that no later chunk relies on objects defined before this point.
rm(list = ls())

# Run garbage collection; at top level this prints the memory usage table.
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 896633 47.9 1703552 91 1373288 73.4
Vcells 1555747 11.9 8388608 64 2507084 19.2
# Close all open graphics devices.
graphics.off()
# Write a form-feed character to the console; the original comment relies on
# this to clear the console.
cat("\f")
Warning: Unknown or uninitialised column: `arguments`.
Unknown or uninitialised column: `arguments`.
Warning: Unknown or uninitialised column: `imputations`.
2.2 Clean Data Visualization
# Disabled step: dropping rows with missing values is intentionally left
# commented out.
# vending_machine <- na.omit(vending_machine)

# Show the first six rows of the data as a quick preview.
head(vending_machine, n = 6L)
# A tibble: 6 × 14
M Name Age Sex Race City States Allergies Prediction Meals_Per_Day
<dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 1 Caily… 17 F White Bost… MA No 800 1.5
2 2 Aarav 15 M Asian Bost… MA No 1300 3
3 3 Joy 17 F Asian Shan… Taiyu… No 600 2
4 4 Levon 16 M Euro… Chic… IL No 750 1
5 5 Hannah 17 F Amer… Bost… MA No 800 1.5
6 6 Zirui… 16 F Asian Wenz… Zheji… No 200 3
# ℹ 4 more variables: Meal_Card_Balance <dbl>, Likes_food <chr>, Jobs <dbl>,
# checkouts <dbl>
# Per-column summary statistics for vending_machine, formatted with
# digits = 3. The former `type = "text"` argument was dropped: it is a
# stargazer() option, not an argument that summary() defines for data frames,
# so it only misled readers about what this call does.
summary(vending_machine, digits = 3)
M Name Age Sex
Min. : 1.0 Length:38 Min. :15.0 Length:38
1st Qu.:10.2 Class :character 1st Qu.:16.0 Class :character
Median :19.5 Mode :character Median :16.0 Mode :character
Mean :19.8 Mean :16.4
3rd Qu.:28.8 3rd Qu.:17.0
Max. :42.0 Max. :19.0
Race City States Allergies
Length:38 Length:38 Length:38 Length:38
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Prediction Meals_Per_Day Meal_Card_Balance Likes_food
Min. : 100 Min. :0.00 Min. : 0 Length:38
1st Qu.: 300 1st Qu.:1.50 1st Qu.:250 Class :character
Median : 500 Median :2.00 Median :300 Mode :character
Mean : 697 Mean :1.92 Mean :308
3rd Qu.: 788 3rd Qu.:2.50 3rd Qu.:362
Max. :3500 Max. :3.00 Max. :599
Jobs checkouts
Min. :0.000 Min. : 0.00
1st Qu.:0.000 1st Qu.: 4.00
Median :0.000 Median : 5.00
Mean :0.579 Mean : 6.32
3rd Qu.:1.000 3rd Qu.: 9.75
Max. :2.000 Max. :20.00
3. Data Analysis
3.1 Linear Regression for Meal_Card_Balance based on Likes_food, Sex, & checkouts
# Fit a linear regression of Meal_Card_Balance on the checkouts, Likes_food
# and Sex columns of vending_machine.
vending_machine_pos_model <- lm(
  formula = Meal_Card_Balance ~ checkouts + Likes_food + Sex,
  data = vending_machine
)

# Print the coefficient table and overall fit statistics for the model.
summary(vending_machine_pos_model)
Call:
lm(formula = Meal_Card_Balance ~ checkouts + Likes_food + Sex,
data = vending_machine)
Residuals:
Min 1Q Median 3Q Max
-249.617 -53.771 0.126 71.776 245.461
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 249.62 33.57 7.435 1.27e-08 ***
checkouts 10.39 4.07 2.553 0.0153 *
Likes_foodTRUE -47.02 38.18 -1.232 0.2266
SexM 27.20 38.06 0.715 0.4797
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 102.6 on 34 degrees of freedom
Multiple R-squared: 0.1888, Adjusted R-squared: 0.1172
F-statistic: 2.638 on 3 and 34 DF, p-value: 0.06533
# Line plot of actual (dark red) versus predicted (dark blue)
# Meal_Card_Balance against the number of checkouts.
# vending_machine_pos_predictions is expected to be built earlier in the
# document and to contain the checkouts, Meal_Card_Balance and predictions
# columns used here.
# The title previously read "Target Wins", which does not match the plotted
# variable; it now names the meal card balance.
ggplot(vending_machine_pos_predictions, aes(x = checkouts)) +
  geom_line(aes(y = Meal_Card_Balance), color = "darkred") +
  geom_line(aes(y = predictions), color = "darkblue") +
  ggtitle("Actual vs Predicted Meal Card Balance") +
  xlab("Number of Checkouts") +
  ylab("Meal Card Balance")
3.4 Linear Regression for Meal_Card_Balance based on Jobs & checkouts
# Fit a linear regression of Meal_Card_Balance on the Jobs and checkouts
# columns of vending_machine.
vending_machine_neg_model <- lm(
  formula = Meal_Card_Balance ~ Jobs + checkouts,
  data = vending_machine
)

# Print the coefficient table and overall fit statistics for the model.
summary(vending_machine_neg_model)
Call:
lm(formula = Meal_Card_Balance ~ Jobs + checkouts, data = vending_machine)
Residuals:
Min 1Q Median 3Q Max
-256.798 -56.485 -6.782 56.419 262.935
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 256.798 35.154 7.305 1.55e-08 ***
Jobs -18.137 24.919 -0.728 0.4716
checkouts 9.740 4.048 2.406 0.0215 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 102.6 on 35 degrees of freedom
Multiple R-squared: 0.1646, Adjusted R-squared: 0.1169
F-statistic: 3.449 on 2 and 35 DF, p-value: 0.04293
# Draw the standard lm diagnostic plots for the Jobs + checkouts model.
plot(x = vending_machine_neg_model)
3.5 Comparing Actual vs Predicted
# Build a table of the student id (M), the actual Meal_Card_Balance, the two
# predictors (Jobs, checkouts) and the balance predicted by
# vending_machine_neg_model for every row of vending_machine.
# The columns are narrowed first; predict() still receives the full
# vending_machine data, so the predictions line up row for row.
vending_machine_neg_predictions <- vending_machine %>%
  dplyr::select(M, Meal_Card_Balance, Jobs, checkouts) %>%
  mutate(
    predictions = predict(vending_machine_neg_model, newdata = vending_machine)
  )
Graph Meal_Card_Balance
# Line plot of actual (dark red) versus predicted (dark blue)
# Meal_Card_Balance against the number of jobs, using the predictions table
# built from vending_machine_neg_model.
# The title previously read "Target Wins", which does not match the plotted
# variable; it now names the meal card balance.
ggplot(vending_machine_neg_predictions, aes(x = Jobs)) +
  geom_line(aes(y = Meal_Card_Balance), color = "darkred") +
  geom_line(aes(y = predictions), color = "darkblue") +
  ggtitle("Actual vs Predicted Meal Card Balance") +
  xlab("# of Jobs") +
  ylab("Meal Card Balance")