# Load libraries
library(ggplot2)
library(datasets)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(dplyr)
library(knitr)
library(directlabels)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggrepel)
library(dplyr)
library(forcats)
# Read the fast-food nutrition data from the working directory.
# NOTE(review): the data frame is named `c`, the same name as base::c();
# calls such as c("a", "b") still resolve to the function, but a more
# descriptive name would be clearer.
c <- read.csv(file = "data_fastfood_calories.csv")

# Inspect the dataset: list the column names.
names(c)
##  [1] "restaurant"  "item"        "calories"    "cal_fat"     "total_fat"  
##  [6] "sat_fat"     "trans_fat"   "cholesterol" "sodium"      "total_carb" 
## [11] "fiber"       "sugar"       "protein"     "vit_a"       "vit_c"      
## [16] "calcium"
# Per-column summary of the dataset (missing counts, quantiles, histograms).
skimr::skim(c)
Data summary
Name c
Number of rows 515
Number of columns 16
_______________________
Column type frequency:
character 2
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
restaurant 0 1 5 11 0 8 0
item 0 1 5 63 0 505 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
calories 0 1.00 530.91 282.44 20 330.0 490.0 690 2430 ▇▆▁▁▁
cal_fat 0 1.00 238.81 166.41 0 120.0 210.0 310 1270 ▇▃▁▁▁
total_fat 0 1.00 26.59 18.41 0 14.0 23.0 35 141 ▇▃▁▁▁
sat_fat 0 1.00 8.15 6.42 0 4.0 7.0 11 47 ▇▃▁▁▁
trans_fat 0 1.00 0.47 0.84 0 0.0 0.0 1 8 ▇▁▁▁▁
cholesterol 0 1.00 72.46 63.16 0 35.0 60.0 95 805 ▇▁▁▁▁
sodium 0 1.00 1246.74 689.95 15 800.0 1110.0 1550 6080 ▇▆▁▁▁
total_carb 0 1.00 45.66 24.88 0 28.5 44.0 57 156 ▅▇▂▁▁
fiber 12 0.98 4.14 3.04 0 2.0 3.0 5 17 ▇▅▂▁▁
sugar 0 1.00 7.26 6.76 0 3.0 6.0 9 87 ▇▁▁▁▁
protein 1 1.00 27.89 17.68 1 16.0 24.5 36 186 ▇▂▁▁▁
vit_a 214 0.58 18.86 31.38 0 4.0 10.0 20 180 ▇▁▁▁▁
vit_c 210 0.59 20.17 30.59 0 4.0 10.0 30 400 ▇▁▁▁▁
calcium 210 0.59 24.85 25.52 0 8.0 20.0 30 290 ▇▁▁▁▁
# Pull the variables of interest out of the data frame as plain vectors.
# These are snapshots taken at this point in the script: later changes to
# columns of `c` are not reflected in these standalone copies.
restaurant <- c[["restaurant"]]
item <- c[["item"]]
calories <- c[["calories"]]
sodium <- c[["sodium"]]

# Add a logical is_salad column: TRUE when the item name contains "salad"
# (case-insensitive), FALSE otherwise.
# `item` is referenced as a bare column name so mutate() reads it from the
# data it is currently operating on, rather than from the outer `c$item`
# vector, which would silently misalign if rows were filtered or grouped.
c <- c %>%
  mutate(is_salad = grepl("salad", item, ignore.case = TRUE))

colnames(c)
##  [1] "restaurant"  "item"        "calories"    "cal_fat"     "total_fat"  
##  [6] "sat_fat"     "trans_fat"   "cholesterol" "sodium"      "total_carb" 
## [11] "fiber"       "sugar"       "protein"     "vit_a"       "vit_c"      
## [16] "calcium"     "is_salad"
# Convert restaurant to a factor whose levels are ordered by each
# restaurant's median calories, so the boxplots are drawn sorted by median.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
c$restaurant <- with(
  c,
  reorder(restaurant, calories, FUN = median, na.rm = TRUE)
)

# Plot calories per restaurant: one horizontal boxplot per restaurant, with
# every menu item overlaid as a jittered point coloured by is_salad.
ggplot(c, aes(x = calories, y = restaurant)) +
  # Boxplot outlier markers are suppressed because the jitter layer already
  # draws every item; otherwise those items would appear twice.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = is_salad)) +
  # Log10 x axis; scale_x_log10() avoids the `trans` argument, which newer
  # ggplot2 releases deprecate in favour of `transform`.
  scale_x_log10() +
  # Legend labels are keyed by the logical level they describe, so they cannot
  # be attached to the wrong level. The scale name sets the legend title.
  scale_color_discrete(
    name = "Is the entree a salad?",
    labels = c(`FALSE` = "Not a Salad", `TRUE` = "Salad")
  ) +
  labs(x = "Calories (log10 scale)", y = "Restaurant") +
  theme_bw()