# load libraries
library(ggplot2)
library(datasets)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(dplyr)
library(knitr)
library(directlabels)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggrepel)
library(dplyr)
library(forcats)
# Load the fast-food nutrition table from a CSV file; the relative path is
# resolved against the current working directory.
# NOTE(review): the data frame is bound to the name `c`, which shadows
# base::c as a variable. Calls such as c("a", "b") still resolve to the base
# function, but a more descriptive name would be clearer.
c <- read.csv(file = "data_fastfood_calories.csv")
# List the column names to see which variables are available.
names(c)
## [1] "restaurant" "item" "calories" "cal_fat" "total_fat"
## [6] "sat_fat" "trans_fat" "cholesterol" "sodium" "total_carb"
## [11] "fiber" "sugar" "protein" "vit_a" "vit_c"
## [16] "calcium"
# Print a per-column overview of the data frame (missing counts, summary
# statistics, inline histograms) using skimr.
skimr::skim(c)
Data summary

|                        |      |
|:-----------------------|:-----|
| Name                   | c    |
| Number of rows         | 515  |
| Number of columns      | 16   |
| Column type frequency: |      |
| character              | 2    |
| numeric                | 14   |
| Group variables        | None |

Variable type: character

| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:--------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| restaurant    |         0 |             1 |   5 |  11 |     0 |        8 |          0 |
| item          |         0 |             1 |   5 |  63 |     0 |      505 |          0 |

Variable type: numeric

| skim_variable | n_missing | complete_rate |    mean |     sd | p0 |   p25 |    p50 |  p75 | p100 | hist  |
|:--------------|----------:|--------------:|--------:|-------:|---:|------:|-------:|-----:|-----:|:------|
| calories      |         0 |          1.00 |  530.91 | 282.44 | 20 | 330.0 |  490.0 |  690 | 2430 | ▇▆▁▁▁ |
| cal_fat       |         0 |          1.00 |  238.81 | 166.41 |  0 | 120.0 |  210.0 |  310 | 1270 | ▇▃▁▁▁ |
| total_fat     |         0 |          1.00 |   26.59 |  18.41 |  0 |  14.0 |   23.0 |   35 |  141 | ▇▃▁▁▁ |
| sat_fat       |         0 |          1.00 |    8.15 |   6.42 |  0 |   4.0 |    7.0 |   11 |   47 | ▇▃▁▁▁ |
| trans_fat     |         0 |          1.00 |    0.47 |   0.84 |  0 |   0.0 |    0.0 |    1 |    8 | ▇▁▁▁▁ |
| cholesterol   |         0 |          1.00 |   72.46 |  63.16 |  0 |  35.0 |   60.0 |   95 |  805 | ▇▁▁▁▁ |
| sodium        |         0 |          1.00 | 1246.74 | 689.95 | 15 | 800.0 | 1110.0 | 1550 | 6080 | ▇▆▁▁▁ |
| total_carb    |         0 |          1.00 |   45.66 |  24.88 |  0 |  28.5 |   44.0 |   57 |  156 | ▅▇▂▁▁ |
| fiber         |        12 |          0.98 |    4.14 |   3.04 |  0 |   2.0 |    3.0 |    5 |   17 | ▇▅▂▁▁ |
| sugar         |         0 |          1.00 |    7.26 |   6.76 |  0 |   3.0 |    6.0 |    9 |   87 | ▇▁▁▁▁ |
| protein       |         1 |          1.00 |   27.89 |  17.68 |  1 |  16.0 |   24.5 |   36 |  186 | ▇▂▁▁▁ |
| vit_a         |       214 |          0.58 |   18.86 |  31.38 |  0 |   4.0 |   10.0 |   20 |  180 | ▇▁▁▁▁ |
| vit_c         |       210 |          0.59 |   20.17 |  30.59 |  0 |   4.0 |   10.0 |   30 |  400 | ▇▁▁▁▁ |
| calcium       |       210 |          0.59 |   24.85 |  25.52 |  0 |   8.0 |   20.0 |   30 |  290 | ▇▁▁▁▁ |
# Pull the columns of interest out of the data frame as plain vectors.
# `[[` extracts a single column as a vector, matching what `c[, "name"]`
# returns for a data.frame, and `<-` is used consistently for assignment.
restaurant <- c[["restaurant"]]
item <- c[["item"]]
calories <- c[["calories"]]
sodium <- c[["sodium"]]
# Add a logical `is_salad` column: TRUE when the item name contains "salad"
# (case-insensitive match).
# The column is referenced as `item` through mutate()'s data masking rather
# than `c$item`, so the expression always uses the rows mutate() is operating
# on (reaching back into the outer data frame breaks on grouped or filtered
# pipelines).
c <- c %>%
  mutate(is_salad = grepl("salad", item, ignore.case = TRUE))
# Confirm the new column was appended.
colnames(c)
## [1] "restaurant" "item" "calories" "cal_fat" "total_fat"
## [6] "sat_fat" "trans_fat" "cholesterol" "sodium" "total_carb"
## [11] "fiber" "sugar" "protein" "vit_a" "vit_c"
## [16] "calcium" "is_salad"
# Turn `restaurant` into a factor whose levels are ordered by each
# restaurant's median calories, so the boxplots are drawn in that order.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned; `na.rm` is forwarded to median().
c$restaurant <- with(
  c,
  reorder(restaurant, calories, FUN = median, na.rm = TRUE)
)
# Plot calories per restaurant: one horizontal boxplot per restaurant with
# every menu item overlaid as a jittered point, coloured by `is_salad`.
ggplot(c, aes(x = calories, y = restaurant)) +
  # Boxplot outliers are hidden because geom_jitter() below already draws
  # every item as a point; showing both would plot those items twice.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = is_salad)) +
  # Same log10 transform as scale_x_continuous(trans = "log10"), without
  # relying on the `trans` argument name.
  scale_x_log10() +
  # The legend title is set once here (the earlier labs(color = ...) was
  # redundant). Labels are keyed by level so they cannot be swapped or
  # misapplied if only one of TRUE/FALSE is present in the data.
  scale_color_discrete(
    name = "Is the entree a salad?",
    labels = c("FALSE" = "Not a Salad", "TRUE" = "Salad")
  ) +
  xlab("Calories (log10 scale)") +
  ylab("Restaurant") +
  theme_bw()
