Tidyverse_Visualization_Assignment

# load libraries
library(ggplot2)
library(datasets)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)
library(dplyr)
library(knitr)
library(directlabels)
library(cowplot)

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

library(ggrepel)
library(dplyr)
library(forcats)

# Load the fast-food nutrition data from the CSV file in the working directory.
c <- read.csv(file = "data_fastfood_calories.csv")

# Inspect the dataset: list its column names.
names(c)

##  [1] "restaurant"  "item"        "calories"    "cal_fat"     "total_fat"  
##  [6] "sat_fat"     "trans_fat"   "cholesterol" "sodium"      "total_carb" 
## [11] "fiber"       "sugar"       "protein"     "vit_a"       "vit_c"      
## [16] "calcium"

# Print a per-column summary of the data frame (missing counts, quantiles,
# histograms) to check types and completeness before plotting.
skim(c)

Data summary
Name	c
Number of rows	515
Number of columns	16
_______________________
Column type frequency:
character	2
numeric	14
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
restaurant	0	1	5	11	0	8	0
item	0	1	5	63	0	505	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
calories	0	1.00	530.91	282.44	20	330.0	490.0	690	2430	▇▆▁▁▁
cal_fat	0	1.00	238.81	166.41	0	120.0	210.0	310	1270	▇▃▁▁▁
total_fat	0	1.00	26.59	18.41	0	14.0	23.0	35	141	▇▃▁▁▁
sat_fat	0	1.00	8.15	6.42	0	4.0	7.0	11	47	▇▃▁▁▁
trans_fat	0	1.00	0.47	0.84	0	0.0	0.0	1	8	▇▁▁▁▁
cholesterol	0	1.00	72.46	63.16	0	35.0	60.0	95	805	▇▁▁▁▁
sodium	0	1.00	1246.74	689.95	15	800.0	1110.0	1550	6080	▇▆▁▁▁
total_carb	0	1.00	45.66	24.88	0	28.5	44.0	57	156	▅▇▂▁▁
fiber	12	0.98	4.14	3.04	0	2.0	3.0	5	17	▇▅▂▁▁
sugar	0	1.00	7.26	6.76	0	3.0	6.0	9	87	▇▁▁▁▁
protein	1	1.00	27.89	17.68	1	16.0	24.5	36	186	▇▂▁▁▁
vit_a	214	0.58	18.86	31.38	0	4.0	10.0	20	180	▇▁▁▁▁
vit_c	210	0.59	20.17	30.59	0	4.0	10.0	30	400	▇▁▁▁▁
calcium	210	0.59	24.85	25.52	0	8.0	20.0	30	290	▇▁▁▁▁

# Pull out the variables of interest as standalone vectors.
# `[[` extracts exactly one column as a plain vector, and `<-` is used
# consistently for assignment.
restaurant <- c[["restaurant"]]
item <- c[["item"]]
calories <- c[["calories"]]
sodium <- c[["sodium"]]

# Add a logical is_salad column: TRUE when the item name contains "salad"
# (case-insensitive). The column is referenced by its bare name so that
# mutate() evaluates it against the piped data rather than the global object.
c <- c %>%
  mutate(is_salad = grepl("salad", item, ignore.case = TRUE))

# Confirm that the new column was appended.
colnames(c)

##  [1] "restaurant"  "item"        "calories"    "cal_fat"     "total_fat"  
##  [6] "sat_fat"     "trans_fat"   "cholesterol" "sodium"      "total_carb" 
## [11] "fiber"       "sugar"       "protein"     "vit_a"       "vit_c"      
## [16] "calcium"     "is_salad"

# Reorder the restaurant levels by median calories so the boxplots are drawn
# sorted by median. TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
c$restaurant <- with(
  c,
  reorder(restaurant, calories, FUN = median, na.rm = TRUE)
)

# Plot calories per restaurant on a log10 x axis. Boxplot outlier markers are
# suppressed; the individual items are drawn by geom_jitter(), coloured by
# is_salad.
ggplot(c, aes(x = calories, y = restaurant)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(color = is_salad)) +
  # scale_x_log10() applies the same log10 transformation without the `trans`
  # argument, which newer ggplot2 releases deprecate.
  scale_x_log10() +
  # The scale name sets the legend title; labels follow the FALSE, TRUE order.
  scale_color_discrete(
    name = "Is the entree a salad?",
    labels = c("Not a Salad", "Salad")
  ) +
  labs(x = "Calories (log10 scale)", y = "Restaurant") +
  theme_bw()

Tidyverse_Visualization_Assignment_problem4

2023-11-19