We’ll be using the titanic.rda, Aus_athletes.rda, and cdc.txt datasets.
# Load package(s)
library(ggplot2)
library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
# Load the saved .rda datasets (later code relies on the `titanic` and
# `Aus_athletes` objects, presumably restored by these two files)
load("data/titanic.rda")
load("data/Aus_athletes.rda")

# Read the pipe-delimited cdc file
cdc <- read_delim("data/cdc.txt", delim = "|")

# Recode general health as a factor ordered from best to worst, with
# capitalised display labels
cdc <- mutate(
  cdc,
  genhlth = factor(
    genhlth,
    levels = c("excellent", "very good", "good", "fair", "poor"),
    labels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
  )
)
## Parsed with column specification:
## cols(
## genhlth = col_character(),
## exerany = col_double(),
## hlthplan = col_double(),
## smoke100 = col_double(),
## height = col_double(),
## weight = col_double(),
## wtdesire = col_double(),
## age = col_double(),
## gender = col_character()
## )
# Fix the RNG state so the random subset below is reproducible
set.seed(8221984)

# Draw 1000 rows of cdc at random
cdc_small <- sample_n(cdc, size = 1000)
Above, I loaded the ggplot2, tidyverse, dplyr, and cowplot packages, along with the titanic.rda, Aus_athletes.rda, and cdc.txt datasets, and drew a reproducible random sample of 1000 rows from cdc (cdc_small).
# Strip text for the facets: passenger classes are shown as-is, while the
# 0/1 survival codes are spelled out
labels <- c(
  "1st" = "1st",
  "2nd" = "2nd",
  "3rd" = "3rd",
  "0" = "died",
  "1" = "survived"
)

# Passenger counts by sex, with one panel per survival outcome (rows) and
# passenger class (columns); the fill legend is redundant with the x axis
ggplot(titanic, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  facet_grid(
    survived ~ class,
    scales = "free",
    labeller = as_labeller(labels)
  ) +
  scale_fill_manual(values = c("#D55E00D0", "#0072B2D0")) +
  theme_minimal() +
  labs(x = NULL)
Here, I created a faceted bar plot with facet_grid, split by the survived and class variables. I believe this lets us effectively compare survival by gender and class at a single glance.
# Sports in which both sexes appear: count the distinct sexes recorded for
# each sport and keep the sports where that count is two
both_sports <- Aus_athletes %>%
  group_by(sport) %>%
  summarise(n = n_distinct(sex)) %>%
  filter(n == 2) %>%
  pull(sport)

# Keep only athletes in those sports, then collapse the two track events
# into a single "track" category; every other sport keeps its name
athletes_dat <- Aus_athletes %>%
  filter(sport %in% both_sports) %>%
  mutate(
    sport = case_when(
      sport %in% c("track (400m)", "track (sprint)") ~ "track",
      TRUE ~ sport
    )
  )
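Above, the athletes_dat dataset is formed from the Aus_athletes dataset by keeping only the sports played by both sexes and merging the two track events (400m and sprint) into a single "track" category.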
# plot.a: bar chart of the number of athletes of each sex. The y axis starts
# flush at zero (no expansion) and runs to 95 with ticks every 25.
plot.a <- ggplot(athletes_dat, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c("#D55E00D0", "#0072B2D0")) +
  scale_x_discrete(
    breaks = c("f", "m"),
    labels = c("female", "male")
  ) +
  scale_y_continuous(
    limits = c(0, 95),
    breaks = seq(0, 75, by = 25),
    expand = c(0, 0)
  ) +
  theme_minimal() +
  labs(x = NULL, y = "number")
# plot.b: scatter plot of rcc (x) against wcc (y), with points coloured by
# sex and the colour legend suppressed
plot.b <- ggplot(athletes_dat, aes(x = rcc, y = wcc, colour = sex)) +
  geom_point(size = 3, show.legend = FALSE) +
  scale_colour_manual(values = c("#D55E00D0", "#0072B2D0")) +
  theme_minimal() +
  labs(x = "RBC count", y = "WBC count")
# plot.c: box plots of pcBfat for each sport, split by sex. Boxes use a
# semi-transparent fill with a solid outline of the same hue, and the
# legend sits horizontally in the top-right corner of the panel.
plot.c <- ggplot(athletes_dat, aes(sport, pcBfat)) +
  theme_minimal() +
  geom_boxplot(aes(fill = sex, colour = sex), width = 0.5) +
  scale_fill_manual(
    values = c("#D55E0040", "#0072B240"),
    breaks = c("f", "m"),
    labels = c("female", "male")
  ) +
  scale_colour_manual(
    values = c("#D55E00", "#0072B2"),
    breaks = c("f", "m"),
    labels = c("female", "male")
  ) +
  theme(
    legend.justification = c(1, 1),
    legend.position = c(1, 1),
    legend.title = element_blank(),
    legend.direction = "horizontal"
  ) +
  # Legend keys: drop the outline and show only the fill colours. `fill`
  # must be an element of the override.aes list; passed as a separate
  # argument to guide_legend() it is silently ignored.
  guides(
    colour = guide_legend(
      override.aes = list(
        colour = NA,
        fill = c("#D55E0040", "#0072B240")
      )
    )
  ) +
  xlab(NULL) +
  ylab("% body fat")
# Combine plots: plot.a and plot.b side by side on top, plot.c underneath.
#
# Align the left axis of plot.c with that of plot.a, the left-most plot of
# the top row. The axis spec must be the letter "l" (left); the digit "1"
# matches no axis. Alignment has to be done against an individual plot, not
# against an already-assembled grid, and the aligned versions returned by
# align_plots() are the ones that must be placed in the final figure.
plots <- cowplot::align_plots(plot.c, plot.a, align = "v", axis = "l")

# Top row: aligned plot.a next to plot.b, without panel labels
top_row_graphs <- cowplot::plot_grid(plots[[2]], plot.b, labels = c("", ""))

# Final figure: top row above the aligned plot.c, which gets slightly more
# vertical space
cowplot::plot_grid(
  top_row_graphs, plots[[1]],
  labels = c("", ""),
  ncol = 1,
  rel_heights = c(1, 1.2)
)
Above, I combined the three plots (plot.a, plot.b, and plot.c), all built from the athletes_dat dataset, into a single figure using the plot_grid function from the cowplot package.
# Modify data: add two derived columns to cdc_small
#   Weight_Loss_Gain_in_Pounds - desired weight minus current weight
#   genhlth_f                  - genhlth with its levels ordered from
#                                "Poor" up to "Excellent"
cdc_small <- cdc_small %>%
  mutate(
    Weight_Loss_Gain_in_Pounds = wtdesire - weight,
    genhlth_f = factor(
      genhlth,
      levels = c("Poor", "Fair", "Good", "Very Good", "Excellent")
    )
  )
Above, I modified the cdc_small dataset by adding a new variable, “Weight_Loss_Gain_in_Pounds”, which equals the wtdesire variable minus the weight variable, and a reordered copy of the general health factor, genhlth_f, whose levels run from Poor to Excellent.
# Strip text for the facets: health levels are shown as-is, while the
# gender codes are spelled out
labels3 <- c(
  "Poor" = "Poor",
  "Fair" = "Fair",
  "Good" = "Good",
  "Very Good" = "Very Good",
  "Excellent" = "Excellent",
  "f" = "Women",
  "m" = "Men"
)

# Copy of the data without the gender column; because it lacks that
# faceting variable, its points are drawn in every gender row as a grey
# backdrop
df2 <- cdc_small %>% dplyr::select(-gender)

# Weight change desired against current weight, one panel per gender (rows)
# and general health level (columns); the coloured points drawn on top are
# the observations belonging to that panel
ggplot(cdc_small, aes(x = weight, y = Weight_Loss_Gain_in_Pounds)) +
  geom_point(data = df2, colour = "grey80") +
  geom_point(aes(colour = gender), size = 2, show.legend = FALSE) +
  facet_grid(gender ~ genhlth_f, labeller = as_labeller(labels3)) +
  scale_colour_manual(values = c("#D55E00D0", "#0072B2D0")) +
  scale_x_continuous(breaks = c(100, 200, 300)) +
  scale_y_continuous(breaks = c(-100, 0)) +
  theme_minimal() +
  labs(x = "Weight(lbs)", y = "Weight Loss/Gain in Pounds")
Here, I created a new dataset, df2, which drops the gender variable so that all observations can be drawn as grey points in the background of every panel, with only the points for the panel's own gender shown in color. The facet_grid function made this possible by splitting the plots by gender and general health status.