library(tidyverse)
TRUE ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
TRUE ✔ dplyr 1.1.4 ✔ readr 2.1.5
TRUE ✔ forcats 1.0.0 ✔ stringr 1.5.1
TRUE ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
TRUE ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
TRUE ✔ purrr 1.0.2
TRUE ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
TRUE ✖ dplyr::filter() masks stats::filter()
TRUE ✖ dplyr::lag() masks stats::lag()
TRUE ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Fetch the two workshop CSV files into a local "data" folder and load the
# malaria case data.
data_dir <- "data"
base_url <- paste0(
  "https://raw.githubusercontent.com/AMMnet/AMMnet-Hackathon/",
  "main/01_data-vis/data"
)

# dir.create() emits a warning when the folder already exists, so only create
# it when it is missing; this keeps the script quiet when it is re-run.
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

# Download both files with the same URL/destination naming scheme.
for (csv_name in c("mockdata_cases.csv", "mosq_mock.csv")) {
  download.file(
    paste0(base_url, "/", csv_name),
    destfile = file.path(data_dir, csv_name)
  )
}

malaria_data <- read_csv(file.path(data_dir, "mockdata_cases.csv"))
TRUE Rows: 514 Columns: 10
TRUE ── Column specification ────────────────────────────────────────────────────────
TRUE Delimiter: ","
TRUE chr (2): location, ages
TRUE dbl (8): month, year, total, positive, xcoord, ycoord, prev, time_order_loc
TRUE
TRUE ℹ Use `spec()` to retrieve the full column specification for this data.
TRUE ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the mosquito dataset (data/mosq_mock.csv) into `mosquito_data`.
mosquito_data <- read_csv("data/mosq_mock.csv")
TRUE Rows: 104 Columns: 19
TRUE ── Column specification ────────────────────────────────────────────────────────
TRUE Delimiter: ","
TRUE chr (4): Village, Method, Location, hour
TRUE dbl (15): session, Compound.ID, ag.Male, Ag.unfed, Ag.halffed, Ag.fed, Ag.gr...
TRUE
TRUE ℹ Use `spec()` to retrieve the full column specification for this data.
TRUE ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#' Before we start visualizing our data, we need to understand the
#' characteristics of our data. The goal is to get an idea of the
#' data structure and to understand the relationships between variables.
# Number of rows and columns in the case data.
dim(malaria_data)
TRUE [1] 514 10
# Preview the first rows, then print per-column summary statistics.
head(malaria_data)
summary(malaria_data)
TRUE location month year ages
TRUE Length:514 Min. : 1.000 Min. :2018 Length:514
TRUE Class :character 1st Qu.: 4.000 1st Qu.:2018 Class :character
TRUE Mode :character Median : 7.000 Median :2019 Mode :character
TRUE Mean : 6.486 Mean :2019
TRUE 3rd Qu.: 9.000 3rd Qu.:2020
TRUE Max. :12.000 Max. :2020
TRUE total positive xcoord ycoord
TRUE Min. : 20.0 Min. : -1.00 Min. :-21.84 Min. :28.52
TRUE 1st Qu.: 46.0 1st Qu.: 14.00 1st Qu.:-20.39 1st Qu.:29.64
TRUE Median :103.0 Median : 33.00 Median :-20.06 Median :29.99
TRUE Mean :141.5 Mean : 47.81 Mean :-20.04 Mean :30.00
TRUE 3rd Qu.:206.0 3rd Qu.: 67.00 3rd Qu.:-19.71 3rd Qu.:30.32
TRUE Max. :611.0 Max. :264.00 Max. :-18.79 Max. :31.81
TRUE prev time_order_loc
TRUE Min. :-0.04545 Min. : 1.00
TRUE 1st Qu.: 0.24615 1st Qu.: 9.00
TRUE Median : 0.33016 Median :18.00
TRUE Mean : 0.31518 Mean :17.65
TRUE 3rd Qu.: 0.39024 3rd Qu.:26.00
TRUE Max. : 0.53488 Max. :35.00
# Prints the full column, one entry per row of the data.
malaria_data$location # values for a single column
TRUE [1] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [6] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [11] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [16] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [21] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [26] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [31] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [36] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [41] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [46] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [51] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [56] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [61] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [66] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [71] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [76] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [81] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [86] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [91] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [96] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [101] "mordor" "mordor" "mordor" "mordor" "mordor"
TRUE [106] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [111] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [116] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [121] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [126] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [131] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [136] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [141] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [146] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [151] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [156] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [161] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [166] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [171] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [176] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [181] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [186] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [191] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [196] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [201] "narnia" "narnia" "narnia" "narnia" "narnia"
TRUE [206] "narnia" "narnia" "narnia" "narnia" "neverwhere"
TRUE [211] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [216] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [221] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [226] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [231] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [236] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [241] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [246] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [251] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [256] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [261] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [266] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [271] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [276] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [281] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [286] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [291] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [296] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [301] "neverwhere" "neverwhere" "neverwhere" "neverwhere" "neverwhere"
TRUE [306] "oz" "oz" "oz" "oz" "oz"
TRUE [311] "oz" "oz" "oz" "oz" "oz"
TRUE [316] "oz" "oz" "oz" "oz" "oz"
TRUE [321] "oz" "oz" "oz" "oz" "oz"
TRUE [326] "oz" "oz" "oz" "oz" "oz"
TRUE [331] "oz" "oz" "oz" "oz" "oz"
TRUE [336] "oz" "oz" "oz" "oz" "oz"
TRUE [341] "oz" "oz" "oz" "oz" "oz"
TRUE [346] "oz" "oz" "oz" "oz" "oz"
TRUE [351] "oz" "oz" "oz" "oz" "oz"
TRUE [356] "oz" "oz" "oz" "oz" "oz"
TRUE [361] "oz" "oz" "oz" "oz" "oz"
TRUE [366] "oz" "oz" "oz" "oz" "oz"
TRUE [371] "oz" "oz" "oz" "oz" "oz"
TRUE [376] "oz" "oz" "oz" "oz" "oz"
TRUE [381] "oz" "oz" "oz" "oz" "oz"
TRUE [386] "oz" "oz" "oz" "oz" "oz"
TRUE [391] "oz" "oz" "oz" "oz" "oz"
TRUE [396] "oz" "oz" "oz" "oz" "oz"
TRUE [401] "oz" "oz" "oz" "oz" "oz"
TRUE [406] "oz" "oz" "oz" "oz" "wonderland"
TRUE [411] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [416] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [421] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [426] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [431] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [436] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [441] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [446] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [451] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [456] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [461] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [466] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [471] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [476] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [481] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [486] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [491] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [496] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [501] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [506] "wonderland" "wonderland" "wonderland" "wonderland" "wonderland"
TRUE [511] "wonderland" "wonderland" "wonderland" "wonderland"
# Collapse the column to its distinct values.
unique(malaria_data$location) # unique values for a single column
TRUE [1] "mordor" "narnia" "neverwhere" "oz" "wonderland"
# Count how many rows fall in each location.
table(malaria_data$location) # frequencies for a single column
TRUE
TRUE mordor narnia neverwhere oz wonderland
TRUE 105 104 96 104 105
# Cross-tabulate rows by location (rows) and age group (columns).
table(malaria_data$location, malaria_data$ages) # frequencies for multiple columns
TRUE
TRUE 15_above 5_to_14 under_5
TRUE mordor 35 35 35
TRUE narnia 35 35 34
TRUE neverwhere 32 32 32
TRUE oz 35 35 34
TRUE wonderland 35 35 35
# Total number of missing values across the whole data frame.
sum(is.na(malaria_data))
TRUE [1] 0
# Number of missing values in each column.
colSums(is.na(malaria_data))
TRUE location month year ages total
TRUE 0 0 0 0 0
TRUE positive xcoord ycoord prev time_order_loc
TRUE 0 0 0 0 0
#' First, we will look at some exploratory data visualization
#' techniques using base R functions. The purpose of these plots
#' is to help us understand the relationships between variables and
#' characteristics of our data. They are useful for quickly exploring
#' the data and understanding the relationships, but they are
#' not great for sharing in scientific publications/presentations.
# Histogram of prevalence with all defaults (title and axis label are taken
# from the argument expression).
hist(malaria_data$prev)
# Same histogram with an explicit break count, title, axis labels and colours.
hist(malaria_data$prev,
breaks = 10,
main = "Distribution of Malaria Prevalence",
xlab = "Malaria Prevalence",
ylab = "Frequency",
col = "purple",
border = "black")
# Bar chart of the number of rows per age group.
barplot(table(malaria_data$ages))
# The counts that the bar chart above is drawn from.
table(malaria_data$ages)
TRUE
TRUE 15_above 5_to_14 under_5
TRUE 172 172 170
# Bar charts of the number of rows per location and per year.
barplot(table(malaria_data$location))
barplot(table(malaria_data$year))
# Scatter plots: positive against total, then prevalence against month.
plot(malaria_data$total, malaria_data$positive)
plot(malaria_data$month, malaria_data$prev)
# Keep only the January rows (month == 1) and plot their prevalence values.
plot_jan <- malaria_data %>%
  filter(month == 1)
plot(x = plot_jan$month, y = plot_jan$prev)
# Box plots of prevalence grouped by month, then grouped by location.
boxplot(malaria_data$prev ~ malaria_data$month)
boxplot(malaria_data$prev ~ malaria_data$location)
#' ggplot2 is a popular visualization package for R. It provides
#' an easy-to-use interface for creating data visualizations.
#' The ggplot2 package is based on the “grammar of graphics”
#' and is a powerful way to create complex visualizations that
#' are useful for creating scientific and publication-quality
#' figures.
#'
#' The “grammar of graphics” used in ggplot2 is a set of rules that are
#' used to develop data visualizations using a layering approach. Layers
#' are added using the “+” operator.
#' # Components of a ggplot
#' There are three main components of a ggplot:
#' 1. The data: the dataset we want to visualize
#' 2. The aesthetics: the visual properties from the data used in the plot
#' 3. The geometries: the visual representations of the data (e.g., points, lines, bars)
#' All ggplot2 plots require a data frame as input. Just running this
#' line will produce a blank plot because we have not yet stated which
#' elements from the data we want to visualize or how we want to visualize them.
# Data only: no aesthetics or geoms have been supplied yet.
ggplot(malaria_data)
#' Next, we need to specify the visual properties of the plot that are determined
#' by the data. The aesthetics are specified using the aes() function. The output should
#' now produce a blank plot but with determined visual properties (e.g., axes labels).
# Data plus x/y mappings, still without any geom layer.
ggplot(malaria_data, mapping = aes(x = total, y = positive))
#' Finally, we need to specify the visual representation of the data. The
#' geometries are specified using the geom_*() functions. There are many
#' different geometries that can be used in ggplot2. We will use geom_point
#' in this example and we will append it to the previous plot using the
#' “+” operator. The output should now produce a plot with the specified visual
#' representation of the data.
# Scatter plot of positive against total.
ggplot(malaria_data, aes(total, positive)) +
  geom_point()

# Histogram of prevalence; `bins` sets the number of bars.
ggplot(malaria_data, aes(prev)) +
  geom_histogram(bins = 20)

# Bar chart of row counts per year; `fill` sets the colour of the bars.
ggplot(malaria_data, aes(year)) +
  geom_bar(fill = "tomato")

# Box plot of prevalence per location, overlaid with jittered points;
# `alpha` sets the transparency of those points.
ggplot(malaria_data, aes(location, prev)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2)

# Violin plot: like a box plot, but it also shows the shape of the
# distribution.
ggplot(malaria_data, aes(location, prev)) +
  geom_violin() +
  geom_jitter(alpha = 0.2)

# Scatter plot with a fitted line; method = "lm" requests a linear model fit.
ggplot(malaria_data, aes(total, positive)) +
  geom_point() +
  geom_smooth(method = "lm")
TRUE `geom_smooth()` using formula = 'y ~ x'
#' Expanding the aes() function
#' Additional visual properties, such as color, size, and shape, can be defined
#' from our input data using the aes() function. Here is an example of adding
#' color to a previous plot using the color aesthetic.
# Colour is mapped to a data column, so each location gets its own colour.
ggplot(malaria_data, aes(total, positive, color = location)) +
  geom_point()
#' Note that this is different than defining a color directly within the geom_point,
#' which would only apply a single color to all points.
# Colour is set as a constant on the layer, so every point is "tomato".
ggplot(malaria_data, aes(total, positive)) +
  geom_point(color = "tomato")
#' When using the aes() function, the visual properties will be determined by a
#' variable in the dataset. This allows us to visualize relationships between
#' multiple variables at the same time.
# Prevalence histogram filled by age group with black bar outlines. No bin
# count is given, so ggplot2 falls back to its default and prints a message.
ggplot(malaria_data, aes(prev, fill = ages)) +
  geom_histogram(color = "black")
TRUE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram as before, now with an explicit number of bins.
ggplot(malaria_data, aes(prev, fill = ages)) +
  geom_histogram(color = "black", bins = 12)

# Box plots filled by location, with semi-transparent jittered points on top.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2)

# As above, but the jittered points are also coloured by location.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(aes(color = location), alpha = 0.2)
# Scatter plot of positive against total, coloured by location, with one
# linear fit per location and no confidence band (se = FALSE).
# `alpha` must be set on the point layer: when passed to ggplot() it is
# silently ignored and the points are drawn fully opaque.
ggplot(data = malaria_data, aes(x = total, y = positive, color = location)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE)
TRUE `geom_smooth()` using formula = 'y ~ x'
# Plot the xcoord/ycoord pairs, coloured by location, with semi-transparent
# points.
ggplot(malaria_data, aes(xcoord, ycoord, color = location)) +
  geom_point(alpha = 0.5)
#' In this section, we will be using additional features of ggplot2 to customize and
#' develop high-quality plots that can be used in scientific publications and presentations.
#' There are many different themes that can be used in ggplot2.
#' The “theme” function is used to specify the theme of the plot. There are many
#' preset theme functions, and further custom themes can be created using the
#' generic theme() function.
#' Typically you will want to set the theme at the end of your plot.
# Box plot per location using the preset "classic" theme.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  theme_classic()

# The same plot using the preset black-and-white theme.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  theme_bw()

# Fill by age group instead; theme() is applied after the preset theme to
# move the legend to the bottom.
ggplot(malaria_data, aes(location, prev, fill = ages)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  theme_classic() +
  theme(legend.position = "bottom")
#' Labels can be added to plots using the labs() function.
# Box plot per location, filled by age group, with a title, subtitle, axis
# labels and a legend title supplied through labs().
ggplot(malaria_data, aes(location, prev, fill = ages)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  theme_classic() +
  theme(legend.position = "bottom") +
  labs(
    title = "Malaria prevalence by location and age group",
    subtitle = "Data from 2018 - 2020",
    x = "Location",
    y = "Prevalence",
    fill = "Age group"
  )
#' There are many different color palettes that can be used in ggplot2.
#' The scale_color_*() and scale_fill_*() families of functions are used to
#' specify the colors of the plot. There are many preset color palettes, and
#' further custom color palettes can be created using functions such as
#' scale_color_manual() and scale_fill_manual().
# Discrete fill colours from the ColorBrewer "Set1" palette.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  scale_fill_brewer(palette = "Set1")

# Discrete fill colours given by hand, one hex code per location.
ggplot(malaria_data, aes(location, prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  scale_fill_manual(
    values = c("#C6E0FF", "#136F63", "#E0CA3C", "#F34213", "#3E2F5B")
  )

# Continuous colour: prevalence shown on a blue-to-red gradient.
ggplot(malaria_data, aes(total, positive, color = prev)) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red")

# Continuous colour using the viridis "magma" palette.
ggplot(malaria_data, aes(total, positive, color = prev)) +
  geom_point() +
  scale_color_viridis_c(option = "magma")
#' Facets are a powerful feature of ggplot2 that allow us to create multiple plots
#' based on a single variable. This “small multiple” approach is another effective
#' way to visualize relationships between multiple variables.
# One scatter panel per location, with points coloured by prevalence.
ggplot(malaria_data, aes(total, positive, color = prev)) +
  geom_point() +
  scale_color_viridis_c(option = "magma") +
  facet_wrap(~ location)
# Prevalence by location, drawn as one panel per age group with the axes
# flipped so the locations read down the side.
ggplot(data = malaria_data, aes(x = location, y = prev, fill = location)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  facet_wrap(~ ages) +
  coord_flip() + # flips the x and y axes
  scale_fill_manual(
    values = c("#C6E0FF", "#136F63", "#E0CA3C", "#F34213", "#3E2F5B")
  ) +
  labs(
    title = "Malaria prevalence by location and age group",
    subtitle = "Data from 2018 - 2020",
    x = "Location",
    y = "Prevalence",
    # `fill` is mapped to location (age groups are the facets), so the legend
    # title has to name the location rather than the age group.
    fill = "Location"
  ) +
  theme_classic()
# Prevalence histograms filled by age group, stacked as one row per year.
ggplot(malaria_data, aes(prev, fill = ages)) +
  geom_histogram(bins = 10) +
  scale_fill_viridis_d() +
  facet_grid(year ~ .)
#' ggplot2 plots can be exported to a variety of formats using the ggsave() function.
#' You can specify which plot to export by saving it in an object and then passing the
#' object to the ggsave() function; otherwise ggsave() will save the current/last plot.
#' The width and height of the output image can be set using the width and height
#' arguments, and the resolution of the image using the dpi argument.
#'
#' The file type can be set using the device argument, or by using a specific file extension.
#' I recommend using informative names for the output file.
# Build the faceted prevalence box plot, display it, and export it as a PNG.
prev_by_age_plot <- ggplot(
  data = malaria_data,
  aes(x = location, y = prev, fill = location)
) +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  facet_wrap(~ ages) +
  coord_flip() + # flips the x and y axes
  scale_fill_manual(
    values = c("#C6E0FF", "#136F63", "#E0CA3C", "#F34213", "#3E2F5B")
  ) +
  labs(
    title = "Malaria prevalence by location and age group",
    subtitle = "Data from 2018 - 2020",
    x = "Location",
    y = "Prevalence",
    # `fill` is mapped to location (age groups are the facets), so the legend
    # title has to name the location rather than the age group.
    fill = "Location"
  ) +
  theme_classic()

# Show the plot, as the original top-level expression did.
prev_by_age_plot

# Pass the plot explicitly so the export does not depend on whichever plot
# happened to be drawn last.
ggsave(
  "malaria-prevalence-age-boxplot.png",
  plot = prev_by_age_plot,
  width = 10,
  height = 6,
  dpi = 300
)
#' CHALLENGE 1: Create a figure showing how the Anopheles gambiae total counts
#' vary each day and by location.