# Parallel Coordinates Plot
library(GGally)
## Loading required package: ggplot2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Parallel coordinates plot of the four iris measurements (columns 1-4),
# one line per flower, coloured by the species column (column 5).
data(iris)
ggparcoord(
  iris,
  columns = 1:4,
  groupColumn = 5,
  scale = "std",
  alphaLines = 0.6
) +
  # Extra line layer drawn on top of the ggparcoord() output. Line thickness
  # is set with `linewidth`; using `size` for lines has been deprecated since
  # ggplot2 3.4.0 and emits a lifecycle warning.
  geom_line(linewidth = 1) +
  theme_minimal() +
  labs(
    title = "Parallel Coordinates Plot of Iris Measurements",
    x = "Flower Measurements",
    y = "Standardized Value"
  ) +
  scale_color_brewer(palette = "Set1")
#This is appropriate for the given data because we have several numerical variables (petal length/width and sepal length/width) that need to be plotted at once, along with different species that must be distinguished. The parallel coordinates plot draws each flower as one line across all four measurements, colored by species, which makes the variables easy to compare.
#Map
library(ggplot2)
library(dplyr)
# Install maps only when it is missing, so re-running the script does not
# reinstall the package (and need network access) on every execution.
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
library(maps)
# Choropleth of the USArrests Murder column over the US state polygons.
data("USArrests")

# USArrests keeps the state names as row names; copy them into a lower-case
# `state` column so they can be joined to the map's `region` column.
crime_data <- USArrests %>%
  mutate(state = tolower(rownames(USArrests)))

# US state outlines
us_map <- map_data("state")

# Attach the crime data to the map polygons. This is a left join, so every map
# row is kept and regions without a matching USArrests row get NA.
map_data_crime <- us_map %>%
  left_join(crime_data, by = c("region" = "state"))

# Plot with customization
ggplot(map_data_crime, aes(long, lat, group = group, fill = Murder)) +
  # Border thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0.
  geom_polygon(color = "white", linewidth = 0.2) +
  coord_fixed(1.3) +
  scale_fill_gradient(low = "lightyellow", high = "darkred") +
  theme_void() +
  labs(
    title = "US Murder Rates by State",
    fill = "Murder Rate"
  )
#This plot is appropriate because the data includes a geographic element and we are also looking at a numerical variable (murder). The color gradient makes the plot very easy to interpret: readers can see at a glance the areas where crimes such as murder are more common and the areas where they are less common.
#Network or Flow Diagram
library(ggplot2)
# Install ggalluvial only when it is missing, so re-running the script does
# not reinstall the package (and need network access) on every execution.
if (!requireNamespace("ggalluvial", quietly = TRUE)) {
  install.packages("ggalluvial")
}
library(ggalluvial)
library(dplyr)
# Convert the Titanic table into a data frame; the plot below uses its
# Class, Sex, Age, Survived and Freq columns.
titanic_df <- as.data.frame(Titanic)

# Alluvial (Sankey-style) diagram: four axes in the order
# Class -> Sex -> Age -> Survived, band height given by Freq and
# band colour by the Survived outcome.
ggplot(
  data = titanic_df,
  mapping = aes(
    y = Freq,
    axis1 = Class,
    axis2 = Sex,
    axis3 = Age,
    axis4 = Survived
  )
) +
  geom_alluvium(mapping = aes(fill = Survived), alpha = 0.7) +
  geom_stratum(width = 1 / 10, color = "black") +
  # Label every stratum box with its own category name.
  geom_text(
    mapping = aes(label = after_stat(stratum)),
    stat = "stratum",
    size = 3
  ) +
  scale_x_discrete(
    expand = c(0.1, 0.1),
    limits = c("Class", "Sex", "Age", "Survived")
  ) +
  labs(
    title = "Titanic Passenger Flow: Class → Sex → Age → Survival",
    y = "Number of Passengers",
    fill = "Survived"
  ) +
  theme_minimal()
#This type of plot is appropriate because with the Titanic dataset it is useful to see connections between variables, not just numbers. The Sankey plot makes this possible because you can follow who survived and who didn't, and see how survival is affected by identity characteristics like age, gender, and class through the flows between these variables.
#Distribution Visualization
library(ggplot2)
# Install ggforce only when it is missing, so re-running the script does not
# reinstall the package (and need network access) on every execution.
if (!requireNamespace("ggforce", quietly = TRUE)) {
  install.packages("ggforce")
}
library(ggforce)
library(dplyr)
# Sina plot: every iris observation drawn as a point, petal length on the
# y axis, one column (and colour) per species.
ggplot(
  data = iris,
  mapping = aes(x = Species, y = Petal.Length, color = Species)
) +
  geom_sina(alpha = 0.7, size = 1.5) +
  labs(
    title = "Sina Plot of Petal Length Across Iris Species",
    x = "Species",
    y = "Petal Length (cm)"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so the legend setting is not overwritten.
  theme(legend.position = "right")
#This is appropriate for the given data because this allows the raw data to be seen, giving valuable insights on distribution and spread. We can easily compare a categorical variable to a numeric one. This also allows for easy comparison between the different species to see any potential overlap with the raw data points.
#Research Theme: What factors influenced survival of passengers on the Titanic?
#Plot 1: Bar Plot
#Comparison: How does gender affect survival on the Titanic?
library(ggplot2)
library(dplyr)
# Data frame form of the Titanic table (one row per category combination,
# with its count in Freq).
titanic_df <- as.data.frame(Titanic)

# Total count per Sex x Survived cell plus an error-bar half-width.
# Inside summarise(), `se` is computed from the freshly summed Freq, so it
# equals sqrt(total count) for each cell.
sex_survival <- titanic_df %>%
  group_by(Sex, Survived) %>%
  summarise(
    Freq = sum(Freq),
    se = sqrt(Freq),
    .groups = "drop"
  )

# Side-by-side bars per gender, split by survival outcome, with +/- se bars.
# Bars and error bars share the same dodge width so they stay aligned.
ggplot(
  data = sex_survival,
  mapping = aes(x = Sex, y = Freq, fill = Survived)
) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_errorbar(
    mapping = aes(ymin = Freq - se, ymax = Freq + se),
    width = 0.2,
    position = position_dodge(width = 0.9)
  ) +
  theme_minimal() +
  labs(
    title = "Titanic Survival by Gender with Error Bars",
    x = "Gender",
    y = "Number of Passengers"
  )
#Interpretation: Males were far more likely to die than to survive (1,364 died vs. 367 survived). Females were considerably more likely to survive than to die (344 survived vs. 126 died).
#This type of plot is appropriate because the comparison that is wanted is categorical variables (sex & survival) and a numeric variable (number of passengers). Here we were just looking at pure numbers instead of rates, so a traditional bar plot made sense.
#Plot 2: Stacked Barplot
# Comparison: How did class affect survival on the Titanic?
library(ggplot2)
# Stacked bars: one bar per passenger class, height = number of people,
# segments coloured by survival outcome (geom_col() stacks by default).
ggplot(
  data = as.data.frame(Titanic),
  mapping = aes(x = Class, y = Freq, fill = Survived)
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Survival by Passenger Class",
    x = "Passenger Class",
    y = "Number of Passengers"
  )
#Interpretation: Crew members and 3rd class passengers had the most deaths, while second class and first class had far fewer deaths (and far fewer people in general). First class is the only group where people were more likely to survive than die, and its survivor count (203) is nearly as high as the crew's (212), the largest by pure numbers.
# This is appropriate because this compares number of passengers (numeric) with survival and class. It allows for easy comparison between classes and survival which helps to draw insights on the comparison between class and survival.
#Plot 3: Cleveland Plot
#Comparison: Which passengers, based on their class and sex, had the greatest number of survivors?
library(ggplot2)
# Data frame form of the Titanic table: one row per
# Class x Sex x Age x Survived combination, with its count in Freq.
titanic_df <- as.data.frame(Titanic)

# Sum the counts over Age so each Class/Sex group has exactly one total per
# survival outcome. Plotting titanic_df directly would draw separate Child
# and Adult dots on the same row, not the group totals the title describes.
class_sex_counts <- aggregate(
  Freq ~ Class + Sex + Survived,
  data = titanic_df,
  FUN = sum
)

# Cleveland dot plot: one row per Class.Sex group, one dot per outcome.
ggplot(class_sex_counts,
       aes(x = Freq,
           y = interaction(Class, Sex),
           color = Survived)) +
  geom_point(size = 4) +
  labs(
    title = "Titanic Survival Counts by Class and Gender",
    x = "Number of Passengers",
    y = "Passenger Group"
  ) +
  theme_minimal()
#Interpretation: Crew members that were male had the greatest number of deaths by far, followed by third class males. Male crew members also had the greatest number of survivors followed by 1st class females.
#This is appropriate because Cleveland Plots allow for easy comparison between categorical variables and numerical variables without the clutter that other graphs have. The simple dots allow for easy and direct comparisons between specified groups.
#Plot 4: Mosaic Plot
#Comparison: How does sex affect survival on the Titanic?
# Install vcd only when it is missing, so re-running the script does not
# reinstall the package (and need network access) on every execution.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
library(vcd)
## Loading required package: grid
# Mosaic plot of Sex against Survived, with shading and a legend switched on.
# titanic_df is the data frame built with as.data.frame(Titanic) earlier in
# the script, so it carries the cell counts in a Freq column.
# NOTE(review): presumably vcd::mosaic() weights the cells by Freq when given
# a data frame like this - confirm against the vcd documentation.
mosaic(
~ Sex + Survived,
data = titanic_df,
shade = TRUE,
legend = TRUE,
main = "Titanic Survival by Sex"
)
#Interpretation: There were a lot more males than females on the Titanic, and even so, females survived at a far higher rate than males. The largest area is males who died, making this a very common outcome. The smallest area is females who died, making this an unlikely outcome.
#This is appropriate because it is a very simple and streamlined way of understanding how sex and survival relate to each other. Both of these are categorical, which makes plotting a little harder, but a mosaic plot is a good way of showing association between two categorical variables. This shows association rather than just counts or pure numbers.
#Plot 5: Proportional Bar Chart
#Comparison: How did proportions of survival change based on passenger class?
library(ggplot2)
# Data frame form of the Titanic table, with counts in the Freq column.
titanic_df <- as.data.frame(Titanic)

# 100% stacked bars: position = "fill" rescales each class's bar to the same
# height, so the segments show the survived / not-survived shares within a
# class; the y axis is labelled as percentages.
ggplot(
  data = titanic_df,
  mapping = aes(x = Class, y = Freq, fill = Survived)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  labs(
    title = "Proportion of Survivors by Passenger Class",
    x = "Passenger Class",
    y = "Percentage of Passengers"
  )
#Interpretation: First class passengers had well over a 50% survival rate, while crew members only had about a 25% survival rate. Survival rate generally goes down across classes, with second class at a little over 40% and third class and crew members about equal at roughly 25%.
#This is appropriate for what is being shown because this type of plot displays proportions rather than just raw numbers. It does a better job of showing the likelihood of survival based on class, rather than just the total number of survivors, allowing us to interpret how class affects survival in a more direct way.