Import Data
# Download the TidyTuesday (2024-04-23) outer-space-objects CSV from GitHub
# and parse it into a tibble.
outer_space_objects <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-04-23/outer_space_objects.csv"
)
## Rows: 1175 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, num_objects
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Tidy data
# Fix the RNG state.
# NOTE(review): no step in this chunk appears to draw random numbers -- confirm
# whether the seed is still needed.
set.seed(2)

# For every entity keep the row(s) where its launch count peaks, then keep the
# ten largest peaks overall.
data_top10_launchers <- outer_space_objects %>%
  group_by(Entity) %>%
  # Ties are kept: an entity whose maximum occurs in several years contributes
  # one row per such year.
  filter(num_objects == max(num_objects)) %>%
  ungroup() %>%
  # Only the entity name, the year and the launch count are needed downstream.
  select(Entity, Year, num_objects) %>%
  arrange(desc(num_objects)) %>%
  slice_head(n = 10)

print(data_top10_launchers)
## # A tibble: 10 × 3
## Entity Year num_objects
## <chr> <dbl> <dbl>
## 1 World 2023 2664
## 2 United States 2023 2166
## 3 United Kingdom 2021 289
## 4 China 2022 182
## 5 Russia 1981 124
## 6 Belgium 2017 28
## 7 Japan 2014 24
## 8 Japan 2021 24
## 9 France 2011 19
## 10 Spain 2022 19
Introduction
Questions
Variation
Visualizing distributions
Bar Plot
# Bar chart of the peak launch counts, tallest bar first; the fill colour
# encodes the year in which each peak occurred.
ggplot(
  data_top10_launchers,
  aes(x = reorder(Entity, -num_objects), y = num_objects, fill = factor(Year))
) +
  # geom_col() draws bar heights straight from the y values; "dodge" places
  # bars that share an x position side by side instead of stacking them.
  geom_col(position = "dodge") +
  labs(
    title = "Top Space Object Launches Globally and by Country",
    x = "Entity",
    y = "Number of Objects Launched"
  ) +
  # One y-axis break every 250 objects, up to the largest count in the data.
  scale_y_continuous(
    breaks = seq(from = 0, to = max(data_top10_launchers$num_objects), by = 250)
  ) +
  # Slant the entity labels so they do not overlap, and hide the x tick marks.
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.ticks.x = element_blank()
  )

Unusual values
Scatter plot showing that the United States accounts for almost all of
the launches that contribute to the total World count, which is a
significant outlier in the dataset.
# Scatter plot of peak launch count against the year of the peak, one colour
# per entity.
ggplot(
  data_top10_launchers,
  aes(x = Year, y = num_objects, color = Entity)
) +
  geom_point() +
  labs(
    title = "Space Object Launches Over the Years",
    x = "Year",
    y = "Number of Objects Launched"
  ) +
  # One y-axis break every 250 objects, up to the largest count in the data.
  scale_y_continuous(
    breaks = seq(from = 0, to = max(data_top10_launchers$num_objects), by = 250)
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Missing Values
I set a limit on the y-axis values of the previous plot to remove
outliers: launch counts under 5 or over 2000 are replaced with missing
values (NA), so those points are dropped from the plot. This removed the
outlier values for the United States (and for the World total).
library(tidyverse)
# Same scatter plot as before, but launch counts outside the 5-2000 range are
# blanked out first; geom_point() then drops those rows with a warning.
data_top10_launchers %>%
  mutate(
    # Keep counts within [5, 2000]; everything else becomes NA.
    num_objects = ifelse(num_objects >= 5 & num_objects <= 2000, num_objects, NA)
  ) %>%
  ggplot(aes(x = Year, y = num_objects, color = Entity)) +
  geom_point() +
  labs(
    title = "Space Object Launches with Outliers Removed",
    x = "Year",
    y = "Number of Objects Launched"
  ) +
  # Breaks every 25 objects. The upper bound is taken from the original,
  # unmasked data_top10_launchers, not from the masked values plotted here.
  scale_y_continuous(
    breaks = seq(
      from = 0,
      to = max(data_top10_launchers$num_objects, na.rm = TRUE),
      by = 25
    )
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
library(tidyverse)
# Column chart of launch counts per entity (categorical x, continuous y),
# ordered from largest to smallest, one fill colour per entity.
ggplot(
  data_top10_launchers,
  aes(x = reorder(Entity, -num_objects), y = num_objects, fill = Entity)
) +
  geom_col() +
  labs(
    title = "Number of Space Objects Launched by Country",
    x = "Entity",
    y = "Number of Objects Launched"
  ) +
  # One y-axis break every 250 objects, up to the largest count in the data.
  scale_y_continuous(
    breaks = seq(from = 0, to = max(data_top10_launchers$num_objects), by = 250)
  ) +
  # theme_minimal() must come before theme() so the tweaks below are not reset.
  theme_minimal() +
  # The x axis already names every entity, so the fill legend is hidden.
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

Two categorical variables
This isn’t applicable for this dataset: my tidied set has only one
categorical variable. The full set had two, but comparing them wouldn’t
give any insight because they are just the country names in the ‘Entity’
column and their abbreviations in the ‘Code’ column.
Two continuous variables
library(hexbin)
library(tidyverse)
library(hexbin)
# Hexagonal-bin plot of the two continuous variables: year on the x axis,
# launch count on the y axis.
ggplot(data_top10_launchers, aes(x = Year, y = num_objects)) +
  geom_hex() +
  labs(
    title = "Hexbin Plot of Year vs. Number of Objects Launched",
    x = "Year",
    y = "Number of Objects Launched"
  ) +
  # One y-axis break every 250 objects, up to the largest count in the data.
  scale_y_continuous(
    breaks = seq(from = 0, to = max(data_top10_launchers$num_objects), by = 250)
  ) +
  theme_minimal()

Patterns and models
This is a scatterplot with the points color-coded by entity. The red
line shows the modelled trend in object launches over the years (a
second-degree polynomial fit); as can be seen, there is an overall
positive, nonlinear relationship in the model shown.
# Fit a second-degree polynomial regression of launch count on year.
mod_poly <- lm(num_objects ~ poly(Year, 2), data = data_top10_launchers)

# Attach the model residuals to a copy of the data.
# NOTE(review): the plot below reads data_top10_launchers, not this object --
# confirm whether data_top10_with_resid is used elsewhere.
data_top10_with_resid <- modelr::add_residuals(data_top10_launchers, mod_poly)

# Scatter plot coloured by entity, overlaid with a red degree-2 polynomial
# smooth drawn without a confidence band.
ggplot(
  data_top10_launchers,
  aes(x = Year, y = num_objects, color = Entity)
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    se = FALSE,
    color = "red"
  ) +
  labs(
    title = "Number of Space Objects Launched Over the Years",
    x = "Year",
    y = "Number of Objects Launched"
  ) +
  # One y-axis break every 250 objects, up to the largest count in the data.
  scale_y_continuous(
    breaks = seq(from = 0, to = max(data_top10_launchers$num_objects), by = 250)
  ) +
  # theme_minimal() must come before theme() so the label rotation is kept.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
