library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Read the comma-delimited planet data file into a tibble.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs where that exact file exists.
nasa_data <- read_delim(
  file = "C:/Users/imaya/Downloads/cleaned_5250.csv",
  delim = ","
)
## Rows: 5250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, planet_type, mass_wrt, radius_wrt, detection_method
## dbl (8): distance, stellar_magnitude, discovery_year, mass_multiplier, radiu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to check the columns and their types.
nasa_data %>%
  head(n = 6)
## # A tibble: 6 × 13
## name distance stellar_magnitude planet_type discovery_year mass_multiplier
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 11 Coma… 304 4.72 Gas Giant 2007 19.4
## 2 11 Ursa… 409 5.01 Gas Giant 2009 14.7
## 3 14 Andr… 246 5.23 Gas Giant 2008 4.8
## 4 14 Herc… 58 6.62 Gas Giant 2002 8.14
## 5 16 Cygn… 69 6.22 Gas Giant 1996 1.78
## 6 17 Scor… 408 5.23 Gas Giant 2020 4.32
## # ℹ 7 more variables: mass_wrt <chr>, radius_multiplier <dbl>,
## # radius_wrt <chr>, orbital_radius <dbl>, orbital_period <dbl>,
## # eccentricity <dbl>, detection_method <chr>
# Tally the number of planets recorded for each detection method; the tally
# column is named `count` and the result is an ungrouped tibble.
df_t <- count(nasa_data, detection_method, name = "count")
print(df_t)
## # A tibble: 11 × 2
## detection_method count
## <chr> <int>
## 1 Astrometry 2
## 2 Direct Imaging 62
## 3 Disk Kinematics 1
## 4 Eclipse Timing Variations 17
## 5 Gravitational Microlensing 154
## 6 Orbital Brightness Modulation 9
## 7 Pulsar Timing 7
## 8 Pulsation Timing Variations 2
## 9 Radial Velocity 1027
## 10 Transit 3945
## 11 Transit Timing Variations 24
# Per detection method: number of planets, mean stellar magnitude (missing
# values ignored), share of all rows in nasa_data, and a Rare/Common label.
df_m <- nasa_data %>%
  group_by(detection_method) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    # Methods with fewer than 50 planets are labelled "Rare".
    rare_tag = if_else(count < 50, "Rare", "Common")
  )

# Horizontal bar chart of the mean stellar magnitude, filled by rarity label.
df_m %>%
  ggplot(aes(detection_method, avg_stell, fill = rare_tag)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Average Stellar Magnitude by Detection Method",
    x = "Detection Method",
    y = "Average Stellar Magnitude",
    fill = "Rarity"
  )

df_m
## # A tibble: 11 × 5
## detection_method count avg_stell probability rare_tag
## <chr> <int> <dbl> <dbl> <chr>
## 1 Astrometry 2 10.0 0.000381 Rare
## 2 Direct Imaging 62 9.98 0.0118 Common
## 3 Disk Kinematics 1 8.44 0.000190 Rare
## 4 Eclipse Timing Variations 17 14.9 0.00324 Rare
## 5 Gravitational Microlensing 154 24.1 0.0293 Common
## 6 Orbital Brightness Modulation 9 14.5 0.00171 Rare
## 7 Pulsar Timing 7 20.9 0.00133 Rare
## 8 Pulsation Timing Variations 2 14.0 0.000381 Rare
## 9 Radial Velocity 1027 8.09 0.196 Common
## 10 Transit 3945 13.9 0.751 Common
## 11 Transit Timing Variations 24 13.4 0.00457 Rare
The Transit method accounts for the majority of planet discoveries (about 75%), and its host stars average a fairly high stellar magnitude of 13.9 (a higher magnitude means a dimmer star). If a planet is randomly selected from this data set, it is most likely to have been discovered using the Transit method and to orbit a relatively dim star.
A testable hypothesis is that detection methods differ in effectiveness depending on observable star and planet characteristics. Specifically, certain methods may be better suited to detecting planets around dimmer or more distant stars.
# Assign each planet to the 5-year bin containing its discovery year by
# rounding the year down to a multiple of 5 (e.g. 2007 -> 2005).
nasa_data <- nasa_data %>%
  mutate(bin_5yr = 5 * floor(discovery_year / 5))

# Per bin: how many planets have a recorded stellar magnitude, the mean of
# those magnitudes, and a Rare/Common label (fewer than 500 => "Rare").
df_bin5 <- nasa_data %>%
  group_by(bin_5yr) %>%
  summarise(
    count = sum(!is.na(stellar_magnitude)),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(rare_tag = if_else(count < 500, "Rare", "Common"))

# Keep only bins with a usable, positive mean. is.na() is also TRUE for NaN,
# which is what mean() returns for a bin with no recorded magnitudes.
df_bin5_clean <- df_bin5 %>%
  filter(!is.na(avg_stell) & avg_stell > 0)

# Bar chart of mean magnitude per bin, with each bar annotated by its count.
df_bin5_clean %>%
  ggplot(aes(factor(bin_5yr), avg_stell, fill = rare_tag)) +
  geom_col() +
  geom_text(aes(label = count), vjust = -0.5) +
  theme_minimal() +
  labs(
    title = "Average Stellar Magnitude by 5-Year Intervals",
    x = "5-Year Intervals",
    y = "Average Stellar Magnitude",
    fill = "Rarity"
  )

df_bin5_clean
## # A tibble: 6 × 4
## bin_5yr count avg_stell rare_tag
## <dbl> <int> <dbl> <chr>
## 1 1995 27 6.26 Rare
## 2 2000 104 7.84 Rare
## 3 2005 267 8.83 Rare
## 4 2010 1350 13.2 Common
## 5 2015 2300 13.4 Common
## 6 2020 1041 12.2 Common
The earliest five-year intervals contain far fewer planets, meaning the probability of randomly selecting a planet from those periods is much lower than selecting one from more recent bins. Average stellar magnitude rises from about 6 in the 1995 bin to about 13 in the 2010 and 2015 bins, which suggests that more recent discoveries involve dimmer stars. A testable hypothesis is that this pattern is due to improved detection technology or to the increasing average distance of discovered stars, either of which may affect stellar magnitude values.
# Per planet type: number of planets, mean stellar magnitude (missing values
# ignored), share of all rows in nasa_data, and a Rare/Common label
# (fewer than 200 planets => "Rare").
df_t <- nasa_data %>%
  group_by(planet_type) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 200, "Rare", "Common")
  )

# Horizontal boxplots of stellar magnitude per planet type on a log10 axis.
# Only positive, non-missing magnitudes are kept, since the log scale cannot
# show zero or negative values; the join attaches each type's rarity label.
nasa_data %>%
  filter(!is.na(stellar_magnitude) & stellar_magnitude > 0) %>%
  left_join(df_t, by = "planet_type") %>%
  ggplot(aes(planet_type, stellar_magnitude, fill = rare_tag)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_log10() +
  coord_flip() +
  labs(
    title = "Distribution of Stellar Magnitude by Planet Type",
    x = "Planet Type",
    y = "Stellar Magnitude (log scale)",
    fill = "Rarity"
  )

df_t
## # A tibble: 5 × 5
## planet_type count avg_stell probability rare_tag
## <chr> <int> <dbl> <dbl> <chr>
## 1 Gas Giant 1630 10.4 0.310 Common
## 2 Neptune-like 1825 13.5 0.348 Common
## 3 Super Earth 1595 13.9 0.304 Common
## 4 Terrestrial 195 13.4 0.0371 Rare
## 5 Unknown 5 14.3 0.000952 Rare
The lowest-probability groups were tagged as ‘Rare’. Stellar magnitude measures how bright a star appears, with lower numbers indicating brighter stars (for example, the Sun has a stellar magnitude of -26.7). The average stellar magnitude for each planet type in this dataset falls between roughly 10 and 14, so on average none of the host stars are very bright. Gas Giants are among the most commonly discovered planets (about 31%, second only to Neptune-like planets) and tend to orbit relatively brighter stars, while Terrestrial planets and Unknown types are less commonly found and tend to orbit dimmer stars. A testable hypothesis is that planets orbiting dimmer stars, such as the stars hosting Terrestrial planets, are less likely to be discovered because dim host stars make them harder to detect with the current methods.
# Count planets for every observed (planet_type, mass_wrt) pair. Rows with a
# missing value in either column are excluded, so `probability` is each
# pair's share of the planets that have both fields recorded.
# (This is computed once; an earlier duplicate summary that was overwritten
# before being used has been removed.)
df_combo <- nasa_data %>%
  filter(!is.na(planet_type), !is.na(mass_wrt)) %>%
  count(planet_type, mass_wrt) %>%
  mutate(probability = n / sum(n))

# Full grid of possible pairs, built from the non-missing category values.
# NA is deliberately left out: NA rows are filtered out of df_combo above, so
# keeping NA in the grid would report every "<type> / NA" pair as missing even
# when such rows exist in the data. stringsAsFactors = FALSE keeps the key
# columns as character, matching the key types in df_combo.
all_combos <- expand.grid(
  planet_type = unique(nasa_data$planet_type[!is.na(nasa_data$planet_type)]),
  mass_wrt = unique(nasa_data$mass_wrt[!is.na(nasa_data$mass_wrt)]),
  stringsAsFactors = FALSE
)

# Pairs that are possible in principle but never observed in the data.
missing_combos <- anti_join(
  all_combos,
  df_combo,
  by = c("planet_type", "mass_wrt")
)

# Heat map of pair counts; the fill uses a log scale because the counts span
# several orders of magnitude.
ggplot(df_combo, aes(x = mass_wrt, y = planet_type, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(trans = "log") +
  labs(
    title = "Planet Type by Mass Category",
    x = "Mass Category (relative to Earth)",
    y = "Planet Type",
    fill = "Count"
  )
print(df_combo)
## # A tibble: 8 × 4
## planet_type mass_wrt n probability
## <chr> <chr> <int> <dbl>
## 1 Gas Giant Earth 9 0.00172
## 2 Gas Giant Jupiter 1603 0.307
## 3 Neptune-like Earth 1807 0.346
## 4 Neptune-like Jupiter 18 0.00344
## 5 Super Earth Earth 1576 0.302
## 6 Super Earth Jupiter 19 0.00363
## 7 Terrestrial Earth 194 0.0371
## 8 Terrestrial Jupiter 1 0.000191
print(missing_combos)
## planet_type mass_wrt
## 1 Unknown Jupiter
## 2 Unknown Earth
Comparing planet type by mass category shows that Gas Giants are almost exclusively in the Jupiter-mass category (1603) rather than the Earth-mass category (9), which makes sense because Gas Giants are far more massive than Earth-like planets. The other planet types (Neptune-like, Super Earth, and Terrestrial) fall primarily in the Earth-mass category but each also has a few Jupiter-mass examples. No ‘Unknown’ planet appears with either mass category, which is why those combinations are listed in the missing_combos table. These missing combinations likely reflect the fact that ‘Unknown’ planets have insufficient information about their mass: their mass_wrt value is missing, so they are dropped by the NA filter before the combinations are counted.