library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the cleaned exoplanet catalogue from a comma-delimited file.
# NOTE(review): absolute, machine-specific path; presumably this only resolves
# on the original author's machine. Confirm before re-running elsewhere.
nasa_data <- read_delim(
  file = "C:/Users/imaya/Downloads/cleaned_5250.csv",
  delim = ","
)
## Rows: 5250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, planet_type, mass_wrt, radius_wrt, detection_method
## dbl (8): distance, stellar_magnitude, discovery_year, mass_multiplier, radiu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
head(nasa_data, n = 6L)
## # A tibble: 6 × 13
## name distance stellar_magnitude planet_type discovery_year mass_multiplier
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 11 Coma… 304 4.72 Gas Giant 2007 19.4
## 2 11 Ursa… 409 5.01 Gas Giant 2009 14.7
## 3 14 Andr… 246 5.23 Gas Giant 2008 4.8
## 4 14 Herc… 58 6.62 Gas Giant 2002 8.14
## 5 16 Cygn… 69 6.22 Gas Giant 1996 1.78
## 6 17 Scor… 408 5.23 Gas Giant 2020 4.32
## # ℹ 7 more variables: mass_wrt <chr>, radius_multiplier <dbl>,
## # radius_wrt <chr>, orbital_radius <dbl>, orbital_period <dbl>,
## # eccentricity <dbl>, detection_method <chr>
# Summary per detection method: number of planets, mean mass multiplier,
# share of all discoveries, and a Rare/Common tag (Rare = fewer than 100
# planets). Rows are sorted from the least-used to the most-used method.
df_m <- nasa_data %>%
  group_by(detection_method) %>%
  summarise(
    count = n(),
    avg_mass = mean(mass_multiplier, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    # prop.table() divides each count by the total of all counts.
    probability = prop.table(count),
    rare_tag = if_else(count < 100, "Rare", "Common")
  ) %>%
  arrange(count)
# Horizontal bar chart of discoveries per detection method, filled by rarity.
# reorder() sorts the bars by count; a plain character axis is drawn in
# alphabetical order and would ignore the arrange(count) applied to df_m.
# NOTE(review): on a log10 axis bars start at 1, so a method with a single
# discovery likely draws as a zero-length bar. Confirm in the rendered plot.
vis_m <- ggplot(
  df_m,
  aes(x = reorder(detection_method, count), y = count, fill = rare_tag)
) +
  geom_col() +
  coord_flip() +
  scale_y_log10() +
  labs(
    title = "Planet Discoveries by Detection Method",
    x = "Detection Method",
    y = "Number of Planets",
    fill = "Rarity"
  )

# Print the summary table as its own statement. It used to be chained onto the
# plot with a trailing `+`, which only worked because ggplot2 accepts a data
# frame on the right-hand side of `+`.
print(df_m)
## # A tibble: 11 × 5
## detection_method count avg_mass probability rare_tag
## <chr> <int> <dbl> <dbl> <chr>
## 1 Disk Kinematics 1 2.5 0.000190 Rare
## 2 Astrometry 2 15.4 0.000381 Rare
## 3 Pulsation Timing Variations 2 7.5 0.000381 Rare
## 4 Pulsar Timing 7 2.10 0.00133 Rare
## 5 Orbital Brightness Modulation 9 1.28 0.00171 Rare
## 6 Eclipse Timing Variations 17 6.78 0.00324 Rare
## 7 Transit Timing Variations 24 7.81 0.00457 Rare
## 8 Direct Imaging 62 24.9 0.0118 Rare
## 9 Gravitational Microlensing 154 6.11 0.0293 Common
## 10 Radial Velocity 1027 6.45 0.196 Common
## 11 Transit 3945 6.16 0.751 Common
# Render the detection-method bar chart built above.
vis_m
Detection methods with fewer than 100 discoveries were tagged as ‘Rare’. The rarest of these, Disk Kinematics (0.019%), Astrometry (0.038%), and Pulsation Timing Variations (0.038%), each had fewer than ten discoveries, so a randomly chosen planet in this dataset has an extremely low probability of having been found by one of them. The Common methods (Radial Velocity, Gravitational Microlensing, and Transit) accounted for most of the planet discoveries, roughly 98% combined. A testable hypothesis is that detection methods vary in effectiveness depending on planet size and distance. For example, small or dim planets are less likely to be detected unless the method is specifically suited to their properties.
# Summary per planet type: planet count, mean stellar magnitude, share of the
# full catalogue (all rows of nasa_data), and a Rare/Common tag
# (Rare = fewer than 200 planets).
df_t <- nasa_data %>%
  group_by(planet_type) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 200, "Rare", "Common")
  )
# Box plots of stellar magnitude for each planet type, filled by the rarity
# tag taken from df_t.
nasa_data %>%
  # `> 0` also drops NA magnitudes, because filter() discards rows whose
  # condition evaluates to NA; only positive values fit on a log axis.
  filter(stellar_magnitude > 0) %>%
  # Only rare_tag is needed from the summary table.
  left_join(select(df_t, planet_type, rare_tag), by = "planet_type") %>%
  ggplot(
    mapping = aes(x = planet_type, y = stellar_magnitude, fill = rare_tag)
  ) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_log10() +
  coord_flip() +
  labs(
    title = "Distribution of Stellar Magnitude by Planet Type",
    x = "Planet Type",
    y = "Stellar Magnitude (log scale)",
    fill = "Rarity"
  )

# Show the per-type summary table alongside the plot.
df_t
## # A tibble: 5 × 5
## planet_type count avg_stell probability rare_tag
## <chr> <int> <dbl> <dbl> <chr>
## 1 Gas Giant 1630 10.4 0.310 Common
## 2 Neptune-like 1825 13.5 0.348 Common
## 3 Super Earth 1595 13.9 0.304 Common
## 4 Terrestrial 195 13.4 0.0371 Rare
## 5 Unknown 5 14.3 0.000952 Rare
Planet types with fewer than 200 planets were tagged as ‘Rare’. Stellar magnitude measures how bright a star appears, with lower numbers indicating brighter stars (for example, the Sun has a stellar magnitude of -26.7). The average host-star magnitude for each planet type in this dataset falls between roughly 10 and 14, so none of these host stars are very bright. Gas Giants are among the most commonly discovered planets (second only to Neptune-like planets) and tend to orbit relatively brighter stars, while Terrestrial planets and Unknown types are less commonly found and tend to orbit dimmer stars. A testable hypothesis is that planets orbiting dimmer stars, such as the stars that host Terrestrial planets, are less likely to be discovered, because the host star's faintness may make them harder to detect with the current methods.
# Count planets for every observed (planet_type, mass_wrt) pair, skipping rows
# where either field is missing. probability is each pair's share of the
# counted rows. This table used to be built twice, first with
# n() / nrow(nasa_data) and a rare_tag column, and that first result was
# overwritten; only this final form is kept.
df_combo <- nasa_data %>%
  filter(!is.na(planet_type), !is.na(mass_wrt)) %>%
  count(planet_type, mass_wrt) %>%
  mutate(probability = n / sum(n))

# Every pairing of the planet types and mass categories present in the data.
# stringsAsFactors = FALSE keeps the keys as character, matching df_combo,
# so the join below does not rely on factor-to-character coercion.
# NOTE(review): unique() keeps NA as a value, so any NA in either column
# produces NA pairings that always show up as "missing"; drop NA here if
# that is unintended.
all_combos <- expand.grid(
  planet_type = unique(nasa_data$planet_type),
  mass_wrt = unique(nasa_data$mass_wrt),
  stringsAsFactors = FALSE
)

# Pairings that never occur among the counted rows.
missing_combos <- anti_join(
  all_combos,
  df_combo,
  by = c("planet_type", "mass_wrt")
)
# Heatmap of planet counts for each (mass category, planet type) pair.
# Pairs absent from df_combo have no tile.
ggplot(df_combo, aes(x = mass_wrt, y = planet_type, fill = n)) +
  geom_tile(color = "white") +
  # Log-transformed fill so small counts stay distinguishable next to counts
  # in the thousands. `transform` replaces the `trans` argument deprecated in
  # ggplot2 3.5.0.
  scale_fill_viridis_c(transform = "log") +
  labs(
    title = "Planet Type by Mass Category",
    x = "Mass Category (relative to Earth)",
    y = "Planet Type",
    fill = "Count"
  )

# Show the counts and shares behind the heatmap.
print(df_combo)
## # A tibble: 8 × 4
## planet_type mass_wrt n probability
## <chr> <chr> <int> <dbl>
## 1 Gas Giant Earth 9 0.00172
## 2 Gas Giant Jupiter 1603 0.307
## 3 Neptune-like Earth 1807 0.346
## 4 Neptune-like Jupiter 18 0.00344
## 5 Super Earth Earth 1576 0.302
## 6 Super Earth Jupiter 19 0.00363
## 7 Terrestrial Earth 194 0.0371
## 8 Terrestrial Jupiter 1 0.000191
# List the planet-type / mass-category pairings that never occur in df_combo.
print(missing_combos)
## planet_type mass_wrt
## 1 Unknown Jupiter
## 2 Unknown Earth
## 3 Gas Giant <NA>
## 4 Super Earth <NA>
## 5 Neptune-like <NA>
## 6 Terrestrial <NA>
## 7 Unknown <NA>
Comparing planet type by mass category shows that Gas Giants are almost exclusively in the Jupiter-weight category (1603) rather than the Earth-weight category (9), which makes sense because Gas Giants are much larger and more massive than Earth-like planets. The other planet types (Neptune-like, Super Earth, and Terrestrial) are primarily in the Earth-weight category but also have a few Jupiter-weight examples. The combination of ‘Unknown’ planet types with either mass category does not exist in the data, which is why these rows appear in the missing_combos table. These missing combinations likely reflect the fact that ‘Unknown’ planets have insufficient information about their mass. The remaining rows of missing_combos pair each planet type with a missing (NA) mass category; they appear only because NA was kept as a value when the grid of combinations was built, and they do not represent real combinations.