Data Dive Two

Group By and Probabilities

Load library

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'stringr' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

Load NASA data

# Read the cleaned, comma-delimited planet table into a tibble.
nasa_data <- read_delim(
  file = "C:/Users/imaya/Downloads/cleaned_5250.csv",
  delim = ","
)

## Rows: 5250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, planet_type, mass_wrt, radius_wrt, detection_method
## dbl (8): distance, stellar_magnitude, discovery_year, mass_multiplier, radiu...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows to check that the columns were parsed as expected.
nasa_data %>%
  head(n = 6)

## # A tibble: 6 × 13
##   name     distance stellar_magnitude planet_type discovery_year mass_multiplier
##   <chr>       <dbl>             <dbl> <chr>                <dbl>           <dbl>
## 1 11 Coma…      304              4.72 Gas Giant             2007           19.4 
## 2 11 Ursa…      409              5.01 Gas Giant             2009           14.7 
## 3 14 Andr…      246              5.23 Gas Giant             2008            4.8 
## 4 14 Herc…       58              6.62 Gas Giant             2002            8.14
## 5 16 Cygn…       69              6.22 Gas Giant             1996            1.78
## 6 17 Scor…      408              5.23 Gas Giant             2020            4.32
## # ℹ 7 more variables: mass_wrt <chr>, radius_multiplier <dbl>,
## #   radius_wrt <chr>, orbital_radius <dbl>, orbital_period <dbl>,
## #   eccentricity <dbl>, detection_method <chr>

Planet Discoveries by Detection Method

# Summary per detection method, sorted from fewest to most discoveries:
#   count       - number of planets found with the method
#   avg_mass    - mean mass_multiplier, ignoring missing values
#   probability - the method's share of all rows in nasa_data
#   rare_tag    - "Rare" when the method has fewer than 100 planets
df_m <- nasa_data %>%
  group_by(detection_method) %>%
  summarise(
    count = n(),
    avg_mass = mean(mass_multiplier, na.rm = TRUE),
    # Every row belongs to exactly one group, so the group counts add up to
    # nrow(nasa_data).
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 100L, "Rare", "Common"),
    .groups = "drop"
  ) %>%
  arrange(count)


# Horizontal bar chart of planet counts per detection method, coloured by the
# Rare/Common tag. Counts span several orders of magnitude, hence the log10
# axis.
# reorder() sorts the bars by count: the row order of df_m does not affect a
# character axis, which ggplot2 would otherwise lay out alphabetically.
# NOTE: on a log10 axis bars start at 1, so a method with a single discovery
# is drawn with zero length.
vis_m <- ggplot(
  df_m,
  aes(x = reorder(detection_method, count), y = count, fill = rare_tag)
) +
  geom_col() +
  coord_flip() +
  scale_y_log10() +
  labs(
    title = "Planet Discoveries by Detection Method",
    x = "Detection Method",
    y = "Number of Planets",
    fill = "Rarity"
  )

# Keep print() as its own statement: a trailing `+` after labs() would make R
# parse this call as one more component added to the plot.
print(df_m)

## # A tibble: 11 × 5
##    detection_method              count avg_mass probability rare_tag
##    <chr>                         <int>    <dbl>       <dbl> <chr>   
##  1 Disk Kinematics                   1     2.5     0.000190 Rare    
##  2 Astrometry                        2    15.4     0.000381 Rare    
##  3 Pulsation Timing Variations       2     7.5     0.000381 Rare    
##  4 Pulsar Timing                     7     2.10    0.00133  Rare    
##  5 Orbital Brightness Modulation     9     1.28    0.00171  Rare    
##  6 Eclipse Timing Variations        17     6.78    0.00324  Rare    
##  7 Transit Timing Variations        24     7.81    0.00457  Rare    
##  8 Direct Imaging                   62    24.9     0.0118   Rare    
##  9 Gravitational Microlensing      154     6.11    0.0293   Common  
## 10 Radial Velocity                1027     6.45    0.196    Common  
## 11 Transit                        3945     6.16    0.751    Common

# Render the detection-method bar chart.
print(vis_m)

The lowest-probability detection methods (those with fewer than 100 discoveries) were tagged as ‘Rare’. The rarest of these, Disk Kinematics (0.019%), Astrometry (0.038%), and Pulsation Timing Variations (0.038%), each had fewer than ten discoveries, so a randomly chosen planet in this dataset is extremely unlikely to have been found by one of them. The Common methods, Gravitational Microlensing, Radial Velocity, and Transit, accounted for most of the planet discoveries. A testable hypothesis is that detection methods vary in effectiveness depending on planet size and distance. For example, small or dim planets are less likely to be detected unless the method is specifically suited to their properties.

Distribution of Stellar Magnitude by Planet Type

# Summary per planet type:
#   count       - number of planets of that type
#   avg_stell   - mean stellar_magnitude, ignoring missing values
#   probability - the type's share of all rows in nasa_data
#   rare_tag    - "Rare" when the type has fewer than 200 planets
df_t <- nasa_data %>%
  group_by(planet_type) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE)
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 200L, "Rare", "Common")
  )
# Box plot of stellar magnitude for each planet type, filled by the rarity tag
# joined in from df_t. Only strictly positive magnitudes are kept because the
# value axis is log-scaled.
nasa_data %>%
  # `> 0` is never TRUE for NA, so this single condition also drops rows with
  # a missing stellar_magnitude.
  filter(stellar_magnitude > 0) %>%
  left_join(df_t, by = "planet_type") %>%
  ggplot(
    mapping = aes(x = planet_type, y = stellar_magnitude, fill = rare_tag)
  ) +
  geom_boxplot(outlier.alpha = 0.3) +
  coord_flip() +
  scale_y_log10() +
  labs(
    fill = "Rarity",
    y = "Stellar Magnitude (log scale)",
    x = "Planet Type",
    title = "Distribution of Stellar Magnitude by Planet Type"
  )

# Show the per-type summary table underneath the plot.
print(df_t)

## # A tibble: 5 × 5
##   planet_type  count avg_stell probability rare_tag
##   <chr>        <int>     <dbl>       <dbl> <chr>   
## 1 Gas Giant     1630      10.4    0.310    Common  
## 2 Neptune-like  1825      13.5    0.348    Common  
## 3 Super Earth   1595      13.9    0.304    Common  
## 4 Terrestrial    195      13.4    0.0371   Rare    
## 5 Unknown          5      14.3    0.000952 Rare

The lowest probability groups were tagged as ‘Rare’. Stellar magnitude measures how bright a star appears, with lower numbers indicating brighter stars (for example, the Sun has a stellar magnitude of -26.7). The average stellar magnitude for every planet type in this dataset falls between about 10 and 14, so the host stars are generally not very bright. Gas Giants are among the most commonly discovered planets and tend to orbit relatively brighter stars, while Terrestrial planets and Unknown types are less commonly found and tend to orbit dimmer stars. A testable hypothesis is that planets orbiting dimmer stars, such as Terrestrial planets, are less likely to be discovered, because a dim host star may make them harder to detect with the current methods.

Planet Type By Mass Category

# Count every observed (planet_type, mass_wrt) pair in a single pass. Rows
# with a missing planet_type or mass_wrt are excluded, so `probability` is
# each pair's share of the planets that have both values recorded. Pairs with
# fewer than 50 planets are tagged "Rare".
df_combo <- nasa_data %>%
  filter(!is.na(planet_type), !is.na(mass_wrt)) %>%
  count(planet_type, mass_wrt) %>%
  mutate(
    probability = n / sum(n),
    rare_tag = if_else(n < 50L, "Rare", "Common")
  )

# Every possible pairing of the planet types and mass_wrt values that occur
# in the data. NA is dropped from both sets so that "value missing" is not
# treated as a category of its own, and stringsAsFactors = FALSE keeps the
# keys as character so they match the character columns of df_combo.
all_combos <- expand.grid(
  planet_type = unique(nasa_data$planet_type[!is.na(nasa_data$planet_type)]),
  mass_wrt = unique(nasa_data$mass_wrt[!is.na(nasa_data$mass_wrt)]),
  stringsAsFactors = FALSE
)

# Pairings that are possible in principle but never occur in the data.
missing_combos <- anti_join(
  all_combos,
  df_combo,
  by = c("planet_type", "mass_wrt")
)

# Heat map of how many planets fall into each (mass_wrt, planet_type) pair.
# Counts range from single digits to the thousands, so the fill is
# log-scaled. `transform =` replaces the `trans =` argument deprecated in
# ggplot2 3.5.0. "log10" yields the same colours as a natural-log transform
# (the two differ only by a constant factor) but gives legend breaks at
# powers of ten.
ggplot(df_combo, aes(x = mass_wrt, y = planet_type, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(transform = "log10") +
  labs(
    title = "Planet Type by Mass Category",
    x = "Mass Category (relative to Earth)",
    y = "Planet Type",
    fill = "Count"
  )

# Show the counts and probabilities behind the heat map.
print(df_combo)

## # A tibble: 8 × 4
##   planet_type  mass_wrt     n probability
##   <chr>        <chr>    <int>       <dbl>
## 1 Gas Giant    Earth        9    0.00172 
## 2 Gas Giant    Jupiter   1603    0.307   
## 3 Neptune-like Earth     1807    0.346   
## 4 Neptune-like Jupiter     18    0.00344 
## 5 Super Earth  Earth     1576    0.302   
## 6 Super Earth  Jupiter     19    0.00363 
## 7 Terrestrial  Earth      194    0.0371  
## 8 Terrestrial  Jupiter      1    0.000191

# Show the pairings that never occur in the data.
missing_combos %>%
  print()

##    planet_type mass_wrt
## 1      Unknown  Jupiter
## 2      Unknown    Earth
## 3    Gas Giant     <NA>
## 4  Super Earth     <NA>
## 5 Neptune-like     <NA>
## 6  Terrestrial     <NA>
## 7      Unknown     <NA>

Comparing planet type by mass category shows that Gas Giants are almost exclusively in the Jupiter-mass category (1603) rather than the Earth-mass category (9), which makes sense because Gas Giants are much larger and more massive than Earth-like planets. Most other planet types, like Neptune-like and Super Earth, are primarily in the Earth-mass category but also have some Jupiter-mass examples. The combination of ‘Unknown’ planet types with either mass category does not exist in the data, which is why these rows appear in the missing_combos table. These missing combinations likely reflect the fact that ‘Unknown’ planets have insufficient information about their mass.

DataDiveTwo

2026-02-03