Data Dive Two

Group By and Probabilities

Load library

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'stringr' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

Load NASA data

# Read the cleaned exoplanet CSV into a tibble (comma-delimited).
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running on another computer.
nasa_data <- read_delim(
  file = "C:/Users/imaya/Downloads/cleaned_5250.csv",
  delim = ","
)

## Rows: 5250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, planet_type, mass_wrt, radius_wrt, detection_method
## dbl (8): distance, stellar_magnitude, discovery_year, mass_multiplier, radiu...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows to check that the columns parsed as expected.
nasa_data %>%
  head(n = 6)

## # A tibble: 6 × 13
##   name     distance stellar_magnitude planet_type discovery_year mass_multiplier
##   <chr>       <dbl>             <dbl> <chr>                <dbl>           <dbl>
## 1 11 Coma…      304              4.72 Gas Giant             2007           19.4 
## 2 11 Ursa…      409              5.01 Gas Giant             2009           14.7 
## 3 14 Andr…      246              5.23 Gas Giant             2008            4.8 
## 4 14 Herc…       58              6.62 Gas Giant             2002            8.14
## 5 16 Cygn…       69              6.22 Gas Giant             1996            1.78
## 6 17 Scor…      408              5.23 Gas Giant             2020            4.32
## # ℹ 7 more variables: mass_wrt <chr>, radius_multiplier <dbl>,
## #   radius_wrt <chr>, orbital_radius <dbl>, orbital_period <dbl>,
## #   eccentricity <dbl>, detection_method <chr>

Stellar Magnitude vs Detection Method

# Tally the number of planets found by each detection method.
# count() groups, counts, and ungroups in one step; `name` sets the column
# holding the tally.
df_t <- nasa_data %>%
  count(detection_method, name = "count")

print(df_t)

## # A tibble: 11 × 2
##    detection_method              count
##    <chr>                         <int>
##  1 Astrometry                        2
##  2 Direct Imaging                   62
##  3 Disk Kinematics                   1
##  4 Eclipse Timing Variations        17
##  5 Gravitational Microlensing      154
##  6 Orbital Brightness Modulation     9
##  7 Pulsar Timing                     7
##  8 Pulsation Timing Variations       2
##  9 Radial Velocity                1027
## 10 Transit                        3945
## 11 Transit Timing Variations        24

# Per-method summary:
#   count       - number of planets found with the method
#   avg_stell   - mean stellar magnitude, ignoring missing values
#   probability - the method's share of all rows in nasa_data
#   rare_tag    - "Rare" when the method has fewer than 50 planets
df_m <- nasa_data %>%
  group_by(detection_method) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 50, "Rare", "Common")
  )

# Horizontal bar chart of the mean magnitude per method, coloured by rarity.
df_m %>%
  ggplot(aes(x = detection_method, y = avg_stell, fill = rare_tag)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Average Stellar Magnitude by Detection Method",
    x = "Detection Method",
    y = "Average Stellar Magnitude",
    fill = "Rarity"
  ) +
  theme_minimal()

df_m

## # A tibble: 11 × 5
##    detection_method              count avg_stell probability rare_tag
##    <chr>                         <int>     <dbl>       <dbl> <chr>   
##  1 Astrometry                        2     10.0     0.000381 Rare    
##  2 Direct Imaging                   62      9.98    0.0118   Common  
##  3 Disk Kinematics                   1      8.44    0.000190 Rare    
##  4 Eclipse Timing Variations        17     14.9     0.00324  Rare    
##  5 Gravitational Microlensing      154     24.1     0.0293   Common  
##  6 Orbital Brightness Modulation     9     14.5     0.00171  Rare    
##  7 Pulsar Timing                     7     20.9     0.00133  Rare    
##  8 Pulsation Timing Variations       2     14.0     0.000381 Rare    
##  9 Radial Velocity                1027      8.09    0.196    Common  
## 10 Transit                        3945     13.9     0.751    Common  
## 11 Transit Timing Variations        24     13.4     0.00457  Rare

The Transit method accounts for the majority of planet discoveries (about 75%) and has a relatively high average stellar magnitude (13.9), meaning its host stars tend to be dim. If a planet is randomly selected from this data set, it is most likely to have been discovered using the Transit method and to orbit a relatively dim star.

A testable hypothesis is that detection methods differ in effectiveness depending on observable star and planet characteristics. Specifically, certain methods may be better suited to detecting planets around dimmer or more distant stars.

Stellar Magnitude vs Discovery Year

# Assign each planet to the 5-year window containing its discovery year
# (e.g. 2007 -> 2005).
nasa_data <- nasa_data %>%
  mutate(bin_5yr = 5 * floor(discovery_year / 5))

# Per-window summary. `count` only counts planets with a recorded stellar
# magnitude, i.e. the same rows that feed the mean.
df_bin5 <- nasa_data %>%
  group_by(bin_5yr) %>%
  summarise(
    count = sum(!is.na(stellar_magnitude)),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(rare_tag = if_else(count < 500, "Rare", "Common"))

# Keep only windows with a usable, positive mean; is.na() is also TRUE for
# NaN, so one check covers both.
df_bin5_clean <- df_bin5 %>%
  filter(!is.na(avg_stell), avg_stell > 0)

# Bar chart of the mean magnitude per window, with the count above each bar.
df_bin5_clean %>%
  ggplot(aes(x = factor(bin_5yr), y = avg_stell, fill = rare_tag)) +
  geom_col() +
  geom_text(aes(label = count), vjust = -0.5) +
  labs(
    title = "Average Stellar Magnitude by 5-Year Intervals",
    x = "5-Year Intervals",
    y = "Average Stellar Magnitude",
    fill = "Rarity"
  ) +
  theme_minimal()

df_bin5_clean

## # A tibble: 6 × 4
##   bin_5yr count avg_stell rare_tag
##     <dbl> <int>     <dbl> <chr>   
## 1    1995    27      6.26 Rare    
## 2    2000   104      7.84 Rare    
## 3    2005   267      8.83 Rare    
## 4    2010  1350     13.2  Common  
## 5    2015  2300     13.4  Common  
## 6    2020  1041     12.2  Common

The earliest five-year intervals contain far fewer planets, meaning the probability of randomly selecting a planet from those periods is much lower than selecting one from more recent bins. Average stellar magnitude trends upward over time (from 6.26 in the 1995 bin to above 12 in every bin from 2010 onward), which suggests that more recent discoveries involve dimmer stars. A testable hypothesis is that this pattern is due to improved detection technology or to the increasing average distance of discovered stars, which may affect stellar magnitude values.

Distribution of Stellar Magnitude by Planet Type

# Per-planet-type summary: count, mean stellar magnitude (NAs ignored), share
# of all rows, and a Rare/Common tag (fewer than 200 planets = "Rare").
df_t <- nasa_data %>%
  group_by(planet_type) %>%
  summarise(
    count = n(),
    avg_stell = mean(stellar_magnitude, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    probability = count / nrow(nasa_data),
    rare_tag = if_else(count < 200, "Rare", "Common")
  )

# Boxplot of stellar magnitude per planet type on a log10 y axis.
# Rows with a missing or non-positive magnitude are removed first; the join
# only needs to attach each type's rarity tag for the fill colour.
nasa_data %>%
  filter(!is.na(stellar_magnitude), stellar_magnitude > 0) %>%
  left_join(select(df_t, planet_type, rare_tag), by = "planet_type") %>%
  ggplot(aes(x = planet_type, y = stellar_magnitude, fill = rare_tag)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_log10() +
  coord_flip() +
  labs(
    title = "Distribution of Stellar Magnitude by Planet Type",
    x = "Planet Type",
    y = "Stellar Magnitude (log scale)",
    fill = "Rarity"
  )

df_t

## # A tibble: 5 × 5
##   planet_type  count avg_stell probability rare_tag
##   <chr>        <int>     <dbl>       <dbl> <chr>   
## 1 Gas Giant     1630      10.4    0.310    Common  
## 2 Neptune-like  1825      13.5    0.348    Common  
## 3 Super Earth   1595      13.9    0.304    Common  
## 4 Terrestrial    195      13.4    0.0371   Rare    
## 5 Unknown          5      14.3    0.000952 Rare

The lowest-probability groups were tagged as ‘Rare’. Stellar magnitude measures how bright a star appears, with lower numbers indicating brighter stars (for example, the Sun has a stellar magnitude of -26.7). The average stellar magnitude for every planet type in this dataset falls between roughly 10 and 14, so the host stars are generally not very bright. Gas Giants are among the most commonly discovered planets (second only to Neptune-like planets) and tend to orbit relatively brighter stars, while Terrestrial planets and Unknown types are less commonly found and tend to orbit dimmer stars. A testable hypothesis is that planets orbiting dimmer stars, such as Terrestrial planets, are less likely to be discovered because dimmer host stars make them harder to detect with current methods.

Planet Type By Mass Category

# Count every observed (planet_type, mass_wrt) pair once. Rows missing either
# value are excluded, so `probability` is the pair's share among planets that
# have both values recorded.
df_combo <- nasa_data %>%
  filter(!is.na(planet_type), !is.na(mass_wrt)) %>%
  count(planet_type, mass_wrt) %>%
  mutate(probability = n / sum(n))

# Full grid of planet type x mass reference.
# stringsAsFactors = FALSE keeps both columns as character so the join keys
# have the same type as df_combo's columns.
# unique() keeps NA, so a missing value in either column also becomes a grid
# level and will show up in missing_combos.
all_combos <- expand.grid(
  planet_type = unique(nasa_data$planet_type),
  mass_wrt = unique(nasa_data$mass_wrt),
  stringsAsFactors = FALSE
)

# Grid cells that have no matching observed pair.
missing_combos <- anti_join(
  all_combos,
  df_combo,
  by = c("planet_type", "mass_wrt")
)

# Heatmap of pair counts; the fill uses a log transform so small counts stay
# distinguishable next to the four-digit ones.
ggplot(df_combo, aes(x = mass_wrt, y = planet_type, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(transform = "log") +
  labs(
    title = "Planet Type by Mass Category",
    x = "Mass Category (relative to Earth)",
    y = "Planet Type",
    fill = "Count"
  )

print(df_combo)

## # A tibble: 8 × 4
##   planet_type  mass_wrt     n probability
##   <chr>        <chr>    <int>       <dbl>
## 1 Gas Giant    Earth        9    0.00172 
## 2 Gas Giant    Jupiter   1603    0.307   
## 3 Neptune-like Earth     1807    0.346   
## 4 Neptune-like Jupiter     18    0.00344 
## 5 Super Earth  Earth     1576    0.302   
## 6 Super Earth  Jupiter     19    0.00363 
## 7 Terrestrial  Earth      194    0.0371  
## 8 Terrestrial  Jupiter      1    0.000191

# Show the planet type / mass reference pairs from the full grid that have no
# matching rows in the observed counts.
print(x = missing_combos)

##    planet_type mass_wrt
## 1      Unknown  Jupiter
## 2      Unknown    Earth
## 3    Gas Giant     <NA>
## 4  Super Earth     <NA>
## 5 Neptune-like     <NA>
## 6  Terrestrial     <NA>
## 7      Unknown     <NA>

Comparing planet type by mass category shows that Gas Giants are almost exclusively in the Jupiter-mass category (1603) rather than the Earth-mass category (9), which makes sense because Gas Giants are much larger and more massive than Earth-like planets. The other planet types, such as Neptune-like and Super Earth, are primarily in the Earth-mass category but also have a few Jupiter-mass examples. The combination of ‘Unknown’ planet types with either mass category does not exist in the data, which is why these rows appear in the missing_combos table. These missing combinations likely reflect the fact that ‘Unknown’ planets have insufficient information about their mass. The remaining rows with <NA> appear because mass_wrt contains missing values, which the grid of all combinations treats as an additional category.

DataDiveTwo

2026-02-03