Data606 Project Proposal

Author

Anthony Josue Roman

Data Preparation

# Load Gaia data
# Download the raw text of the two Gaia extracts from GitHub.
# getURL() is presumably RCurl::getURL(), attached elsewhere -- TODO confirm.
rawgaia <- getURL("https://raw.githubusercontent.com/spacerome/Data607_Project_2/refs/heads/main/gaiadata.csv")

rawgaia2 <- getURL("https://raw.githubusercontent.com/spacerome/Data607_Project_2/refs/heads/main/gaiadata2.csv")

# Prepare the data
# Parse the downloaded text as tab-separated tables. read.csv() already
# returns a data.frame, so it is not wrapped in data.frame(): that wrapper
# applies its own check.names default and would override the
# check.names = FALSE requested here.
# NOTE(review): the ID columns (SOURCE_ID / source_id) are parsed as doubles
# with magnitudes around 1e18, which is above 2^53, so distinct IDs may not be
# represented exactly. Consider reading them as character via colClasses if
# the IDs are ever used for matching or joining.

gaiadf <- read.csv(
  text = rawgaia,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

gaiadf2 <- read.csv(
  text = rawgaia2,
  sep = "\t",
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Reshape gaiadf from wide to long. Every column except SOURCE_ID is split by
# its name into measurement_type, band and replicate; rows whose value is
# missing are dropped as part of the pivot.
gaiadf_tidy <- pivot_longer(
  gaiadf,
  cols = -SOURCE_ID,
  names_to = c("measurement_type", "band", "replicate"),
  names_pattern = "(\\w+)_(\\w+)_(\\d)",
  values_to = "value",
  values_drop_na = TRUE
)

# Preview the first rows of the long table
gaiadf_tidy %>%
  head()
# A tibble: 6 × 5
  SOURCE_ID measurement_type band  replicate value
      <dbl> <chr>            <chr> <chr>     <dbl>
1   4.04e18 dec              bp    1         -34.4
2   4.04e18 dec              bp    2         -34.4
3   4.04e18 dec              g     1         -34.4
4   4.04e18 dec              g     2         -34.4
5   4.04e18 dec              rp    1         -34.4
6   4.04e18 dec              rp    2         -34.4
# Reshape gaiadf2 from wide to long: one row per (source_id, attribute) pair,
# with rows whose value is missing dropped as part of the pivot.
gaiadf2_tidy <- pivot_longer(
  gaiadf2,
  cols = -source_id,
  names_to = "attribute",
  values_to = "value",
  values_drop_na = TRUE
)

# Preview the first rows of the long table
gaiadf2_tidy %>%
  head()
# A tibble: 6 × 3
  source_id attribute        value
      <dbl> <chr>            <dbl>
1   4.19e17 ra               10.1 
2   4.19e17 dec              56.5 
3   4.19e17 parallax         14.1 
4   4.19e17 phot_g_mean_mag   1.94
5   4.19e17 phot_bp_mean_mag  2.99
6   4.19e17 phot_rp_mean_mag  1.84
# Summary statistics of value for each measurement type and band.
# value is converted to numeric once up front instead of inside every
# aggregate.
gaiadf_tidy %>%
  mutate(value = as.numeric(value)) %>%
  group_by(measurement_type, band) %>%
  summarize(
    mean_value = mean(value, na.rm = TRUE),
    sd_value = sd(value, na.rm = TRUE),
    min_value = min(value, na.rm = TRUE),
    max_value = max(value, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(measurement_type, band)
`summarise()` has grouped output by 'measurement_type'. You can override using
the `.groups` argument.
# A tibble: 12 × 7
# Groups:   measurement_type [4]
   measurement_type band  mean_value sd_value  min_value max_value count
   <chr>            <chr>      <dbl>    <dbl>      <dbl>     <dbl> <int>
 1 dec              bp        -66.2      2.69 -70.2          -34.4 10000
 2 dec              g         -66.2      2.69 -70.2          -34.4 10000
 3 dec              rp        -66.2      2.69 -70.2          -34.4 10000
 4 magnitude        bp         19.2      2.09   7.61          22.6  9928
 5 magnitude        g          18.5      1.99   7.28          21.1  9998
 6 magnitude        rp         17.7      1.90   5.93          20.8  9930
 7 parallax         bp          1.01     1.39   0.000521      61.2 10000
 8 parallax         g           1.01     1.39   0.000521      61.2 10000
 9 parallax         rp          1.01     1.39   0.000521      61.2 10000
10 ra               bp         58.1      9.89  50.7          274.  10000
11 ra               g          58.1      9.89  50.7          274.  10000
12 ra               rp         58.1      9.89  50.7          274.  10000
# Summary statistics of value for each attribute.
# value is converted to numeric once up front instead of inside every
# aggregate.
gaiadf2_tidy %>%
  mutate(value = as.numeric(value)) %>%
  group_by(attribute) %>%
  summarize(
    mean_value = mean(value, na.rm = TRUE),
    sd_value = sd(value, na.rm = TRUE),
    min_value = min(value, na.rm = TRUE),
    max_value = max(value, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(attribute)
# A tibble: 8 × 6
  attribute        mean_value sd_value min_value max_value count
  <chr>                 <dbl>    <dbl>     <dbl>     <dbl> <int>
1 dec                   -3.43   40.4     -83.7       87.0   1000
2 parallax              18.3    26.1       0.119    311.    1000
3 phot_bp_mean_mag       4.40    0.707     2.88       8.05  1000
4 phot_g_mean_mag        3.80    0.534     1.94       4.44  1000
5 phot_rp_mean_mag       3.28    0.610     1.84       4.59  1000
6 pmdec                -27.1   195.    -3422.      1165.    1000
7 pmra                   5.69  243.    -2240.      3967.    1000
8 ra                   183.    101.        0.398    360.    1000
# Data for the proper-motion plot: keep only the pmra and pmdec rows and
# spread them back out so each source_id has one row with both as columns.
gaiadf2_pm <- gaiadf2_tidy %>%
  filter(attribute %in% c("pmra", "pmdec")) %>%
  pivot_wider(
    id_cols = source_id,
    names_from = attribute,
    values_from = value
  )

# Data for the parallax vs magnitude plot: same reshaping, keeping the
# parallax and phot_g_mean_mag rows.
parallax_magnitude <- gaiadf2_tidy %>%
  filter(attribute %in% c("parallax", "phot_g_mean_mag")) %>%
  pivot_wider(
    id_cols = source_id,
    names_from = attribute,
    values_from = value
  )

Research Question

How do stellar kinematics and luminosity distributions vary based on the position within the Milky Way galaxy, as observed in the Gaia catalog?

Cases

The cases in the Gaia dataset are individual stars observed in the Milky Way galaxy. The dataset contains millions of cases, but we will focus on a manageable subset based on specific criteria, such as brightness and distance.

Data Collection

The data was collected through the Gaia space observatory, a mission by the European Space Agency. Gaia continuously surveys the sky to map the positions, velocities, and physical characteristics of billions of stars with unprecedented precision.

Type of Study

This is an observational study. The data was collected without manipulating any variables, purely observing and recording the characteristics of stars in their natural state.

Data Source

The data is sourced from the European Space Agency’s Gaia catalog. Citation: European Space Agency, Gaia Archive (https://gea.esac.esa.int/archive/). The dataset includes various measurements and attributes of stars observed by the Gaia space observatory, and it is available on my GitHub page.

Variables

  • Stellar Parallax: The apparent shift in a star’s position due to the Earth’s orbit around the Sun.

  • Proper Motion: The angular change in a star’s position over time.

  • Radial Velocity: The speed at which a star moves towards or away from the observer.

  • Apparent Magnitude: The brightness of a star as seen from Earth.

  • Luminosity: The total amount of energy emitted by a star per unit time.

  • Color Index: A measure of a star’s color based on the difference in brightness between two spectral bands.

  • Spectral Classification: The classification of stars based on their spectral characteristics.

  • Star Types: Categories of stars based on their evolutionary stage, such as main sequence, giant, supergiant, etc.

Describe Your Variables

  • Quantitative Variables: Stellar parallax, proper motion, radial velocity, apparent magnitude, luminosity, and color index.
  • Qualitative Variables: Spectral classification and star types (e.g., main sequence, giant, etc.).
  • Dependent Variable: If running a regression analysis, the dependent variable could be luminosity or apparent magnitude, depending on the research focus.

Relevant Summary Statistics

Below are summary statistics for key variables and visualizations that help address the research question.

# Per-column summary of the wide gaiadf table
gaiadf %>%
  summary()
   SOURCE_ID            dec_bp_1         dec_bp_2         dec_g_1      
 Min.   :4.045e+18   Min.   :-70.19   Min.   :-70.19   Min.   :-70.19  
 1st Qu.:4.668e+18   1st Qu.:-67.31   1st Qu.:-67.31   1st Qu.:-67.31  
 Median :4.671e+18   Median :-66.85   Median :-66.85   Median :-66.85  
 Mean   :4.670e+18   Mean   :-66.22   Mean   :-66.22   Mean   :-66.22  
 3rd Qu.:4.674e+18   3rd Qu.:-63.71   3rd Qu.:-63.71   3rd Qu.:-63.71  
 Max.   :4.676e+18   Max.   :-34.35   Max.   :-34.35   Max.   :-34.35  
                                                                       
    dec_g_2          dec_rp_1         dec_rp_2      magnitude_bp_1  
 Min.   :-70.19   Min.   :-70.19   Min.   :-70.19   Min.   : 7.612  
 1st Qu.:-67.31   1st Qu.:-67.31   1st Qu.:-67.31   1st Qu.:18.015  
 Median :-66.85   Median :-66.85   Median :-66.85   Median :19.715  
 Mean   :-66.22   Mean   :-66.22   Mean   :-66.22   Mean   :19.137  
 3rd Qu.:-63.71   3rd Qu.:-63.71   3rd Qu.:-63.71   3rd Qu.:20.775  
 Max.   :-34.35   Max.   :-34.35   Max.   :-34.35   Max.   :22.459  
                                                    NA's   :36      
 magnitude_bp_2   magnitude_g_1    magnitude_g_2    magnitude_rp_1  
 Min.   : 7.712   Min.   : 7.278   Min.   : 7.378   Min.   : 5.929  
 1st Qu.:18.115   1st Qu.:17.415   1st Qu.:17.515   1st Qu.:16.636  
 Median :19.815   Median :18.974   Median :19.074   Median :18.106  
 Mean   :19.237   Mean   :18.466   Mean   :18.566   Mean   :17.640  
 3rd Qu.:20.875   3rd Qu.:19.993   3rd Qu.:20.093   3rd Qu.:19.058  
 Max.   :22.559   Max.   :20.954   Max.   :21.054   Max.   :20.672  
 NA's   :36       NA's   :1        NA's   :1        NA's   :35      
 magnitude_rp_2   parallax_bp_1      parallax_bp_2       parallax_g_1     
 Min.   : 6.029   Min.   : 0.00052   Min.   : 0.00052   Min.   : 0.00052  
 1st Qu.:16.736   1st Qu.: 0.34031   1st Qu.: 0.34031   1st Qu.: 0.34031  
 Median :18.206   Median : 0.68036   Median : 0.68036   Median : 0.68036  
 Mean   :17.740   Mean   : 1.00771   Mean   : 1.00771   Mean   : 1.00771  
 3rd Qu.:19.158   3rd Qu.: 1.29593   3rd Qu.: 1.29593   3rd Qu.: 1.29593  
 Max.   :20.772   Max.   :61.22712   Max.   :61.22712   Max.   :61.22712  
 NA's   :35                                                               
  parallax_g_2      parallax_rp_1      parallax_rp_2         ra_bp_1      
 Min.   : 0.00052   Min.   : 0.00052   Min.   : 0.00052   Min.   : 50.72  
 1st Qu.: 0.34031   1st Qu.: 0.34031   1st Qu.: 0.34031   1st Qu.: 53.32  
 Median : 0.68036   Median : 0.68036   Median : 0.68036   Median : 57.66  
 Mean   : 1.00771   Mean   : 1.00771   Mean   : 1.00771   Mean   : 58.08  
 3rd Qu.: 1.29593   3rd Qu.: 1.29593   3rd Qu.: 1.29593   3rd Qu.: 62.78  
 Max.   :61.22712   Max.   :61.22712   Max.   :61.22712   Max.   :274.43  
                                                                          
    ra_bp_2           ra_g_1           ra_g_2          ra_rp_1      
 Min.   : 50.72   Min.   : 50.72   Min.   : 50.72   Min.   : 50.72  
 1st Qu.: 53.32   1st Qu.: 53.32   1st Qu.: 53.32   1st Qu.: 53.32  
 Median : 57.66   Median : 57.66   Median : 57.66   Median : 57.66  
 Mean   : 58.08   Mean   : 58.08   Mean   : 58.08   Mean   : 58.08  
 3rd Qu.: 62.78   3rd Qu.: 62.78   3rd Qu.: 62.78   3rd Qu.: 62.78  
 Max.   :274.43   Max.   :274.43   Max.   :274.43   Max.   :274.43  
                                                                    
    ra_rp_2      
 Min.   : 50.72  
 1st Qu.: 53.32  
 Median : 57.66  
 Mean   : 58.08  
 3rd Qu.: 62.78  
 Max.   :274.43  
                 
# Per-column summary of the wide gaiadf2 table
gaiadf2 %>%
  summary()
   source_id               ra                dec             parallax      
 Min.   :1.117e+16   Min.   :  0.3976   Min.   :-83.668   Min.   :  0.119  
 1st Qu.:1.891e+18   1st Qu.: 95.8385   1st Qu.:-37.057   1st Qu.:  5.722  
 Median :3.511e+18   Median :183.8217   Median : -3.801   Median : 10.257  
 Mean   :3.582e+18   Mean   :182.8800   Mean   : -3.431   Mean   : 18.254  
 3rd Qu.:5.381e+18   3rd Qu.:271.3863   3rd Qu.: 28.793   3rd Qu.: 21.315  
 Max.   :6.914e+18   Max.   :359.8285   Max.   : 87.020   Max.   :310.577  
 phot_g_mean_mag phot_bp_mean_mag phot_rp_mean_mag      pmra          
 Min.   :1.943   Min.   :2.884    Min.   :1.844    Min.   :-2240.085  
 1st Qu.:3.491   1st Qu.:3.926    1st Qu.:2.845    1st Qu.:  -31.536  
 Median :3.941   Median :4.335    Median :3.281    Median :   -3.744  
 Mean   :3.796   Mean   :4.397    Mean   :3.284    Mean   :    5.689  
 3rd Qu.:4.224   3rd Qu.:4.803    3rd Qu.:3.726    3rd Qu.:   28.247  
 Max.   :4.437   Max.   :8.051    Max.   :4.590    Max.   : 3966.661  
     pmdec          
 Min.   :-3421.809  
 1st Qu.:  -41.816  
 Median :  -10.685  
 Mean   :  -27.139  
 3rd Qu.:    8.686  
 Max.   : 1164.959  
# Box plots of value by band, one panel per measurement type. Each panel gets
# its own y axis (scales = "free_y"), and the title is centred.
gaiadf_tidy %>%
  ggplot(aes(x = band, y = as.numeric(value), fill = band)) +
  geom_boxplot() +
  facet_wrap(vars(measurement_type), scales = "free_y") +
  ggtitle("Distribution of Measurements by Band and Type") +
  xlab("Band") +
  ylab("Value") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Histograms of every attribute in gaiadf2_tidy, one panel per attribute with
# free x and y scales, and a centred title.
# The attributes cover very different ranges (the magnitudes span a few units
# while pmra and pmdec span thousands), so one fixed binwidth cannot suit every
# panel. A fixed number of bins is used instead so each panel is binned
# relative to its own range.
ggplot(gaiadf2_tidy, aes(x = as.numeric(value), fill = attribute)) +
  geom_histogram(bins = 30, color = "black", alpha = 0.6) +
  facet_wrap(~ attribute, scales = "free") +
  labs(
    title = "Histogram of Attributes in gaiadf2_tidy",
    x = "Value",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Scatter plot of pmra against pmdec from gaiadf2_pm, with a centred title
gaiadf2_pm %>%
  ggplot(aes(x = as.numeric(pmra), y = as.numeric(pmdec))) +
  geom_point(color = "red", alpha = 0.6) +
  ggtitle("Proper Motion: PMRA vs PMDEC") +
  xlab("Proper Motion in RA") +
  ylab("Proper Motion in Dec") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Scatter plot of parallax against phot_g_mean_mag from parallax_magnitude,
# with a centred title
parallax_magnitude %>%
  ggplot(aes(x = as.numeric(parallax), y = as.numeric(phot_g_mean_mag))) +
  geom_point(alpha = 0.6, color = "purple") +
  ggtitle("Parallax vs G-band Magnitude") +
  xlab("Parallax") +
  ylab("G-band Magnitude") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))