DSLabs Datasets: Stellar Catalog

Author

Ash Ibasan

Introduction

Setup and load necessary libraries

library(tidyverse)      # attaches the core tidyverse only (ggplot2, dplyr, tidyr, readr, ...); ggthemes and ggrepel are separate packages and are NOT loaded by this call

Warning: package 'tidyverse' was built under R version 4.4.1

Warning: package 'ggplot2' was built under R version 4.4.1

Warning: package 'tibble' was built under R version 4.4.1

Warning: package 'tidyr' was built under R version 4.4.1

Warning: package 'readr' was built under R version 4.4.1

Warning: package 'purrr' was built under R version 4.4.1

Warning: package 'dplyr' was built under R version 4.4.1

Warning: package 'stringr' was built under R version 4.4.1

Warning: package 'forcats' was built under R version 4.4.1

Warning: package 'lubridate' was built under R version 4.4.1

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(scales)         # more scaling functions for ggplot2

Warning: package 'scales' was built under R version 4.4.1


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

library(highcharter)    # for interactivity

Warning: package 'highcharter' was built under R version 4.4.1

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use

library(RColorBrewer)   # color palettes
library(dslabs)

Warning: package 'dslabs' was built under R version 4.4.1


Attaching package: 'dslabs'

The following object is masked from 'package:highcharter':

    stars

View dataset from DSLabs (Data Science Labs) package

# Enumerate every dataset bundled with dslabs, then list the scripts the
# package ships in its "script" directory.
data(package = "dslabs")
system.file("script", package = "dslabs") %>%
  list.files()

 [1] "make-admissions.R"                   
 [2] "make-brca.R"                         
 [3] "make-brexit_polls.R"                 
 [4] "make-calificaciones.R"               
 [5] "make-death_prob.R"                   
 [6] "make-divorce_margarine.R"            
 [7] "make-gapminder-rdas.R"               
 [8] "make-greenhouse_gases.R"             
 [9] "make-historic_co2.R"                 
[10] "make-mice_weights.R"                 
[11] "make-mnist_127.R"                    
[12] "make-mnist_27.R"                     
[13] "make-movielens.R"                    
[14] "make-murders-rda.R"                  
[15] "make-na_example-rda.R"               
[16] "make-nyc_regents_scores.R"           
[17] "make-olive.R"                        
[18] "make-outlier_example.R"              
[19] "make-polls_2008.R"                   
[20] "make-polls_us_election_2016.R"       
[21] "make-pr_death_counts.R"              
[22] "make-reported_heights-rda.R"         
[23] "make-research_funding_rates.R"       
[24] "make-stars.R"                        
[25] "make-temp_carbon.R"                  
[26] "make-tissue-gene-expression.R"       
[27] "make-trump_tweets.R"                 
[28] "make-weekly_us_contagious_diseases.R"
[29] "save-gapminder-example-csv.R"

Load stars dataset

# Load the dslabs copy of `stars` explicitly: highcharter also exports an
# object named `stars`, so naming the package removes any dependence on the
# order in which the libraries were attached.
data("stars", package = "dslabs")
# Keep a CSV snapshot of the raw data; missing values become empty cells.
write_csv(stars, "stars.csv", na = "")

View font list

# Names of the font families known to the PostScript graphics device.
postscriptFonts() %>%
  names()

 [1] "serif"                "sans"                 "mono"                
 [4] "AvantGarde"           "Bookman"              "Courier"             
 [7] "Helvetica"            "Helvetica-Narrow"     "NewCenturySchoolbook"
[10] "Palatino"             "Times"                "URWGothic"           
[13] "URWBookman"           "NimbusMon"            "NimbusSan"           
[16] "URWHelvetica"         "NimbusSanCond"        "CenturySch"          
[19] "URWPalladio"          "NimbusRom"            "URWTimes"            
[22] "URW2Helvetica"        "URW2HelveticaItalic"  "URW2Times"           
[25] "NimbusMonoPS"         "ArialMT"              "ComputerModern"      
[28] "ComputerModernItalic" "Japan1"               "Japan1HeiMin"        
[31] "Japan1GothicBBB"      "Japan1Ryumin"         "Korea1"              
[34] "Korea1deb"            "CNS1"                 "GB1"

Exploratory data analysis (EDA)

Glimpse and take a look at dataset

# Compact overview: one line per column with its type and first values.
stars %>% glimpse()

Rows: 96
Columns: 4
$ star      <fct> Sun, SiriusA, Canopus, Arcturus, AlphaCentauriA, Vega, Capel…
$ magnitude <dbl> 4.8, 1.4, -3.1, -0.4, 4.3, 0.5, -0.6, -7.2, 2.6, -5.7, -2.4,…
$ temp      <int> 5840, 9620, 7400, 4590, 5840, 9900, 5150, 12140, 6580, 3200,…
$ type      <chr> "G", "A", "F", "K", "G", "A", "G", "B", "F", "M", "B", "B", …

# Per-column descriptive statistics (counts for the factor, quantiles for
# the numeric columns).
stars %>% summary()

          star      magnitude           temp           type          
 Altair     : 2   Min.   :-8.000   Min.   : 2500   Length:96         
 *40EridaniA: 1   1st Qu.:-1.800   1st Qu.: 3168   Class :character  
 *40EridaniB: 1   Median : 2.400   Median : 5050   Mode  :character  
 *40EridaniC: 1   Mean   : 4.257   Mean   : 8752                     
 *61CygniA  : 1   3rd Qu.:11.325   3rd Qu.: 9900                     
 *61CygniB  : 1   Max.   :17.000   Max.   :33600                     
 (Other)    :89

# Show the first six rows to see what individual records look like.
head(stars, n = 6L)

            star magnitude temp type
1            Sun       4.8 5840    G
2        SiriusA       1.4 9620    A
3        Canopus      -3.1 7400    F
4       Arcturus      -0.4 4590    K
5 AlphaCentauriA       4.3 5840    G
6           Vega       0.5 9900    A

Checking dataset

Check for and remove duplicates

# Collect rows that exactly repeat an earlier row so they can be inspected
# before removal; original row numbers are kept.
dupes <- stars[which(duplicated(stars)), ]
dupes # display the repeated rows

     star magnitude temp type
89 Altair       2.2 8060    A

# Keep one copy of each row; this drops the repeated Altair record.
stars <- distinct(stars)

Check for missing values

# Count NA entries per column; every count is zero for this dataset.
stars %>%
  is.na() %>%
  colSums()

     star magnitude      temp      type 
        0         0         0         0

Rename column names

# Replace the terse column names with descriptive ones; `magnitude` is
# already clear and is left untouched.
stars <- rename(
  stars,
  spectral_class = type,   # was `type`
  temperature = temp       # was `temp`
)

Explore key variables and relationships

Summary of key variables

# Five-number summary plus mean of surface temperature.
summary(stars[["temperature"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2500    3135    4950    8760    9900   33600

# Five-number summary plus mean of magnitude.
summary(stars[["magnitude"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -8.000  -1.900   2.600   4.279  11.450  17.000

# spectral_class is a character column, so summary() reports only its
# length, class and mode rather than a per-class breakdown.
# NOTE(review): a frequency table (e.g. table() or dplyr::count()) would be
# more informative here.
summary(stars$spectral_class)

   Length     Class      Mode 
       95 character character

Exploratory plot to observe general relationships among variables

# First-look scatter: temperature against magnitude, one colour per
# spectral class.
stars %>%
  ggplot(aes(x = temperature, y = magnitude, color = spectral_class)) +
  geom_point() +
  scale_color_discrete(name = "Spectral Class") +
  labs(
    title = "Distribution of Star Magnitude and Temperature Across Spectral Classes",
    x = "Temperature (K)",
    y = "Magnitude"
  )

Advanced customization for exploratory plots

Density plot of temperature by spectral class

The density plot helps show how temperature varies across spectral classes. For instance, a high peak in a specific temperature range means most stars in that spectral class have temperatures around that value, which can highlight distinctions between classes, such as cooler stars being more common in certain classes.

# Overlaid temperature density curves, one translucent fill per spectral
# class and no outline around each curve.
ggplot(data = stars) +
  geom_density(
    mapping = aes(x = temperature, fill = spectral_class),
    color = NA,
    alpha = 0.6
  ) +
  scale_fill_discrete(name = "Spectral Class") +
  labs(
    title = "Thermal Density Profiles of Stellar Classes",
    x = "Temperature (K)",
    y = "Density (Proportion of Stars)"
  ) +
  theme_minimal() +
  theme(panel.border = element_blank())

Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.

Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf
Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

Boxplot of magnitude spread across star classes

The boxplot shows the spread of magnitude within each spectral class: the box length indicates variability within that class, while outliers mark particularly bright or dim stars. A smaller range indicates more consistent brightness, whereas a larger range suggests more variety within that class. Visualizing the magnitude distribution per class also shows whether some classes generally contain brighter or dimmer stars.

# Box-and-whisker summary of magnitude for each spectral class.
ggplot(stars, aes(x = spectral_class, y = magnitude, fill = spectral_class)) +
  geom_boxplot() +
  labs(
    title = "Range of Stellar Magnitudes by Spectral Class",
    x = "Spectral Class",
    y = "Magnitude"
  ) +
  scale_fill_discrete(name = "Spectral Class") +
  theme_minimal() +
  # Must be chained with `+`: as a standalone statement theme() is not
  # applied to the plot and merely prints a theme object.
  theme(panel.border = element_blank())

Heatmap clustering by temperature and magnitude

The heatmap shows where stars are concentrated across the axes of temperature and brightness. Darker areas represent high densities, indicating regions where many stars have similar temperature and magnitude, while lighter areas are where fewer stars share similar values. The concentration shown in the visualization can also reveal patterns such as the prevalence of cooler, dimmer stars versus hot, bright stars.

# 2-D binned counts on a 30-bin grid; deeper blue means more stars in a cell.
stars %>%
  ggplot(aes(x = temperature, y = magnitude)) +
  geom_bin2d(bins = 30) +
  labs(
    title = "Clusters of Stars by Temperature and Magnitude",
    x = "Temperature (K)",
    y = "Magnitude"
  ) +
  scale_fill_gradient(
    name = "Star Density",
    low = "turquoise",
    high = "dodgerblue4"
  ) +
  theme_minimal() +
  theme(panel.border = element_blank())

Contour map of star density across temperature and magnitude

A contour map is similar to a heatmap, but it is intended to give a smooth view of clustering patterns. It maps star concentration across temperature and magnitude. With the plasma palette used here, brighter (yellow) bands mark areas of higher density and darker (purple) bands mark areas of lower density. The legend uses scientific notation to represent very low densities, which is common due to the sparse distribution of stars.

# Filled 2-D kernel density contours of temperature against magnitude.
stars %>%
  ggplot(aes(x = temperature, y = magnitude)) +
  geom_density_2d_filled() +
  labs(
    title = "Mapping Star Concentrations by Temperature and Magnitude",
    x = "Temperature (K)",
    y = "Magnitude"
  ) +
  scale_fill_viridis_d(option = "plasma", name = "Density Level") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),  # slant x-axis labels 45 degrees
    panel.border = element_blank()
  )

Scatterplot matrix of temperature and magnitude

A scatterplot matrix allows an opportunity to look for correlations between variables by displaying all pairwise relationships. Each plot represents the relationship between two variables; clustering or patterns can indicate a correlation. For example, if temperature and magnitude form a linear pattern, there might be a significant correlation between these two aspects of a star’s properties.

# Base-graphics scatterplot matrix restricted to the two numeric star
# properties.
stars %>%
  select(temperature, magnitude) %>%
  pairs(main = "Exploring Interrelationships Among Star Properties")

Faceted plot by spectral class

Faceting by spectral class provides separate plots for each class, allowing a side-by-side comparison of the temperature and brightness trends within each class. This approach can reveal differences in temperature ranges or brightness distributions across classes. For instance, if one class has a wider spread in temperature or a different trend, it’s easier to see in a faceted plot. Visualizing through a faceted plot is a powerful way to compare groups directly without overlaying them, making patterns and outliers clearer within each class.

# One small scatter panel per spectral class for side-by-side comparison.
stars %>%
  ggplot(aes(x = temperature, y = magnitude, color = spectral_class)) +
  geom_point() +
  facet_wrap(vars(spectral_class)) +
  theme_minimal() +
  labs(
    title = "Class-Specific Patterns in Star Temperature and Brightness",
    x = "Temperature (K)",
    y = "Magnitude"
  )

Interactive visualization

Setup stellar catalog: fit linear model

# Simple linear regression of magnitude on temperature; its predictions
# supply the trend line drawn in the interactive chart.
lm_fit <- lm(magnitude ~ temperature, data = stars)
# Store the model's prediction for every star alongside the original columns.
stars[["fit"]] <- predict(lm_fit, newdata = stars)

Stellar Catalog

Key terms for the stellar catalog

# - Temperature (K): star surface temperature in kelvin (K), the standard unit for stellar temperature measurements.
# - Magnitude: star brightness scale, where lower numbers mean brighter stars and higher numbers mean dimmer ones.
# - Star: each point in the interactive catalog represents an individual star.
# - Spectral class: categorization of stars based on characteristics like temperature, using classes from hot to cool: O, B, A, F, G, K, M. A leading D is a sub-classification meaning dwarf star, e.g. DA, DF, etc.

# Interactive stellar catalog: one diamond marker per star, coloured by
# spectral class, with the fitted linear trend overlaid as a dashed line.
highchart() %>%
  hc_add_series(
    data = stars,
    type = "scatter",
    hcaes(x = temperature, y = magnitude, color = spectral_class, name = star),
    marker = list(symbol = "diamond", radius = 4),
    name = "Stellar Data"
  ) %>%
  hc_add_series(
    # Highcharts requires line-series points in ascending x order (it raises
    # error #15 otherwise), so the rows are sorted by temperature first.
    data = arrange(stars, temperature),
    type = "line", # trend line
    hcaes(x = temperature, y = fit),
    name = "Trend Line",
    color = "white",
    lineWidth = 1,
    dashStyle = "ShortDash"
  ) %>%
  hc_xAxis(title = list(text = "Temperature (K)")) %>%
  hc_yAxis(title = list(text = "Magnitude")) %>%
  hc_title(text = "Temperature and Brightness in the Stellar Catalog") %>%
  hc_subtitle(text = "Exploring the relationship between star temperature and magnitude") %>%
  hc_tooltip(pointFormat = "<b>{point.name}</b><br>Temperature: {point.x} K<br>Magnitude: {point.y}<br>Spectral Class: {point.spectral_class}") %>%
  hc_add_theme(hc_theme_darkunica()) %>%
  hc_chart(backgroundColor = "#4a4a4a")

Conclusion

I explored the stars dataset from the dslabs package to analyze relationships between temperature, magnitude (star brightness), and spectral class among various stars. Starting with setup, cleanup, and EDA, and then moving on to advanced exploration, I used multiple visualizations to examine the relationships among these stellar characteristics, with each plot offering a unique perspective and interpretation. As a highlight, I created the interactive stellar catalog with highcharter, where each point represents an individual star in the dataset. I added a trend line to display the general relationship between temperature and magnitude. The trend line suggests that, as stellar temperature increases, brightness tends to decrease. Overall, the catalog is user-friendly: anyone who interacts with it can explore each star’s details and view the trend between the two variables, which provides clear context.