library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Welcome to the space journey!

EXPLORATORY DATA ANALYSIS

The main objective of this exploratory data analysis (EDA) is to analyze astronaut mission data to identify patterns of collaboration, specifically focusing on which astronauts traveled together on specific missions. Additionally, the goal is to perform hypothesis testing to examine potential gender differences within the military category, and to investigate the timeline of international space collaborations by analyzing astronaut nationality and mission data.

# Load the raw astronaut mission records; readr guesses the delimiter.
astro <- read_delim(file = "/Users/sneha/H510-Statistics/astronaut-data.csv")
## Rows: 1277 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, sex, nationality, military_civilian, selection, occupation, ...
## dbl (13): id, number, nationwide_number, year_of_birth, year_of_selection, m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows that have no missing value in any column.
astro_clean <- astro |>
  drop_na()

Let's find out which astronauts travelled together on space missions.

# Missions flown by more than 4 astronauts, with the crew listed per mission.
# Crew size is the number of rows in each mission group (n()); this assumes
# one row per astronaut per mission. It is deliberately NOT derived by
# counting commas in the collapsed name string: names are stored as
# "Last, First" (sometimes with a ", Jr." suffix), so a comma count would
# over-count the crew.
astronauts_together <- astro_clean |>
  group_by(mission_title, ascend_shuttle, year_of_mission) |>
  filter(n() > 4) |>  # Only keep missions with more than 4 astronauts
  summarise(astronauts = paste(name, collapse = ", "), .groups = "drop")

# Print the result
print(astronauts_together)
## # A tibble: 170 × 4
##    mission_title ascend_shuttle year_of_mission astronauts                      
##    <chr>         <chr>                    <dbl> <chr>                           
##  1 3             STS-105                   2001 Culbertson, Frank L., Jr., Dezh…
##  2 42            Soyuz TMA-15M             2014 Virts, Terry W., Jr., Cristofre…
##  3 49            Soyuz MS-02               2016 Kimbrough, Robert S., Borisenko…
##  4 50            Soyuz MS-03               2016 Whitson, Peggy A., Novitski, Ol…
##  5 52            Soyuz MS-05               2017 Nespoli, Paolo A., Bresnik, Ran…
##  6 53            Soyuz MS-06               2017 Acaba, Joseph M., Misurkin, Ale…
##  7 54            Soyuz MS-07               2017 Shkaplerov, Anton, Tingle, Scot…
##  8 55            Soyuz MS-08               2018 Arnold, Richard R., II, Feustel…
##  9 56            Soyuz MS-09               2018 Gerst, Alexander, Prokopyev, Se…
## 10 57            Soyuz MS-11               2018 Kononenko, Oleg D., Saint-Jaque…
## # ℹ 160 more rows
# Store the crew size of each mission.
# The size is recomputed from astro_clean (one row per astronaut per mission
# is assumed) instead of counting commas in `astronauts`, because the names
# themselves contain commas ("Last, First") and would inflate the count.
crew_sizes <- astro_clean |>
  count(mission_title, ascend_shuttle, year_of_mission, name = "num_astronauts")

astronauts_together <- astronauts_together |>
  select(-any_of("num_astronauts")) |>  # avoid a duplicate column on re-runs
  left_join(
    crew_sizes,
    by = c("mission_title", "ascend_shuttle", "year_of_mission")
  )

# Limit to top 50 data points (based on number of astronauts)
astronauts_together_top_50 <- astronauts_together |>
  arrange(desc(num_astronauts)) |>
  head(50)

# Bar chart of crew size per mission, coloured by ascent vehicle. The crew
# list is mapped to the `text` aesthetic so that it can be used as a tooltip.
p <- astronauts_together_top_50 |>
  ggplot(
    aes(
      x = mission_title,
      y = num_astronauts,
      fill = ascend_shuttle,
      text = astronauts
    )
  ) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "Top 50 Missions with More Than 4 Astronauts Traveling Together",
    x = "Mission Title",
    y = "Number of Astronauts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Convert to an interactive widget; hovering shows only the astronaut names.
interactive_plot <- ggplotly(p, tooltip = "text")

# Show the interactive plot
interactive_plot

I filtered the top 50 missions with more than 4 astronauts, and it turns out that a lot of shuttles carried more than 4. So I decided to focus on the two extremes instead: shuttles that carried a single astronaut and shuttles that carried the largest number of astronauts (the minimum and the maximum).

# One row per mission (title, ascent vehicle, year) with the crew names, their
# nationalities and the crew size.
# astronaut_count is the number of rows in the group (n()); this assumes one
# row per astronaut per mission. Counting commas in the collapsed name string
# is wrong here because names are stored as "Last, First" and therefore
# contain commas themselves.
shuttle_summary <- astro_clean |>
  group_by(mission_title, ascend_shuttle, year_of_mission) |>
  summarise(
    astronauts = paste(name, collapse = ", "),
    nationalities = paste(nationality, collapse = ", "),
    astronaut_count = n(),
    .groups = "drop"
  )

# Ascend shuttle with exactly one astronaut
shuttle_one_astronaut <- shuttle_summary |>
  filter(astronaut_count == 1)

# Ascend shuttle with the highest number of astronauts
# (slice_max keeps ties, so more than one mission can be returned)
shuttle_highest_astronauts <- shuttle_summary |>
  slice_max(astronaut_count, n = 1)

Ascend shuttle with exactly one astronaut

# Display the missions whose astronaut_count is exactly 1.
shuttle_one_astronaut
## # A tibble: 13 × 6
##    mission_title   ascend_shuttle year_of_mission astronauts       nationalities
##    <chr>           <chr>                    <dbl> <chr>            <chr>        
##  1 2               STS 102                   2001 Yuri Vladimirov… U.S.S.R/Russ…
##  2 34              Soyuz TMA-07M             2012 Chris Hadfield   Canada       
##  3 Mir EO-15       Soyuz TM-18               1994 Yuri Vladimirov… U.S.S.R/Russ…
##  4 Mir EO-21       Soyuz TM-23               1996 Yuri Vladimirov… U.S.S.R/Russ…
##  5 STS 101         STS 101                   2000 Yuri Vladimirov… U.S.S.R/Russ…
##  6 Salyut 6        Soyuz 37                  1980 Pham Tuan        Vietnam      
##  7 Shenzhou 10     Shenzhou 10               2008 Nie Haisheng     China        
##  8 Soyuz 28        Soyuz 28                  1978 Vladimir Remek   Czechoslovak…
##  9 Soyuz 30        Soyuz 30                  1978 Miroslaw Hermas… Poland       
## 10 Soyuz 40        Soyuz 40                  1981 Dumitru Prunariu Romania      
## 11 Soyuz TM-12/11  Soyuz TM-12               1991 Helen Sharman    U.K.         
## 12 Soyuz TMA-16/14 Soyuz TMA-16              2009 Guy Laliberte    Canada       
## 13 sts-46          STS-46                    1992 Franco Malerba   Italy        
## # ℹ 1 more variable: astronaut_count <dbl>

Ascend shuttle with the highest number of astronauts

# Display the mission(s) with the largest astronaut_count (ties are kept).
shuttle_highest_astronauts
## # A tibble: 3 × 6
##   mission_title ascend_shuttle year_of_mission astronauts          nationalities
##   <chr>         <chr>                    <dbl> <chr>               <chr>        
## 1 STS-39        STS-39                    1991 Bluford, Guion S.,… U.S., U.S., …
## 2 STS-55        STS-55                    1993 Nagel, Steven R., … U.S., U.S., …
## 3 STS-95        STS-95                    1998 Glenn, John H., Jr… U.S., U.S., …
## # ℹ 1 more variable: astronaut_count <dbl>
# Bar chart of the mission(s) with the largest crew. Each bar is annotated
# with the crew's names and nationalities.
library(ggplot2)

shuttle_highest_astronauts |>
  ggplot(
    aes(
      x = reorder(ascend_shuttle, astronaut_count),
      y = astronaut_count
    )
  ) +
  geom_col(fill = "darkred", alpha = 0.8) +
  geom_text(
    aes(
      label = paste(
        "Astronauts:", astronauts,
        "\nNationalities:", nationalities
      )
    ),
    vjust = -0.5,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Ascend Shuttle with the Highest Number of Astronauts",
    subtitle = "Details of the mission with the highest astronaut count",
    x = "Ascend Shuttle",
    y = "Astronaut Count",
    caption = "Data source: Astro dataset"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The second bar graph shows missions whose crews combine several nationalities. This leads us to take a deeper look at international space collaboration and when it started.

Missions between different nations have increased collaboration post-2000.

# Number of distinct nationalities on board each mission, ordered by year.
travel_together_2000 <- astro_clean |>
  group_by(mission_title, year_of_mission) |>
  summarise(
    nationality_count = n_distinct(nationality),
    .groups = "drop"
  ) |>
  arrange(year_of_mission)

# Stacked bars: every mission contributes its nationality count to its year.
p <- travel_together_2000 |>
  ggplot(
    aes(
      x = year_of_mission,
      y = nationality_count,
      fill = nationality_count
    )
  ) +
  geom_col(position = "stack") +
  labs(
    title = "Yearly contribution analysis",
    x = "year_of_mission",
    y = "Nationality count",
    fill = "Nationality"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    plot.title = element_text(size = 10, face = "bold")
  )

# Interactive version of the same chart.
interactive_plot <- ggplotly(p)
interactive_plot

After running a random-sampling analysis on this dataset, I discovered an interesting fact about women in space. Let's explore it:

Random sampling:

# Each subsample holds half as many rows as the cleaned data (rounded down).
sample_size <- floor(nrow(astro_clean) / 2)
sample_size
## [1] 637
# Draw four subsamples of `sample_size` rows each, sampling rows of
# astro_clean with replacement.
# The seed is fixed so that the subsamples -- and every table and plot derived
# from them -- are identical between runs; without it, findings described in
# the text (for example, which records land in subsample 1) cannot be
# reproduced.
set.seed(510)
row_ids <- seq_len(nrow(astro_clean))  # safe even if astro_clean has 0 rows
df_1 <- astro_clean[sample(row_ids, sample_size, replace = TRUE), ]
df_2 <- astro_clean[sample(row_ids, sample_size, replace = TRUE), ]
df_3 <- astro_clean[sample(row_ids, sample_size, replace = TRUE), ]
df_4 <- astro_clean[sample(row_ids, sample_size, replace = TRUE), ]

Scrutinize the subsamples using GROUP BY:

Here I group each subsample by sex and year of mission, and compute the total number of missions recorded for each gender category in each year.

# Subsample 1, per year and sex: the number of sampled mission records
# (total_missions) and the mean of total_number_of_missions (mean_missions).
# .groups = "drop" returns an ungrouped tibble, so later verbs are not
# silently applied per year, and it silences the regrouping message.
df1_summary <- df_1 |>
  group_by(year_of_mission, sex) |>
  summarize(
    total_missions = n(),
    mean_missions = mean(total_number_of_missions),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year_of_mission'. You can override using
## the `.groups` argument.
# Subsample 2, per year and sex: the number of sampled mission records
# (total_missions) and the mean of total_number_of_missions (mean_missions).
# .groups = "drop" returns an ungrouped tibble, so later verbs are not
# silently applied per year, and it silences the regrouping message.
df2_summary <- df_2 |>
  group_by(year_of_mission, sex) |>
  summarize(
    total_missions = n(),
    mean_missions = mean(total_number_of_missions),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year_of_mission'. You can override using
## the `.groups` argument.
# Subsample 3, per year and sex: the number of sampled mission records
# (total_missions) and the mean of total_number_of_missions (mean_missions).
# .groups = "drop" returns an ungrouped tibble, so later verbs are not
# silently applied per year, and it silences the regrouping message.
df3_summary <- df_3 |>
  group_by(year_of_mission, sex) |>
  summarize(
    total_missions = n(),
    mean_missions = mean(total_number_of_missions),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year_of_mission'. You can override using
## the `.groups` argument.
# Subsample 4, per year and sex: the number of sampled mission records
# (total_missions) and the mean of total_number_of_missions (mean_missions).
# .groups = "drop" returns an ungrouped tibble, so later verbs are not
# silently applied per year, and it silences the regrouping message.
df4_summary <- df_4 |>
  group_by(year_of_mission, sex) |>
  summarize(
    total_missions = n(),
    mean_missions = mean(total_number_of_missions),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year_of_mission'. You can override using
## the `.groups` argument.

Using bar graphs to plot the total number of missions for each gender category in each year.

Red bars = Female

Blue bars = Male

# Subsample 1: number of mission records per year, split by sex.
# The rows are aggregated before plotting. Drawing the raw rows with
# stat = "identity" stacks one bar per record on top of the others in the same
# year/sex slot, so only the largest individual total_number_of_missions
# would be visible instead of a per-year total.
df_1 |>
  count(year_of_mission, sex, name = "total_missions") |>
  ggplot(aes(x = factor(year_of_mission), y = total_missions, fill = sex)) +
  geom_col(position = position_dodge()) +
  labs(title = "Subsample 1: Missions Over the Years grouped by Sex",
       x = "Year",
       y = "Total Missions") +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90, hjust = 1))

# Subsample 2: number of mission records per year, split by sex.
# The rows are aggregated before plotting. Drawing the raw rows with
# stat = "identity" stacks one bar per record on top of the others in the same
# year/sex slot, so only the largest individual total_number_of_missions
# would be visible instead of a per-year total.
df_2 |>
  count(year_of_mission, sex, name = "total_missions") |>
  ggplot(aes(x = factor(year_of_mission), y = total_missions, fill = sex)) +
  geom_col(position = position_dodge()) +
  labs(title = "Subsample 2: Missions Over the Years grouped by Sex",
       x = "Year",
       y = "Total Missions") +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90, hjust = 1))

# Subsample 3: number of mission records per year, split by sex.
# The rows are aggregated before plotting. Drawing the raw rows with
# stat = "identity" stacks one bar per record on top of the others in the same
# year/sex slot, so only the largest individual total_number_of_missions
# would be visible instead of a per-year total.
df_3 |>
  count(year_of_mission, sex, name = "total_missions") |>
  ggplot(aes(x = factor(year_of_mission), y = total_missions, fill = sex)) +
  geom_col(position = position_dodge()) +
  labs(title = "Subsample 3: Missions Over the Years grouped by Sex",
       x = "Year",
       y = "Total Missions") +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90, hjust = 1))

# Scatter view of subsample 3: one point per sampled record, positioned at the
# record's mission year and total_number_of_missions, coloured by sex.
# The title names subsample 3 because the data plotted is df_3 (the previous
# title said "Subsample 1").
ggplot(df_3, aes(x = factor(year_of_mission), y = total_number_of_missions, color = sex)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "Subsample 3: Missions Over the Years Grouped by Sex",
    x = "Year",
    y = "Total Missions"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Subsample 4: number of mission records per year, split by sex.
# The rows are aggregated before plotting. Drawing the raw rows with
# stat = "identity" stacks one bar per record on top of the others in the same
# year/sex slot, so only the largest individual total_number_of_missions
# would be visible instead of a per-year total.
df_4 |>
  count(year_of_mission, sex, name = "total_missions") |>
  ggplot(aes(x = factor(year_of_mission), y = total_missions, fill = sex)) +
  geom_col(position = position_dodge()) +
  labs(title = "Subsample 4: Missions Over the Years grouped by Sex",
       x = "Year",
       y = "Total Missions") +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90, hjust = 1))

Finding out who that female astronaut was:

# Female records in subsample 1 whose mission year lies in 1960-1980
# (both bounds included).
df_1_anomaly <- df_1 |>
  filter(
    sex == "female",
    year_of_mission >= 1960,
    year_of_mission <= 1980
  )
df_1_anomaly
## # A tibble: 0 × 23
## # ℹ 23 variables: id <dbl>, number <dbl>, nationwide_number <dbl>, name <chr>,
## #   sex <chr>, year_of_birth <dbl>, nationality <chr>, military_civilian <chr>,
## #   selection <chr>, year_of_selection <dbl>, mission_number <dbl>,
## #   total_number_of_missions <dbl>, occupation <chr>, year_of_mission <dbl>,
## #   mission_title <chr>, ascend_shuttle <chr>, in_orbit <chr>,
## #   descend_shuttle <chr>, hours_mission <dbl>, total_hrs_sum <dbl>,
## #   field21 <dbl>, eva_hrs_mission <dbl>, total_eva_hrs <dbl>

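In the run that motivated this section, this filter showed that Russia (then the U.S.S.R.) had sent a female pilot to space in the year 1963. Because the subsamples are drawn at random, the table above can be empty in a run where her record is not drawn into subsample 1.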

Searching for this fact on the internet, I found that Valentina Tereshkova is a Soviet cosmonaut and the first woman to travel into space. On June 16, 1963, she was launched in the spacecraft Vostok 6, which completed 48 orbits in 71 hours. This information aligns with our analysis and matches the record retrieved from the dataset.

This finding strengthens our confidence in the authenticity of the dataset and underscores the accuracy of our analysis. It illustrates that the dataset is credible and can be used to uncover significant narratives and trends.

Insights and Conclusion

  • With the help of the bar graphs, we could identify trends in space missions over time. Missions started around 1960 and peaked between 1980 and 2010. We were also able to find and validate who the first woman to go to space was.

  • We could also identify and compare temporal patterns across subsamples and detect anomalies, such as female representation in the year 1963, when Russia sent a female pilot to space.

  • We could also find that there is a declining interest in space missions among nations at present.

Hypothesis

Here we are concentrating on female astronauts and their military status:

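To test whether sex is associated with military/civilian status, we first apply Fisher's exact test to the sex × military-status contingency table, and then fit a logistic regression model with sex, year of birth and total number of missions as predictors. An interaction term between sex and total number of missions is not fitted here; it is left as a possible refinement.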

Null Hypothesis (H₀):
An astronaut’s gender has no influence on the likelihood of having a military background.

Alternative Hypothesis (H₁):
Being a male astronaut significantly increases the likelihood of having a military background.

creating a binary variable:

We convert the “military_civilian” column to a binary variable: 1 for military, 0 for civilian. I chose the military_civilian variable because it can easily be converted to a binary variable, as the column contains either “military” or “civilian”.

# Encode military status as a 0/1 indicator: 1 where military_civilian is
# "military", 0 otherwise.
astro_clean <- astro_clean |>
  mutate(military_binary = ifelse(military_civilian == "military", 1, 0))

# Cross-tabulate sex (rows) against the indicator (columns).
contingency_table <- table(
  astro_clean[["sex"]],
  astro_clean[["military_binary"]]
)

# Fisher's exact test on the 2 x 2 table (two-sided by default).
fisher_test_result <- fisher.test(contingency_table)

fisher_test_result
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  3.488181 7.982270
## sample estimates:
## odds ratio 
##   5.228219
# Display the sex-by-military_binary cross-tabulation given to fisher.test().
contingency_table
##         
##            0   1
##   female 106  37
##   male   400 731

0: Represents not being in the military

1: Represents being in the military

Row 1 (FEMALE): 106 females are in category 0 (not in the military)| 37 females are in category 1 (in the military)

Row 2 (MALE): 400 males are in category 0(not in the military) | 731 males are in category 1(in the military)

A significantly larger proportion of males (731 out of 1,131 total males) are in category 1 compared to females (37 out of 143 total females)

This suggests a gender disparity in the outcome variable - military involvement

Opportunities for Women:

The data indicates lower representation of females in category 1, emphasizing the need to explore why fewer women are in this category and what can be done to encourage their participation.

The Fisher’s Exact Test was performed to assess the association between gender and the likelihood of being in the military.

Analysis:

P-value:

The p-value is reported as < 2.2e-16, which is extremely small and well below the common significance level of 0.05. This indicates strong evidence against the null hypothesis. Therefore, we reject the null hypothesis that there is no association between gender and military status.

The alternative hypothesis states that the true odds ratio is not equal to 1, meaning there is a significant association between gender and military status.

Odds ratio:

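The estimated odds ratio is 5.23. This implies that the odds of being in the military are approximately 5.23 times higher for males than for females. Note that this is a ratio of odds, not of probabilities, and that Fisher's exact test considers only these two variables, so no other factors are adjusted for.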

Confidence interval:

The confidence interval for the odds ratio is (3.49, 7.98). Since this range does not include 1, it further supports the conclusion that the odds of being in the military differ significantly between genders.

Conclusion:

The test results provide strong statistical evidence that gender is significantly associated with military status. Specifically, males are significantly more likely to be in the military compared to females.

Logistic regression:

Building a logistic regression model

Building a logistic regression model with up to three explanatory variables.

Using “military_binary” as the response variable, and explanatory variables like “sex”, “year_of_birth”, “total_number_of_missions”, we create a logistic regression model.

# Logistic regression of the 0/1 military indicator on sex, year of birth and
# total number of missions (main effects only, no interaction terms).
logistic_model <- glm(military_binary ~ sex + year_of_birth + total_number_of_missions, data = astro_clean, family = binomial)
# Print the fitted call, the coefficient table, the deviances and the AIC.
summary(logistic_model)
## 
## Call:
## glm(formula = military_binary ~ sex + year_of_birth + total_number_of_missions, 
##     family = binomial, data = astro_clean)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               9.350351  10.353022   0.903    0.366    
## sexmale                   1.620785   0.203805   7.953 1.83e-15 ***
## year_of_birth            -0.005262   0.005285  -0.996    0.319    
## total_number_of_missions -0.034297   0.042234  -0.812    0.417    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1711.9  on 1273  degrees of freedom
## Residual deviance: 1631.5  on 1270  degrees of freedom
## AIC: 1639.5
## 
## Number of Fisher Scoring iterations: 4

The logistic regression model assesses the relationship between being in the military and predictors: sex, year of birth, and total number of missions.

Key coefficients:

sexmale:

  • p-value : 1.83e-15

  • estimate : 1.620785

  • std error : 0.203805

  • Highly significant; males are much more likely to be in the military.

year_of_birth:

  • p-value : 0.319

  • estimate : -0.005262

  • std error : 0.005285

  • Not significant; year of birth has no clear impact on military status.

total_number_of_missions:

  • p-value : 0.417

  • estimate : -0.034297

  • std error : 0.042234

  • Not significant; total missions show no association with military status.

Significance:

sexmale is the only predictor with a highly significant p-value, indicating a strong association between being male and being in the military.

Other predictors have p-values > 0.05, suggesting they do not have a statistically significant impact on military status.

Conclusion:

The analysis confirms a significant relationship between sex and military status, with males being much more likely to be in the military.

Neither year of birth nor total number of missions significantly impacts the likelihood of being in the military.

Actionable Conclusions and Recommendations

Sex as a Strong Predictor for Military Status

Conclusion:

The logistic regression results show that sex is a highly significant predictor of being in the military, with males being much more likely to be in the military than females (odds ratio = 5.06). This is a strong indicator that gender disparities exist in military participation among astronauts.

Recommendations:

Research: Investigating underlying causes of this disparity, such as societal norms, educational opportunities, or institutional biases, may help identify specific areas to focus on for future improvement.

Model Improvement and Further Research:

The model currently identifies sex as the primary factor influencing military status, while other predictors like year of birth and total number of missions do not show a significant relationship.

For Model Refinement: Future models could incorporate additional predictors, such as physical fitness or something similar, to improve prediction accuracy. Interaction terms between sex and other factors (e.g., year of birth, training background) might uncover more significant relationships.