# Load necessary libraries
# I want to ensure that the required libraries are available for this analysis.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulate a hypothetical dataset of planets ----
# Every attribute is drawn at random and independently of the others, so the
# data only mimic the shape of a planetary survey; no relationship between the
# columns is built in.
set.seed(42) # fixed seed so the simulated values are reproducible
n_planets <- 200 # single source of truth for the number of simulated rows
planet_data <- data.frame(
  # Solar radiation, uniform on [0, 1000]
  Solar_Radiation = runif(n_planets, 0, 1000),
  # Proportion of breathable air, uniform on [0, 1]
  Atmospheric_Composition = runif(n_planets, 0, 1),
  # Distance from the star in arbitrary units, uniform on [0.1, 10]
  Distance_from_Star = runif(n_planets, 0.1, 10),
  # Binary habitability flag (0 or 1), sampled with replacement
  Habitability_Status = sample(c(0, 1), n_planets, replace = TRUE)
)
# Scatter plot of solar radiation against distance from the star ----
# The plotted y variable is Solar_Radiation. A least-squares line is overlaid
# to show any linear trend between the two simulated variables.
ggplot(planet_data, aes(x = Distance_from_Star, y = Solar_Radiation)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") + # straight-line (lm) fit
  labs(
    title = "Distance from Star vs. Solar Radiation",
    # Distance is simulated in arbitrary units, so the label must not claim AU.
    x = "Distance from Star (arbitrary units)",
    y = "Solar Radiation (arbitrary units)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Fit a linear model of habitability on the planetary attributes ----
# Habitability_Status is regressed on Solar_Radiation, Atmospheric_Composition
# and Distance_from_Star with lm(), i.e. ordinary least squares.
# NOTE(review): Habitability_Status is coded 0/1, so lm() acts here as a linear
# probability model; a logistic regression (glm with family = binomial) may be
# the more appropriate tool -- confirm before relying on the inference below.
model <- lm(Habitability_Status ~ Solar_Radiation + Atmospheric_Composition + Distance_from_Star, data = planet_data)
summary(model) # print coefficient estimates, standard errors and p-values
##
## Call:
## lm(formula = Habitability_Status ~ Solar_Radiation + Atmospheric_Composition +
## Distance_from_Star, data = planet_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6173 -0.4992 0.3758 0.4890 0.5906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3895914 0.1120100 3.478 0.000622 ***
## Solar_Radiation 0.0000281 0.0001227 0.229 0.819102
## Atmospheric_Composition 0.0610043 0.1217741 0.501 0.616960
## Distance_from_Star 0.0162089 0.0122814 1.320 0.188442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 196 degrees of freedom
## Multiple R-squared: 0.01018, Adjusted R-squared: -0.004967
## F-statistic: 0.6722 on 3 and 196 DF, p-value: 0.5701
# Residuals vs. fitted values ----
# Checks whether the residuals scatter randomly around zero (dashed red line).
# The fitted values and residuals are gathered into their own data frame so
# that every aesthetic maps to a column of the plotted data, rather than
# reaching into the model object from inside aes().
residual_data <- data.frame(
  fitted_value = fitted(model),
  residual = residuals(model)
)
ggplot(residual_data, aes(x = fitted_value, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs. Fitted Values",
    x = "Fitted Values",
    y = "Residuals"
  ) +
  theme_minimal()

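# Conclusion: Interpreting the model
# I reviewed the results to determine which variables significantly influence
# habitability: none of them do. Every predictor has a p-value above 0.18, the
# overall F-test is not significant (p = 0.5701), and the multiple R-squared of
# 0.01018 means the model explains almost none of the variation. This is
# expected, because Habitability_Status was simulated at random, independently
# of the other planetary attributes.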