# Load necessary libraries
# These calls attach the packages used in this analysis (ggplot2 draws the plots below).
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Simulate a hypothetical dataset of planets
# Every column is drawn at random and independently of the others, so the data
# only mimic the layout of planetary attributes and a habitability flag.
set.seed(42)  # Fixed seed so the same draws are produced on every run
n_planets <- 200  # Number of simulated planets (one row per planet)
planet_data <- data.frame(
  # Random solar radiation values between 0 and 1000
  Solar_Radiation = runif(n_planets, min = 0, max = 1000),
  # Treated as the proportion of breathable air, between 0 and 1
  Atmospheric_Composition = runif(n_planets, min = 0, max = 1),
  # Distance in arbitrary units, between 0.1 and 10
  Distance_from_Star = runif(n_planets, min = 0.1, max = 10),
  # Binary habitability flag (0 or 1), sampled with replacement
  Habitability_Status = sample(c(0, 1), size = n_planets, replace = TRUE)
)

# Scatter plot of solar radiation against distance from the star
# Each planet is one point; a fitted straight line is overlaid to show any
# linear trend between the two simulated variables.
# NOTE(review): the x-axis label says AU, while the dataset comment describes
# the distance as arbitrary units -- confirm which is intended.
ggplot(
  data = planet_data,
  mapping = aes(x = Distance_from_Star, y = Solar_Radiation)
) +
  geom_point() +
  # Linear-model fit drawn in blue to approximate the relationship
  geom_smooth(method = "lm", colour = "blue") +
  theme_minimal() +
  labs(
    title = "Distance from Star vs. Solar Radiation",
    x = "Distance from Star (AU)",
    y = "Solar Radiation (arbitrary units)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Fit a linear model of habitability on the three planetary attributes
# The response is the 0/1 habitability flag; the three predictors enter
# additively, with no interaction terms.
# NOTE(review): with a binary response a logistic regression is the usual
# choice; lm is kept here so the fit matches the recorded summary output.
model <- lm(
  formula = Habitability_Status ~
    Solar_Radiation + Atmospheric_Composition + Distance_from_Star,
  data = planet_data
)
# Show the coefficient table and fit statistics to judge predictor significance
summary(model)
## 
## Call:
## lm(formula = Habitability_Status ~ Solar_Radiation + Atmospheric_Composition + 
##     Distance_from_Star, data = planet_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6173 -0.4992  0.3758  0.4890  0.5906 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.3895914  0.1120100   3.478 0.000622 ***
## Solar_Radiation         0.0000281  0.0001227   0.229 0.819102    
## Atmospheric_Composition 0.0610043  0.1217741   0.501 0.616960    
## Distance_from_Star      0.0162089  0.0122814   1.320 0.188442    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5024 on 196 degrees of freedom
## Multiple R-squared:  0.01018,    Adjusted R-squared:  -0.004967 
## F-statistic: 0.6722 on 3 and 196 DF,  p-value: 0.5701
# Residuals-vs-fitted diagnostic plot
# Residuals should scatter randomly around zero (the dashed red line) if the
# linear model is an adequate description of the data.
# The fitted values and residuals are collected into their own data frame so
# the plot maps columns of the data it is given, instead of reaching into the
# model object with `$` while an unrelated data frame is passed as `data`.
model_diagnostics <- data.frame(
  fitted_value = fitted(model),
  residual = residuals(model)
)
ggplot(model_diagnostics, aes(x = fitted_value, y = residual)) +
  geom_point() +
  # Reference line at zero residual
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs. Fitted Values",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()

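# Conclusion: Interpreting the model
# None of the three predictors is statistically significant (every p-value in the
# summary above is greater than 0.18), and the overall F-test (p = 0.5701) and the
# multiple R-squared (0.01018) show that the model explains almost none of the
# variation in habitability. This is expected, because the habitability status was
# assigned at random, independently of the other simulated attributes.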