library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
library(ggthemes)
library(effsize)
library(GGally)
library(ggplot2)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(dplyr)
library(lindia)
library(broom)
# Load the cleaned exoplanet catalogue (comma-separated) into a tibble.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
nasa_data <- read_delim(
  file = "C:/Users/imaya/Downloads/cleaned_5250.csv",
  delim = ","
)
## Rows: 5250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, planet_type, mass_wrt, radius_wrt, detection_method
## dbl (8): distance, stellar_magnitude, discovery_year, mass_multiplier, radiu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the column types after import.
head(nasa_data, n = 6L)
## # A tibble: 6 × 13
## name distance stellar_magnitude planet_type discovery_year mass_multiplier
## <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 11 Coma… 304 4.72 Gas Giant 2007 19.4
## 2 11 Ursa… 409 5.01 Gas Giant 2009 14.7
## 3 14 Andr… 246 5.23 Gas Giant 2008 4.8
## 4 14 Herc… 58 6.62 Gas Giant 2002 8.14
## 5 16 Cygn… 69 6.22 Gas Giant 1996 1.78
## 6 17 Scor… 408 5.23 Gas Giant 2020 4.32
## # ℹ 7 more variables: mass_wrt <chr>, radius_multiplier <dbl>,
## # radius_wrt <chr>, orbital_radius <dbl>, orbital_period <dbl>,
## # eccentricity <dbl>, detection_method <chr>
For this dataset, I created a time variable from the discovery year to analyze how exoplanet discoveries have changed over time. I grouped the data by discovery year and detection method, counted the number of discoveries in each group, and then converted the year into a date format for time-based analysis.
# Count discoveries for every (discovery year, detection method) pair.
# `date` (1 January of the discovery year) is kept alongside the year so that
# plots can use a proper date axis.
method_year <- nasa_data |>
  count(discovery_year, detection_method) |>
  mutate(date = as.Date(paste0(discovery_year, "-01-01")))

# Build the tsibble on the numeric year rather than on `date`.
# With a Date index the 1-January dates are 365 or 366 days apart, so tsibble
# infers a one-day interval and fill_gaps() would insert a zero row for every
# calendar day between two years (tens of thousands of artificial rows, all
# with a missing `discovery_year`). A numeric year index gives a yearly
# interval, so only genuinely missing years are filled.
method_ts <- method_year |>
  as_tsibble(index = discovery_year, key = detection_method) |>
  # `n` is an integer count, so fill missing years with an integer zero.
  fill_gaps(n = 0L) |>
  # Rows inserted by fill_gaps() have no `date` yet; rebuild it for all rows.
  mutate(date = as.Date(paste0(discovery_year, "-01-01")))
library(ggplot2)
# Line chart of yearly discovery counts, one coloured line per detection method.
ggplot(
  data = method_ts,
  mapping = aes(x = date, y = n, colour = detection_method)
) +
  geom_line() +
  ggtitle("Exoplanet Discoveries Over Time by Detection Method") +
  xlab("Year") +
  ylab("Number of Discoveries")
The plot shows the number of exoplanets discovered each year with each detection method. The Transit method dominates, peaking at roughly 1,400 planets in a single year, while most other methods stay below 500 discoveries. This difference may reflect both how effective the Transit method is at detecting planets and the time period in which the discoveries were made. Earlier methods, such as pulsar timing, account for fewer detections. The Transit method works by observing a star and detecting the small dips in brightness that occur when a planet passes in front of it.
# Same yearly counts as a faded line, overlaid with a loess trend per
# detection method (no confidence band) to make the overall shape easier to see.
ggplot(
  data = method_ts,
  mapping = aes(x = date, y = n, colour = detection_method)
) +
  geom_line(alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle("Exoplanet Discoveries Over Time (Smoothed)") +
  xlab("Year") +
  ylab("Number of Discoveries") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Linear trend of the per-method yearly discovery count against discovery year.
# Fit on the observed counts in `method_year`, not on the gap-filled
# `method_ts`: this way the model uses exactly the observed (year, method)
# rows and does not depend on how fill_gaps() populated the inserted rows
# (or on lm() silently dropping rows with a missing predictor).
model <- lm(n ~ discovery_year, data = method_year)
summary(model)
##
## Call:
## lm(formula = n ~ discovery_year, data = method_ts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.07 -48.22 -27.45 3.29 1397.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5944.617 4130.488 -1.439 0.153
## discovery_year 2.976 2.053 1.450 0.150
##
## Residual standard error: 157.6 on 116 degrees of freedom
## (59065 observations deleted due to missingness)
## Multiple R-squared: 0.0178, Adjusted R-squared: 0.009335
## F-statistic: 2.102 on 1 and 116 DF, p-value: 0.1498
The smoothed plot above shows trends in exoplanet discoveries over time by detection method. In the earlier years, there were relatively few discoveries, likely because many detection methods had not yet been fully developed or widely used.
As time progresses, the number of discoveries increases significantly, with a noticeable peak around 2016. After this point, the number of discoveries begins to decline slightly. This decrease may reflect the increasing difficulty of detecting additional exoplanets, especially smaller or more distant ones, once many of the easier-to-detect planets have already been found.
# Total number of planets discovered per year, one row per discovery year.
# summarise() already returns exactly one row per year, so no de-duplication
# step is needed; arrange() keeps the rows in chronological order.
year_data <- nasa_data |>
  summarise(planet_count = n(), .by = discovery_year) |>
  arrange(discovery_year)
year_data
## # A tibble: 31 × 2
## discovery_year planet_count
## <dbl> <int>
## 1 1992 2
## 2 1994 1
## 3 1995 1
## 4 1996 6
## 5 1997 1
## 6 1998 6
## 7 1999 13
## 8 2000 16
## 9 2001 12
## 10 2002 29
## # ℹ 21 more rows
# Autocorrelation of the yearly discovery counts.
# acf() treats its input as equally spaced observations, so a year with no
# discoveries must appear as an explicit zero; otherwise "lag k" would not
# mean "k years apart". Complete the year range before computing the ACF.
yearly_counts <- year_data |>
  complete(
    discovery_year = full_seq(discovery_year, period = 1),
    fill = list(planet_count = 0L)
  ) |>
  arrange(discovery_year)

# `ci` is forwarded to plot.acf(). na.pass keeps any missing value in place
# instead of dropping it, which would shift the lags.
acf(
  yearly_counts$planet_count,
  ci = 0.95,
  na.action = na.pass,
  main = "ACF of Yearly Exoplanet Discoveries"
)