# Required Libraries
library(readr)     # data import (read_csv)
library(dplyr)     # data manipulation
library(magrittr)  # %<>% compound-assignment pipe
library(ggplot2)   # plotting
library(shiny)     # interactive web apps
library(rstan)     # interface to Stan
library(shinystan) # interactive exploration of Stan fits
library(ggiraph)   # interactive ggplot graphics
library(gganimate) # animated ggplot graphics
library(reshape2)  # data reshaping
library(tidyr)     # tidy data tools (drop_na)
library(ggmap)     # maps with ggplot
data <- read_csv("D:/UCD/Advanced R/Assignment/Final Project/exo_data.csv")
Importing the CSV data file as a tibble using read_csv().
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3659 obs. of 25 variables:
## $ id : chr "KOI-1843.03" "Kepler-974 b" "KOI-1843.02" "Kepler-9 b" ...
## $ flag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mass : num 0.0014 NA NA 0.25 0.17 0.022 0.0321 NA 0.6 5.21 ...
## $ radius : num 0.054 0.14 0.071 0.84 0.82 0.147 NA 0.192 1.24 NA ...
## $ period : num 0.177 4.194 6.356 19.224 39.031 ...
## $ axis : num 0.0048 0.039 0.052 0.143 0.229 0.0271 0.053 NA 0.0449 1.33 ...
## $ ecc : num NA NA NA 0.0626 0.0684 NA 0.06 NA NA 0.15 ...
## $ per : num NA NA NA NA NA ...
## $ lon : num NA NA NA NA NA NA NA NA NA NA ...
## $ asc : num NA NA NA NA NA NA NA NA NA NA ...
## $ incl : num 72 89.4 88.2 87.1 87.2 ...
## $ temp : num NA NA NA 707 558 ...
## $ age : logi NA NA NA NA NA NA ...
## $ meth : chr "transit" "transit" "transit" "transit" ...
## $ year : num 2012 NA NA 2010 2010 ...
## $ recency : chr "13/07/15" "17/11/28" NA "15/12/03" ...
## $ r_asc : chr "19 00 03.14" "19 00 03.14" "19 00 03.14" "19 02 17" ...
## $ decl : chr "+40 13 14.7" "+40 13 14.7" "+40 13 14.7" "+38 24 03" ...
## $ dist : num NA NA NA 650 650 ...
## $ host_mass: num 0.52 0.52 0.52 1.07 1.07 1.07 0.69 0.83 1.07 0.82 ...
## $ host_rad : num 0.5 0.5 0.5 1.02 1.02 1.02 NA 0.79 NA NA ...
## $ host_met : num 0.07 0.07 0.07 0.12 0.12 0.12 NA -0.01 -0.02 -0.18 ...
## $ host_temp: num 3687 3687 3687 5777 5777 ...
## $ host_age : num NA NA NA NA NA NA NA NA NA NA ...
## $ lists : chr "Controversial" "Confirmed planets" "Controversial" "Confirmed planets" ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 5 variables:
## ..$ row : int 1712 2970
## ..$ col : chr "age" "age"
## ..$ expected: chr "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE"
## ..$ actual : chr "0.0055" "3.0"
## ..$ file : chr "'D:/UCD/Advanced R/Assignment/Final Project/exo_data.csv'" "'D:/UCD/Advanced R/Assignment/Final Project/exo_data.csv'"
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. flag = col_double(),
## .. mass = col_double(),
## .. radius = col_double(),
## .. period = col_double(),
## .. axis = col_double(),
## .. ecc = col_double(),
## .. per = col_double(),
## .. lon = col_double(),
## .. asc = col_double(),
## .. incl = col_double(),
## .. temp = col_double(),
## .. age = col_logical(),
## .. meth = col_character(),
## .. year = col_double(),
## .. recency = col_character(),
## .. r_asc = col_character(),
## .. decl = col_character(),
## .. dist = col_double(),
## .. host_mass = col_double(),
## .. host_rad = col_double(),
## .. host_met = col_double(),
## .. host_temp = col_double(),
## .. host_age = col_double(),
## .. lists = col_character()
## .. )
Using the str() function to get a brief overview of the data frame.
Columns 1, 16, 17, 18 and 25 are already of "character" type; however, columns 2, 14 and 15 need to be converted to more appropriate datatypes.
I use the "%<>%" operator from the "magrittr" package to convert these three variables (flag, meth, year) to suitable datatypes.
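As an aside, the two parsing problems reported for the age column (guessed as logical by read_csv()) could be avoided by declaring that column's type when reading the file. A minimal sketch, reusing the same path (not run here; data_typed is an illustrative name):
# Hypothetical re-read with an explicit type for 'age'; the other columns are still guessed
data_typed <- read_csv("D:/UCD/Advanced R/Assignment/Final Project/exo_data.csv",
                       col_types = cols(age = col_double()))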
# Columns 1, 16, 17, 18, 25 are already characters.
data$year %<>% as.integer # column 15 = year
data$flag %<>% as.factor # column 2 = flag
#unique(data[,14]) # There are 5 different levels and NA for col14=meth.
data$meth %<>% as.factor
data <- data %>% drop_na(meth)
Using drop_na() from the tidyr package to drop rows with a missing value in the "meth" (discovery method) variable.
The dataset is reduced to 3596 observations after removing the NAs in "meth".
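A quick tally of the remaining observations by discovery method (a small sketch using dplyr::count()):
# Number of exoplanets per discovery method after dropping missing 'meth'
data %>% count(meth, sort = TRUE)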
ggplot(data, aes(x=log(dist), fill=meth, color=meth)) +
geom_histogram(position="identity") +
labs(title="Log-Dist from Sun Histogram",x="Log-Dist from Sun (parsec)", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1409 rows containing non-finite values (stat_bin).
Using ggplot to create a histogram for the log-distances from the Sun, highlighting the methods of discovery by color.
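The stat_bin() message above suggests choosing the bin width explicitly. A variant of the same plot with a fixed binwidth and some transparency (the values 0.25 and 0.4 are arbitrary illustrations):
ggplot(data, aes(x = log(dist), fill = meth, color = meth)) +
  geom_histogram(position = "identity", binwidth = 0.25, alpha = 0.4) +
  labs(title = "Log-Dist from Sun Histogram",
       x = "Log-Dist from Sun (parsec)", y = "Count")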
Each planet's page on the Open Exoplanet Catalogue is reached by pasting its id after http://www.openexoplanetcatalogue.com/planet/ .
First, I create an "onclick" variable holding the website link for each planet. I then use ggplot to create a scatterplot of log-mass versus log-distance, separating the methods of discovery by color. Finally, geom_point_interactive() makes the points interactive: hovering with the cursor highlights a point and displays its name.
Clicking on a point opens the exoplanet's page on the Open Exoplanet Catalogue.
data$onclick <- sprintf("window.open(\"%s%s\")",
"http://www.openexoplanetcatalogue.com/planet/",
data$id)
gg_graph = ggplot(data,
aes(x = log(mass),
y = log(dist),
color = meth)) +
xlab('Log(Mass)') +
ylab('Log(Dist)') +
scale_color_discrete(name="Discovery Method")+
geom_point_interactive(aes(data_id = id,
tooltip = id,
onclick = onclick)) +
labs(title="Scatterplots of log-mass vs. log-distances")
ggiraph(code = print(gg_graph))
## Warning: Removed 2355 rows containing missing values
## (geom_interactive_point).
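In more recent versions of ggiraph, the ggiraph() function has been superseded by girafe(); an equivalent rendering call would be (a sketch, assuming an up-to-date ggiraph installation):
# girafe() replaces ggiraph() in newer versions of the package
girafe(ggobj = gg_graph)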
data <- data %>%
rename(jupiter_radius = radius ) # rename() function from tidyverse with pipe.
data <- data %>%
mutate(earth_radius = jupiter_radius / 11.2 )
Using rename() to rename radius to jupiter_radius.
Using mutate() to create the new variable earth_radius from jupiter_radius.
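A quick sanity check on the derived column (a small sketch):
# Distribution of the new earth_radius variable (NAs come from missing radii)
summary(data$earth_radius)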
First, I create a new data frame called "data_clustering" from "data", keeping only the rows where earth_radius and period have no missing values. I also add log-scaled versions of both variables. These two covariates alone are used to perform k-means clustering with 4 clusters.
# Create a new df for clustering, keeping only the rows where earth_radius and period are not missing
data_clustering <- data %>% drop_na(earth_radius, period) # 2732 obs
#log-radius of Earth and log-period
data_clustering <- data_clustering %>%
mutate(LogERadius = log(earth_radius),
LogPeriod = log(period))
# data to perform Kmeans
data_kmeans <- data_clustering %>%
select(LogERadius,LogPeriod)
# perform k-means
set.seed(123)
cluster_kmeans <- kmeans(data_kmeans, 4)
table(cluster_kmeans$cluster)
##
## 1 2 3 4
## 416 798 1133 385
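The choice of 4 clusters follows the reference figure cited below; as an informal check, one could compare the total within-cluster sum of squares over a range of k (a minimal sketch, reusing data_kmeans; the range 1:8 and nstart = 10 are illustrative):
# Total within-cluster sum of squares for k = 1..8 (elbow heuristic)
set.seed(123)
wss <- sapply(1:8, function(k) kmeans(data_kmeans, k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster SS")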
cluster_kmeans$cluster <- as.factor(cluster_kmeans$cluster)
ggplot(data_kmeans, aes(LogPeriod, LogERadius, color = cluster_kmeans$cluster)) + geom_point() +
labs(title="Clustering solutions of Exoplanets")
We visualize the clustering solution; the plot highlights the 4 clusters by color. Referring to https://en.wikipedia.org/wiki/Exoplanet#/media/File:ExoplanetPopulations-20170616.png , we add the cluster labels to the dataset.
Clusters 1, 2, 3 and 4 correspond to "cold_gas_giants", "others", "Rocky" and "hot_jupiters", respectively.
# Using https://en.wikipedia.org/wiki/Exoplanet#/media/File:ExoplanetPopulations-20170616.png we have:
# 1 = cold_gas_giants
# 2 = others
# 3 = Rocky
# 4 = hot_jupiters
data_clustering$type <- cluster_kmeans$cluster
data_clustering$type <- as.numeric(data_clustering$type)
data_clustering$type[data_clustering$type == 1] <- "cold_gas_giants"
data_clustering$type[data_clustering$type == 2] <- "others"
data_clustering$type[data_clustering$type == 3] <- "Rocky"
data_clustering$type[data_clustering$type == 4] <- "hot_jupiters"
table(cluster_kmeans$cluster)
##
## 1 2 3 4
## 416 798 1133 385
table(data_clustering$type) ## checking
##
## cold_gas_giants hot_jupiters others Rocky
## 416 385 798 1133
Using ggplot to illustrate how these clusters relate to the log-mass of the exoplanets: geom_histogram() for a histogram and geom_violin() for a violin plot.
# Histogram
ggplot(data_clustering, aes(x = log(mass))) +
geom_histogram(aes(color = type, fill = type),
position = "identity", bins = 30, alpha = 0.4) +
labs(title="Histogram of the log-mass of the exoplanet")
## Warning: Removed 2236 rows containing non-finite values (stat_bin).
# Violin
ggplot(data_clustering, aes(x = type, y = log(mass))) +
geom_violin() + labs(title="Violin plot of log-mass of the exoplanet")
## Warning: Removed 2236 rows containing non-finite values (stat_ydensity).
head(data$r_asc) # [hh mm ss]
## [1] "19 00 03.14" "19 00 03.14" "19 00 03.14" "19 02 17" "19 02 17"
## [6] "19 02 17"
head(data$decl) #Declination [+/-dd mm ss]
## [1] "+40 13 14.7" "+40 13 14.7" "+40 13 14.7" "+38 24 03" "+38 24 03"
## [6] "+38 24 03"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# convert r_asc to seconds and save as r_asc_sec
data$r_asc <- gsub(" ", ":", data$r_asc, fixed=TRUE) # convert to hh:mm:ss
data$r_asc <- hms(data$r_asc)
## Warning in .parse_hms(..., order = "HMS", quiet = quiet): Some strings
## failed to parse, or all strings are NAs
data$r_asc_sec <- period_to_seconds(data$r_asc)
# convert Declination to seconds and save as decl_sec
data$decl <- gsub(" ", ":", data$decl, fixed=TRUE) # convert to dd:mm:ss
data$decl <- hms(data$decl) # for declination, degrees are treated like hours (1 unit = 3600 seconds)
## Warning in .parse_hms(..., order = "HMS", quiet = quiet): Some strings
## failed to parse, or all strings are NAs
data$decl_sec <- period_to_seconds(data$decl)
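The parse warnings above indicate that some right-ascension/declination strings failed to convert via hms(). A more explicit alternative would be a small helper that splits the raw "dd mm ss" strings and keeps the sign of the leading field (a sketch; dms_to_seconds and raw_decl are hypothetical names introduced here):
# Hypothetical helper: convert "dd mm ss" (or "hh mm ss") strings to seconds,
# preserving a leading +/- sign on the first field.
dms_to_seconds <- function(x) {
  parts <- strsplit(x, " ", fixed = TRUE)
  sapply(parts, function(p) {
    if (length(p) < 3 || anyNA(suppressWarnings(as.numeric(p)))) return(NA_real_)
    sgn <- if (startsWith(p[1], "-")) -1 else 1
    sgn * (abs(as.numeric(p[1])) * 3600 + as.numeric(p[2]) * 60 + as.numeric(p[3]))
  })
}
# Usage example on the raw strings (before the gsub()/hms() steps above);
# 'raw_decl' stands for a copy of the original "+dd mm ss" column.
# data$decl_sec <- dms_to_seconds(raw_decl)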
# scatter plot represents a celestial map for the exoplanets
ggplot(data, aes(r_asc_sec, decl_sec, color= meth)) +
geom_point() +
labs(title="Celestial Map for Exoplanets", x="Right ascension (seconds)",
y="declination (seconds)")
## Warning: Removed 1 rows containing missing values (geom_point).
ts.data <- data %>% group_by(meth, year) %>% summarise(Count = length(meth)) %>%
mutate(Count = cumsum(Count))
ts.data <- na.omit(ts.data)
ggplot(ts.data, aes(x = year, y = Count, group = meth)) +
geom_line(aes(color = meth)) +
geom_point(size = 2) +
transition_reveal(year) +
labs(title = 'Evolution of the total number of exoplanets discovered by method', y = 'Number Discovered')
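The animated plot above is rendered implicitly when printed. To control the rendering and save it to a file, one could build the animation object explicitly and use gganimate's animate() and anim_save() (a sketch; the object name, frame settings and file name are illustrative):
# Build the animation object explicitly, then render and save it
anim_plot <- ggplot(ts.data, aes(x = year, y = Count, group = meth)) +
  geom_line(aes(color = meth)) +
  geom_point(size = 2) +
  transition_reveal(year) +
  labs(title = 'Evolution of the total number of exoplanets discovered by method',
       y = 'Number Discovered')
animate(anim_plot, nframes = 100, fps = 10)   # frame settings are illustrative
anim_save("exoplanet_discoveries.gif")        # file name is illustrative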
fileName <- "D:/UCD/Advanced R/Assignment/Final Project/Submit/MLR_Project.stan"
stan_code <- readChar(fileName, file.info(fileName)$size)
cat(stan_code)
## data {
## // Define all the data here
## int<lower=0> N; // number of observations
## int<lower=0> K; // number of explanatory variables
## matrix[N, K] x; // explanatory variables
## vector[N] y; // response variable
## }
## parameters {
## // Define parameters here
## real alpha; // intercept
## vector[K] beta; // slope
## real<lower=0> sigma; // residual sd
## }
## model {
## // Model Likelihood
## y ~ normal(alpha + x * beta, sigma);
## }
##
stan.data <- data[,c("host_mass","host_temp","axis", "period")] #select Covariates to stan.data
stan.data.complete <- na.omit(stan.data) #exclude rows that contain at least one missing value
stan.data.complete <- stan.data.complete %>% # Log scale of all Variables.
mutate(host_mass = log(host_mass),
host_temp = log(host_temp),
axis = log(axis),
period = log(period))
# to save time when you recompile an already compiled file:
rstan_options(auto_write = TRUE)
# Always good to enable parallel running if available:
options(mc.cores = parallel::detectCores())
# Set up your data into the correct format, save it as a list with the same names as in the stan file
data_mlr = list(N = nrow(stan.data.complete), # number of observations
K = 3, # number of explanatory variables
y = stan.data.complete$period,
x = as.matrix(stan.data.complete[,c("host_mass","host_temp","axis")])) # x now is matrix with [N,K] dimensions
# Call Model from separate Stan file
stan_model_mlr = stan_model('D:/UCD/Advanced R/Assignment/Final Project/Submit/MLR_Project.stan')
# Fit the model with optimizing() (maximum-likelihood version, since no priors are specified)
stan_run_mlr = optimizing(stan_model_mlr, data = data_mlr)
# Print the output likelihood maximisation on a regression model
print(stan_run_mlr)
## $par
## alpha beta[1] beta[2] beta[3] sigma
## 5.6691009 -0.4097280 0.0246718 1.4874893 0.2145052
##
## $value
## [1] 1316.948
##
## $return_code
## [1] 0
##
## $theta_tilde
## alpha beta[1] beta[2] beta[3] sigma
## [1,] 5.669101 -0.409728 0.0246718 1.487489 0.2145052
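Since this first Stan file specifies no priors, the optimizing() results are maximum-likelihood estimates and should closely match an ordinary least-squares fit on the same log-scaled complete cases (a sketch using lm(); lm_check is an illustrative name):
# OLS fit on the same data, for comparison with the MLE above
lm_check <- lm(period ~ host_mass + host_temp + axis, data = stan.data.complete)
summary(lm_check)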
fileName1 <- "D:/UCD/Advanced R/Assignment/Final Project/Submit/MLR_Project_Prior.stan"
stan_code1 <- readChar(fileName1, file.info(fileName1)$size)
cat(stan_code1)
## data {
## // Define all the data here
## int<lower=0> N; // number of observations
## int<lower=0> K; // number of explanatory variables
## matrix[N, K] x; // explanatory variables
## vector[N] y; // response variable
## }
## parameters {
## // Define parameters here
## real alpha; // intercept
## vector[K] beta; // slope
## real<lower=0> sigma; // residual sd
## }
## model {
## // Prior for Intercept alpha, standard Gaussian
## alpha ~ normal(0,1);
##
## // Prior for slope beta, standard Gaussian
## beta ~ normal(0,1);
##
## // Prior for Sigma, Gamma(1,1)
## sigma ~ gamma(1,1);
##
## // Model Likelihood
## y ~ normal(alpha + x * beta, sigma);
## }
##
stan_model_mlr_prior = stan_model('D:/UCD/Advanced R/Assignment/Final Project/Submit/MLR_Project_Prior.stan')
# The full Bayesian way
stan_run_lr_bayes = sampling(stan_model_mlr_prior,
data = data_mlr)
print(stan_run_lr_bayes)
## Inference for Stan model: MLR_Project_Prior.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
## alpha 5.23 0.01 0.28 4.67 5.05 5.23 5.41 5.78 1126
## beta[1] -0.42 0.00 0.02 -0.45 -0.43 -0.42 -0.41 -0.39 1570
## beta[2] 0.08 0.00 0.03 0.01 0.05 0.08 0.10 0.14 1123
## beta[3] 1.49 0.00 0.00 1.48 1.49 1.49 1.49 1.50 2770
## sigma 0.22 0.00 0.00 0.21 0.21 0.21 0.22 0.22 2286
## lp__ 1296.68 0.04 1.60 1292.67 1295.86 1297.02 1297.87 1298.76 1696
## Rhat
## alpha 1
## beta[1] 1
## beta[2] 1
## beta[3] 1
## sigma 1
## lp__ 1
##
## Samples were drawn using NUTS(diag_e) at Sun Mar 01 13:28:54 2020.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(stan_run_lr_bayes) # Not always helpful if parameters on very different scales
## ci_level: 0.8 (80% intervals)
## outer_level: 0.95 (95% intervals)
plot(stan_run_lr_bayes, show_density = TRUE, ci_level = 0.5, fill_color = "purple")
## ci_level: 0.5 (50% intervals)
## outer_level: 0.95 (95% intervals)
stan_hist(stan_run_lr_bayes)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot(stan_run_lr_bayes, plotfun = "trace")
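Since shinystan is loaded at the top of the script, the posterior draws could also be inspected interactively (a sketch; this opens a browser app and is typically run outside the knitted document):
# Interactive exploration of the fitted Stan model
launch_shinystan(stan_run_lr_bayes)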