options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(corrplot)
library(forecast)

data exploration

This dataset is a a collection of measures from a NASA solar flare observatory : Reuven Ramaty High Energy Solar Spectroscopic Imager (RHESSI, originally High Energy Solar Spectroscopic Imager or HESSI). It was launched in 2002 and its primary mission is to explore the physics of particle acceleration and energy release in solar flares.

df<-read.csv('hessi.solar.flare.2002to2016.csv',sep=',')
print(colnames(df))
##  [1] "flare"            "start.date"       "start.time"       "peak"             "end"             
##  [6] "duration.s"       "peak.c.s"         "total.counts"     "energy.kev"       "x.pos.asec"      
## [11] "y.pos.asec"       "radial"           "active.region.ar" "flag.1"           "flag.2"          
## [16] "flag.3"           "flag.4"           "flag.5"
print(any(is.na(df)))
## [1] FALSE
#missmap(df,cols=c('black','yellow'))

We see there is no NA values. We may re-arrange/create new features to make the plots easier to interpret. In particular :

df$energy.kev.ordered<-ordered(df$energy.kev, levels = c('3-6','6-12','12-25','25-50','50-100','100-300','300-800','800-7000','7000-20000'))
#ggplot(data=df,aes(x=radial)) + geom_histogram(bins=100) +scale_y_log10()
df$radialFactor<-cut(df$radial,breaks=c(-1,5000,10000,15000),labels=c('R1','R2','R3'))

Features distributions

Energy
ggplot(data=df,aes(x=energy.kev.ordered),alpha=.25) + geom_bar() + scale_y_log10()  + xlab('Energy range [keV]')

We see that most of the flares measures are from the range [6-12] keV. High energy rays (>1MeV) are less present.

Radial Distance
ggplot(data=df,aes(x=radial)) + geom_histogram(bins=200) + scale_y_log10() + xlab('Radial distance [m?]')

This plot show the motivation of creating a factor variable for the radial distance. I guess the units is in meters [s.i.]

Total counts
ggplot(data=df,aes(x=total.counts)) + geom_histogram(bins=200) + scale_y_log10() + xlab('counts in[6-12] keV integrated over time')

Duration
ggplot(data=df,aes(x=duration.s)) + geom_histogram(bins=200) + scale_y_log10() + xlab('Duration of the flare [sec.]')

Counts vs. duration

ggplot(data=df,aes(x=duration.s,y=total.counts)) + geom_point(aes(size=radialFactor,color=energy.kev.ordered),alpha=.25) + scale_y_log10() 

ggplot(data=df,aes(x=duration.s,y=total.counts)) + geom_point(aes(color=radialFactor),alpha=.25) + scale_y_log10() + facet_wrap(~energy.kev.ordered)

Counts vs. date

df$peakDate <- as.POSIXct(paste(df$start.date, df$peak), format="%Y-%M-%d %H:%M:%S")
df$year<-as.numeric(format(as.POSIXct(df$start.date,format="%Y-%m-%d"),"%Y"))
df$month<-as.numeric(format(as.POSIXct(df$start.date,format="%Y-%m-%d"),"%m"))
Number of Solar flares per year

According wikipedia :

The solar cycle or solar magnetic activity cycle is the nearly periodic 11-year change in the Sun's activity (including changes in the levels of solar radiation and ejection of solar material) and appearance (changes in the number and size of sunspots, flares, and other manifestations).

df %>% group_by(year,energy.kev.ordered) %>% summarise(number= n()) %>% ggplot(aes(x=factor(year),y=number,fill=energy.kev.ordered)) + geom_bar(stat='identity',position='dodge') + theme(legend.position='top')

11 year-cycle confirmed for all energies !

Number of Solar flares per month/year
res <- df %>% group_by(year,month) %>% summarise(number= n())
mymonths <- c("January","February","March","April","May","June","July","August","September","October","November","December")
res$MonthAbb <- mymonths[ res$month ]
res$ordered_month <- factor(res$MonthAbb, levels = month.name)
ggplot(data=res, aes(x=year,y=ordered_month)) + geom_tile(aes(fill = number),colour = "white") + scale_fill_gradient(low="steelblue", high="black") + theme(axis.title.y=element_blank(),axis.title.x=element_blank())

Prediction

#count the number of all Solar flares per month/year
#make a time serie
#apply Holt-Winter to predict at 5 months
byYearMonth<-as.data.frame(df %>% group_by(year,month) %>% summarise(number= n()))
myts <- ts(byYearMonth$number, start=c(2002, 2), end=c(2016, 12), frequency=12)
Time serie plot
plot(myts)

Decomposition of the time serie
plot(decompose(myts))

We see the 11-year cycle in the Trend component

Prediction with HoltWinters
model <- HoltWinters(myts, beta=FALSE, gamma=FALSE)
forecasts <- forecast.HoltWinters(model, h=12)
plot.forecast(forecasts)

Prediction with Exponential Smoothing(ETS)
model2 <-ets(myts)
plot(forecast(model2, 12))

Prediction with HoltWinters
model3<-auto.arima(myts)
plot(forecast(model3, 12))

History :

  • version 1 : initial commit : data exploration ,time serie analysis