Libraries to use.

library(plyr)
library(ggplot2)
library(plotly)
library(DT)
library(ggExtra)
library(GGally)
library(knitr)
library(measurements)
library(gtools)
library(ggfortify)
library(olsrr)

1 Probability and Statistics

1.1 Sampling Variability

A dataset containing Height and Weight information for 1000 male and female. Get the data from https://www.kaggle.com/mustafaali96/weight-height and place in the same folder as the RMarkdown script.

mydata<-read.csv("weight-height.csv")
mydata$Height<-conv_unit(mydata$Height, "inch", "cm") # Use measurement library  
mydata$Height<-round(mydata$Height,2)
mydata$Weight<-conv_unit(mydata$Weight, "lbs", "kg")
mydata$Weight<-round(mydata$Weight,2)
summary(mydata)

    Gender         Height          Weight      
 Female:5000   Min.   :137.8   Min.   : 29.35  
 Male  :5000   1st Qu.:161.3   1st Qu.: 61.61  
               Median :168.4   Median : 73.12  
               Mean   :168.6   Mean   : 73.23  
               3rd Qu.:175.7   3rd Qu.: 84.90  
               Max.   :200.7   Max.   :122.47

# Use DT library
datatable(mydata, options = list(pageLength = 10, 
                                  dom = 'tip', 
                                  scrollY = 500,
                                  scroller = TRUE, 
                                  scrollX = TRUE,
                                  fixedColumns = TRUE), width = "100%", height = "100%")

Population consisting of 10000 members. The mean for the population height is

mean(mydata$Height)

[1] 168.5736

Take a simple random sample of size 10.

x<-sample(mydata$Height,10)

Find its mean.

mean(x)

[1] 162.384

Repeat this 1000 times.

heightSM<-0
smapleSize<-10
rep<-1000
set.seed(100)
for (i in 1:rep) {
     heightSM[i]<-mean(sample(mydata$Height,smapleSize))
}
heightSM<-as.data.frame(heightSM)
kable(head(heightSM), caption = "A few of the Mean Heights") # Use knitr library

A few of the Mean Heights
heightSM
172.909
170.725
163.015
170.169
167.784
172.352

Plot a histogram of the repeated resampling procedure.

# Use ggplot2 library
p <- ggplot(data=heightSM, aes(x=heightSM)) + 
     geom_histogram(aes(y =..density..), position="identity", 
                    breaks=seq(min(heightSM$heightSM), max(heightSM$heightSM), 
                               length.out = 20), 
                    alpha=.5, color="light blue", fill="red") + 
     geom_density(alpha=.3, fill="brown", col="black") + 
     geom_vline(xintercept=mean(heightSM$heightSM), linetype="dashed", color = "red", size=1) +
     geom_vline(xintercept=mean(mydata$Height), linetype="dashed", color = "blue", size=1) +
     annotate(geom="text", x=162, y=0.108, label="Population Mean",
              color="blue") + 
     geom_curve(aes(x = 162.5, y = 0.11, xend = mean(mydata$Height), yend = 0.14), arrow = arrow(length = unit(0.5, "cm")), curvature = -0.2)+ 
  labs(title="Sample Mean Histogram")+
  xlab(paste("Mean Height for Each Sample of Size", smapleSize))
p

#ggplotly(p)

1.2 Simple Random Sampling

Take a sample of size n from a vector x.

n<-12
x<-seq(1, 60)
(y<-sample(x, n, replace = FALSE))

 [1] 40  2 24 58 37 48 16 56 13 42  5 23

1.3 Basic Statistical Visualisation

Produce a histogram of height.

# Use plotly library
p <- ggplot(data=mydata, aes(x=Height)) + 
     geom_histogram(position="identity", 
                    breaks=seq(min(mydata$Height), max(mydata$Height), 
                               length.out = 20), 
                    alpha=.5, color="light blue", fill="red") + 
  labs(title="Population Height Histogram")+
  xlab("Height")
ggplotly(p)

Histogram for height with density on the y-axis and a smooth curve. Adjusted so the histogram’s area (i.e., the total area defined by the boxes) equals one.

p <- ggplot(data=mydata, aes(x=Height)) + 
     geom_histogram(aes(y =..density..), position="identity", 
                    breaks=seq(min(mydata$Height), max(mydata$Height), 
                               length.out = 20), 
                    alpha=.5, color="light blue", fill="red") + 
     geom_density(alpha=.3, fill="brown", col="black") + 
  labs(title="Population Height Histogram")+
  xlab("Height")
ggplotly(p)

stem(mydata$Height)


  The decimal point is 1 digit(s) to the right of the |

  13 | 899
  14 | 011122222233333444444444444444
  14 | 55555555555555555666666666666666667777777777777777777777777777888888+73
  15 | 00000000000000000000000000000000000000000000000000000000000000111111+462
  15 | 55555555555555555555555555555555555555555555555555555555555555555555+1139
  16 | 00000000000000000000000000000000000000000000000000000000000000000000+1571
  16 | 55555555555555555555555555555555555555555555555555555555555555555555+1695
  17 | 00000000000000000000000000000000000000000000000000000000000000000000+1651
  17 | 55555555555555555555555555555555555555555555555555555555555555555555+1360
  18 | 00000000000000000000000000000000000000000000000000000000000000000000+880
  18 | 55555555555555555555555555555555555555555555555555555555555555555555+297
  19 | 00000000000000000000000000000000001111111111111111111112222222222222+17
  19 | 55555555556666777899
  20 | 01

p <- plot_ly(data = mydata, x = ~Height, y = ~Weight, color = ~Gender, type = 'scatter', mode = 'markers', marker = list(size=8))
p

# Use ggextra library
p<- ggplot(data = mydata, mapping = aes(x = Height, y = Weight, colour = Gender)) + 
    geom_point() + stat_smooth(method = "lm")
ggMarginal(p, type = c("violin"), margins = c("both"), size = 5,
  xparams = list(), yparams = list(), groupColour = TRUE,
  groupFill = TRUE)

1.3.1 Dataset: Bear Information

br <- read.table("http://media.pearsoncmg.com/cmg/pmmg_mml_shared/mathstatsresources/Akritas/BearsData.txt", header=T)
datatable(br, options = list(pageLength = 10, 
                                  dom = 'tip', 
                                  scrollY = 500,
                                  scroller = TRUE, 
                                  scrollX = TRUE,
                                  fixedColumns = TRUE), width = "100%", height = "100%")

Scatter plots for pairs.

ggpairs(br, columns=4:8, aes(color = Sex, alpha = .5))

Correlation plots.

ggcorr(br[,4:8], palette = "RdBu", label = TRUE)

1.3.2 Dataset Iris

The Iris flower data set or Fisher’s Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 https://en.wikipedia.org/wiki/Iris_flower_data_set. The dataset contains a set of 150 records under five attributes - petal length, petal width, sepal length, sepal width and species.

datatable(iris, options = list(pageLength = 10, 
                                  dom = 'tip', 
                                  scrollY = 500,
                                  scroller = TRUE, 
                                  scrollX = TRUE,
                                  fixedColumns = TRUE), width = "100%", height = "100%")

ggpairs(iris, aes(color = Species, alpha = .5))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p <- ggplot(iris, aes(x = Species, y = Sepal.Length, colour = Species)) + geom_boxplot()
ggplotly(p)

p <- plot_ly(iris) %>% 
          add_markers(x=~Sepal.Length, y=~Sepal.Width, z=~Petal.Length, 
                      color= ~Species, 
                      mode = 'marker', 
                      name = paste('Species', iris$Species),
                      text = paste('Petal Width', iris$Petal.Width),
                      marker = list(size = ~Petal.Width*15)
                                    )
p <- p %>% layout(
               scene = list(
                    camera=list(
                         eye = list(x=0.8, y=1.5, z=1)
                    ),
                    title = 'Iris Data',  
                    xaxis = list(title = 'Sepal Length'),
                    yaxis = list(title = 'Sepal Width'),
                    zaxis = list(title = 'Petal Length'))
               , autosize = T
          )
p

ggcorr(iris, palette = "RdBu", label = TRUE)

1.4 Populations/Samples Parameters

set.seed(1)
x<-sample( c("red", "blue", "green"), 1000, replace = TRUE, prob = c(0.1, 0.6, 0.3))
table(x)

x
 blue green   red 
  606   284   110

prop.table(table(x))

x
 blue green   red 
0.606 0.284 0.110

set.seed(1)
y<-sample(x,100)
table(y)

y
 blue green   red 
   79    12     9

prop.table(table(y))

y
 blue green   red 
 0.79  0.12  0.09

(x<-seq(-1, 1, by = 0.1))

 [1] -1.0 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1  0.0  0.1  0.2  0.3
[15]  0.4  0.5  0.6  0.7  0.8  0.9  1.0

mean(x)

[1] 5.283162e-17

var(x) # sample variance

[1] 0.385

sd(x) # sample standard deviation

[1] 0.6204837

mean((x-mean(x))^2) # Population varinace

[1] 0.3666667

mean((x-mean(x))^2)==(length(x)-1)*var(x)/length(x)

[1] TRUE

1.5 Population/Sample Variance

heightSV<-0
heightPV<-0
smapleSize<-5
rep<-1000
for (i in 1:rep) {
     heightSample<-sample(mydata$Height,smapleSize)
     heightSV[i]<-var(heightSample)
     heightPV[i]<-mean((heightSample-mean(heightSample))^2)
}
heightSPVar<-cbind.data.frame(heightSV, heightPV)
kable(head(heightSPVar), caption = "A few of the Sample/Population Variance Heights")

A few of the Sample/Population Variance Heights
heightSV	heightPV
258.72013	206.97610
37.39507	29.91606
158.52195	126.81756
114.71627	91.77302
83.59060	66.87248
87.68842	70.15074

Plot a histogram of the repeated resampling procedure.

heightSPVarP<-cbind.data.frame(height= c(heightSV, heightPV),
                               computation=factor(rep(c("Sample Variance", "Population Variance"), each=rep)))
popVari<-mean((mydata$Height-mean(mydata$Height))^2)
mu <- ddply(heightSPVarP, "computation", summarise, grp.mean=mean(height))
mu[3,]<-popVari
mu[,1]<-as.factor(c("Population Variance", "Sample Variance", "True Variance"))
names(mu)[1]<-"computation.mean"
p <- ggplot(data=heightSPVarP, aes(x=height, fill=computation, color=computation)) + 
     geom_histogram(aes(y =..density..),
                    position="identity", 
                    breaks=seq(min(heightSPVarP$height), 
                               max(heightSPVarP$height), 
                               length.out = 20), 
                    alpha=.5) + 
     geom_density(alpha=.2) + 
     geom_vline(data=mu, aes(xintercept=grp.mean, color=computation.mean),
           linetype="dashed", size=1) +
  labs(title="Variance Histogram")+
  xlab(paste("Variance Height for Each Sample of Size", smapleSize))
ggplotly(p)

Population variance is biased for samples, this is shown by the red line lagging behind the blue line. The True variance and sample variance almost match more closely.

mu

     computation.mean grp.mean
1 Population Variance 76.45993
2     Sample Variance 95.57491
3       True Variance 95.49720

1.6 Probability

set.seed(1)
z<-0
rep<-10000
n<-1000
for (i in 1:rep) {
     x<-sample(1:6,n, replace = TRUE)
     y<-sample(1:6,n, replace = TRUE)
     z[i]<-nrow(cbind(x,y)[x==6 & y==6,])/n
}

dataz<-as.data.frame(z)

# Use ggplot2 library
p <- ggplot(data=dataz, aes(x=z)) + 
     geom_histogram(aes(y =..density..), position="identity", 
                    breaks=seq(min(dataz$z), max(dataz$z), 
                               length.out = 20), 
                    alpha=.5, color="light blue", fill="red") + 
     geom_density(alpha=.3, fill="brown", col="black") + 
     geom_vline(xintercept=mean(dataz$z), linetype="dashed", color = "red", size=1) +
     geom_vline(xintercept=1/36, linetype="dashed", color = "blue", size=1) +
     annotate(geom="text", x=0.015, y=55, label="Theoretical Probability 1/36",
              color="blue") + 
     geom_curve(aes(x = 0.0155, y = 60, xend = mean(dataz$z), yend = 65), arrow = arrow(length = unit(0.5, "cm")), curvature = -0.2)+ 
  labs(title="Probability Histogram")+
  xlab(paste("Probability of getting two sixes"))
p

#ggplotly(p)

Factorial

n=4
factorial(n)

[1] 24

Combination of \(k\) units out of \(n\).

n=4
k=2
choose(n, k)

[1] 6

To actually have the combinations and permutations use gtools library

# Use gtools library
n=4
k=2
combinations(n, k, v=1:n, set=TRUE, repeats.allowed=FALSE)

     [,1] [,2]
[1,]    1    2
[2,]    1    3
[3,]    1    4
[4,]    2    3
[5,]    2    4
[6,]    3    4

permutations(n, k, v=1:n, set=TRUE, repeats.allowed=FALSE)

      [,1] [,2]
 [1,]    1    2
 [2,]    1    3
 [3,]    1    4
 [4,]    2    1
 [5,]    2    3
 [6,]    2    4
 [7,]    3    1
 [8,]    3    2
 [9,]    3    4
[10,]    4    1
[11,]    4    2
[12,]    4    3

For Exe 10.

choose(4, 2)*choose(4, 1)/choose(52,3)

[1] 0.001085973

2 Random Variables

n=10
prob=0.2
rbinom(n,1,prob)

 [1] 1 0 0 0 0 0 1 1 0 0

runif(n,min=0,max=1)

 [1] 0.69601752 0.78701285 0.88207892 0.34780817 0.20874892 0.01817447
 [7] 0.12665581 0.97636847 0.61139940 0.60878280

2.1 Distributions

p for “probability”, the cumulative distribution function (c. d. f.)
q for “quantile”, the inverse c. d. f.
d for “density”, the density function (p. f. or p. d. f.)
r for “random”, a random variable having the specified distribution

Distribution	Functions
Beta	pbeta qbeta dbeta rbeta
Binomial	pbinom qbinom dbinom rbinom
Cauchy	pcauchy qcauchy dcauchy rcauchy
Chi-Square	pchisq qchisq dchisq rchisq
Exponential	pexp qexp dexp rexp
F	pf qf df rf
Gamma	pgamma qgamma dgamma rgamma
Geometric	pgeom qgeom dgeom rgeom
Hypergeometric	phyper qhyper dhyper rhyper
Logistic	plogis qlogis dlogis rlogis
Log Normal	plnorm qlnorm dlnorm rlnorm
Negative Binomial	pnbinom qnbinom dnbinom rnbinom
Normal	pnorm qnorm dnorm rnorm
Poisson	ppois qpois dpois rpois
Student	t pt qt dt rt
Studentized Range	ptukey qtukey dtukey rtukey
Uniform	punif qunif dunif runif
Weibull	pweibull qweibull dweibull rweibull
Wilcoxon Rank Sum Statistic	pwilcox qwilcox dwilcox rwilcox
Wilcoxon Signed Rank Statistic	psignrank qsignrank dsignrank rsignrank

-exp(-0.005*600)+exp(-0.005*300)

[1] 0.1733431

1-exp(-0.005*600)

[1] 0.9502129

pexp(600,0.005)

[1] 0.9502129

1-exp(-0.005*300)

[1] 0.7768698

pexp(300,0.005)

[1] 0.7768698

2.2 Discrete

n=15
p=0.7
prob=dbinom(1:n, n, prob=p)
#prob=dgeom(1:n, 0.7)

df <- data.frame(x=1:n, prob)
df$csum<-cumsum(df$prob)

p1 <- plot_ly(data = df, x = ~x, y = ~prob, color = ~prob, type = 'scatter', mode = 'markers', marker = list(size=8), name = 'Probability')

p2 <- plot_ly(data = df, x = ~x, y = ~csum, color = ~csum, type = 'scatter', mode = 'markers', marker = list(size=8), name = 'Cumulative')

 subplot(p1, p2)

Binomial Bin(15, 0.7)

2.3 Continous

mu<-0.5
sigmad<-1
x<-seq(mu-4*sigmad  , mu+4*sigmad, length=50)
prob<-dnorm(x, mean=mu, sd=sigmad) # Change 

df <- data.frame(x, prob)
df$csum<-cumsum(df$prob)

p1 <- plot_ly(data = df, x = ~x, y = ~prob, color = ~prob, type = 'scatter', mode = 'markers+lines', marker = list(size=5), line = list(width = 5, color="cyan"),name = 'Probability')

p2 <- plot_ly(data = df, x = ~x, y = ~csum, color = ~csum, type = 'scatter', mode = 'markers+lines', marker = list(size=5), line = list(width = 5, color="cyan"), name = 'Cumulative')

subplot(p1, p2)

Normal N(0.5, 1)

2.4 Integrate

f=function(x){1/((x+1)*sqrt(x))}; integrate(f, lower=0, upper=5)

2.300524 with absolute error < 0.00011

f=function(x){1/x**2}; integrate(f, lower=1, upper=Inf)

1 with absolute error < 1.1e-14

3 Pearson’s Correlation and Regression

3.1 Correlation

n <- 200 # True correlation 1
m <- 1
x <- rnorm(n, 5, 1)
sigmac <- 2
noise <- rnorm(n, 0, sigmac)
y <- 10 + m*x + noise
df <- data.frame(x, y)

df%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~y, marker = list(size=5), color = ~y, name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(y~x, data = df)), 
            line = list(width = 4, color="orange"), 
            name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))

cor(x,y)*sd(y)/sd(x) # m

[1] 1.090688

cor(x,y) # cor

[1] 0.4309702

cor.test(x, y)


    Pearson's product-moment correlation

data:  x and y
t = 6.7204, df = 198, p-value = 1.885e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3108137 0.5375682
sample estimates:
      cor 
0.4309702

# For x choose y= y, z, t, v, for x1 use u
height <- x
weight <- y
df1<-data.frame(height,weight)

linearMod <- lm(weight ~ height, data=df1) 
summary(linearMod)


Call:
lm(formula = weight ~ height, data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.6909 -1.3864  0.1708  1.4898  8.3337 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   9.4157     0.8338   11.29  < 2e-16 ***
height        1.0907     0.1623    6.72 1.89e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.259 on 198 degrees of freedom
Multiple R-squared:  0.1857,    Adjusted R-squared:  0.1816 
F-statistic: 45.16 on 1 and 198 DF,  p-value: 1.885e-10

anova(linearMod)

Analysis of Variance Table

Response: weight
           Df  Sum Sq Mean Sq F value    Pr(>F)    
height      1  230.38 230.379  45.164 1.885e-10 ***
Residuals 198 1009.98   5.101                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")

shapiro.test(linearMod$residuals)


    Shapiro-Wilk normality test

data:  linearMod$residuals
W = 0.99191, p-value = 0.3324

p <- ggplot(data=data.frame(residuals=linearMod$residuals), 
            aes(x=residuals)) +
     geom_histogram(aes(y =..density..), position="identity", 
                    breaks=seq(min(linearMod$residuals), 
                               max(linearMod$residuals), 
                               length.out = 15), 
                    alpha=.5, color="light blue", fill="red") + 
     geom_density(alpha=.3, fill="brown", col="black") + 
  labs(title="Residual Histogram")
ggplotly(p)

ols_test_normality(linearMod)

-----------------------------------------------
       Test             Statistic       pvalue  
-----------------------------------------------
Shapiro-Wilk              0.9919         0.3324 
Kolmogorov-Smirnov        0.0407         0.8948 
Cramer-von Mises         11.5763         0.0000 
Anderson-Darling          0.3397         0.4957 
-----------------------------------------------

ols_test_f(linearMod)


 F Test for Heteroskedasticity
 -----------------------------
 Ho: Variance is homogenous
 Ha: Variance is not homogenous

 Variables: fitted values of weight 

      Test Summary        
 -------------------------
 Num DF     =    1 
 Den DF     =    198 
 F          =    2.713106 
 Prob > F   =    0.1011132

ols_test_score(linearMod)


 Score Test for Heteroskedasticity
 ---------------------------------
 Ho: Variance is homogenous
 Ha: Variance is not homogenous

 Variables: fitted values of weight 

        Test Summary         
 ----------------------------
 DF            =    1 
 Chi2          =    2.703467 
 Prob > Chi2   =    0.1001303

In 1973, Frank Anscombe published a set of examples showing situations where the results of r could be misleading. (Anscombe FJ, “Graphs in Statistical Analysis,” The American Statistician, 27, 17-21).

The examples involves the analysis of four sets of paired data where all of them will give the same value for r. However, the scatterplot of each set reveals how different the relationships between pairs of variables are.

n <- 20 # True correlation 1
m <- 1
x <- seq(0, 5, length=n)
sigmac <- 0.5
noise <- rnorm(n, 0, sigmac)
y <- m*x + noise
v <- m*x + noise*x
df <- data.frame(x, y, v); datareg <- df
p1 <- df%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~y, marker = list(size=6), 
              #color = ~y, 
              name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2))) %>% 
  layout(xaxis = list(range = c(-1, 6)))

n <- 20 # False correlation, Non-linear relationship
x <- seq(0, 5, length=n)
y <- x^2/5
df <- data.frame(x, y); datareg <- cbind.data.frame(datareg, z=y)
p2 <- df%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~y, marker = list(size=6), 
              #color = ~y, 
              name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>% 
  layout(xaxis = list(range = c(-1, 6)))

n <- 20 # False correlation, Outlier
m <- 0.6
x <- seq(0, 5, length=n)
y <- m*x
y[19] <- y[19]+3
df <- data.frame(x, y) ; datareg <- cbind.data.frame(datareg, t=y)
p3 <- df%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~y, marker = list(size=6), 
              #color = ~y, 
              name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>% 
  layout(xaxis = list(range = c(-1, 6)))

n <- 20 # False correlation, Outlier
x <- c(rep(1,n-1),5)
y <- c(rnorm(n-1, 0, 0.5) ,5)
df <- data.frame(x, y); datareg <- cbind.data.frame(datareg, x1=x, u=y)
p4 <- df%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~y, marker = list(size=6), 
              #color = ~y, 
              name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>% 
  layout(xaxis = list(range = c(-1, 6)))

p5 <- datareg%>% 
plot_ly(x = ~x) %>% 
  add_markers(y = ~v, marker = list(size=6), 
              #color = ~y, 
              name = 'y') %>% 
  add_lines(x = ~x, y = fitted(lm(v~x, data = datareg)), line = list(width = 4), name = paste("Cor", round(cor(datareg$x,v),2), ",", "Cov", round(cov(datareg$x,v),2)))%>% 
  layout(xaxis = list(range = c(-1, 6)))

subplot(p1,p2,p3,p4,p5, nrows = 3, shareX = TRUE, shareY = TRUE)

3.2 Regression

# For x choose y= y, z, t, v, for x1 use u
linearMod <- lm(v ~ x, data=datareg) 
summary(linearMod)


Call:
lm(formula = v ~ x, data = datareg)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.2332 -0.4799  0.1090  0.6271  1.6003 

Coefficients:
            Estimate Std. Error t value    Pr(>|t|)    
(Intercept)  -0.2061     0.4459  -0.462       0.649    
x             1.1516     0.1525   7.552 0.000000551 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.035 on 18 degrees of freedom
Multiple R-squared:  0.7601,    Adjusted R-squared:  0.7468 
F-statistic: 57.04 on 1 and 18 DF,  p-value: 0.000000551

anova(linearMod)

Analysis of Variance Table

Response: v
          Df Sum Sq Mean Sq F value      Pr(>F)    
x          1 61.071  61.071  57.037 0.000000551 ***
Residuals 18 19.273   1.071                        
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.3 Model Checking

autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")

shapiro.test(linearMod$residuals)


    Shapiro-Wilk normality test

data:  linearMod$residuals
W = 0.9782, p-value = 0.9087

nrow(cars)

[1] 50

head(cars, 4)

  speed dist
1     4    2
2     4   10
3     7    4
4     7   22

linearMod <- lm(dist ~ speed, data=cars) 
summary(linearMod)


Call:
lm(formula = dist ~ speed, data = cars)

Residuals:
    Min      1Q  Median      3Q     Max 
-29.069  -9.525  -2.272   9.215  43.201 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -17.5791     6.7584  -2.601   0.0123 *  
speed         3.9324     0.4155   9.464 1.49e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.38 on 48 degrees of freedom
Multiple R-squared:  0.6511,    Adjusted R-squared:  0.6438 
F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

anova(linearMod)

Analysis of Variance Table

Response: dist
          Df Sum Sq Mean Sq F value   Pr(>F)    
speed      1  21186 21185.5  89.567 1.49e-12 ***
Residuals 48  11354   236.5                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

p6 <- cars%>% 
plot_ly(x = ~speed) %>% 
  add_markers(y = ~dist, marker = list(size=6), 
              #color = ~y, 
              name = 'dist') %>% 
  add_lines(x = ~speed, y = fitted(lm(dist~speed, data = cars)), line = list(width = 4), name = paste("Cor", round(cor(cars$speed,cars$dist),2), ",", "Cov", round(cov(cars$speed,cars$dist),2)))
p6

3.4 Residuals

p <- ggplot(cars, aes(x = speed, y = dist)) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_segment(aes(xend = speed, yend =predict(linearMod) ), alpha = .4) +
  # > Color AND size adjustments made here...
  geom_point(aes(color = abs(linearMod$residuals), size = abs(linearMod$residuals))) + # size also mapped
  scale_color_continuous(low = "black", high = "red") +
  guides(color = FALSE, size = FALSE) +  # Size legend also removed
  geom_point(aes(y = predict(linearMod)), shape = 1)
ggplotly(p, tooltip = c("x","y", "colour", "text", "label"))

p <- ggplot(data=data.frame(residuals=linearMod$residuals), 
            aes(x=residuals)) +
     geom_histogram(aes(y =..density..), position="identity", 
                    breaks=seq(min(linearMod$residuals), 
                               max(linearMod$residuals), 
                               length.out = 15), 
                    alpha=.5, color="light blue", fill="red") + 
     geom_density(alpha=.3, fill="brown", col="black") + 
  labs(title="Residual Histogram")
ggplotly(p)

autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")

shapiro.test(linearMod$residuals)


    Shapiro-Wilk normality test

data:  linearMod$residuals
W = 0.94509, p-value = 0.02152

library(datarium)
data("marketing", package = "datarium")
head(marketing, 5)

  youtube facebook newspaper sales
1  276.12    45.36     83.04 26.52
2   53.40    47.16     54.12 12.48
3   20.64    55.08     83.16 11.16
4  181.80    49.56     70.20 22.20
5  216.96    12.96     70.08 15.48

model <- lm(sales ~ youtube + facebook , data = marketing)
summary(model)


Call:
lm(formula = sales ~ youtube + facebook, data = marketing)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.5572  -1.0502   0.2906   1.4049   3.3994 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.50532    0.35339   9.919   <2e-16 ***
youtube      0.04575    0.00139  32.909   <2e-16 ***
facebook     0.18799    0.00804  23.382   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.018 on 197 degrees of freedom
Multiple R-squared:  0.8972,    Adjusted R-squared:  0.8962 
F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

autoplot(model, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")

shapiro.test(model$residuals)


    Shapiro-Wilk normality test

data:  model$residuals
W = 0.91804, p-value = 0.00000000419

3.5 `olsrr` library Model Checking

olsrr offers tools for detecting violation of standard regression assumptions. Here we take a look at residual diagnostics. The standard regression assumptions include the following about residuals/errors:

The error has a normal distribution (normality assumption).
The errors have mean zero.
The errors have same but unknown variance (homoscedasticity assumption).
The error are independent of each other (independent errors assumption).

Graph for detecting violation of normality assumption.

ols_plot_resid_qq(model)

Residual Normality Test Test for detecting violation of normality assumption.

ols_test_normality(model)

-----------------------------------------------
       Test             Statistic       pvalue  
-----------------------------------------------
Shapiro-Wilk              0.918          0.0000 
Kolmogorov-Smirnov        0.1253         0.0037 
Cramer-von Mises          9.459          0.0000 
Anderson-Darling          3.6974         0.0000 
-----------------------------------------------

Correlation between observed residuals and expected residuals under normality.

ols_test_correlation(model)

[1] 0.9564499

Residual vs Fitted Values Plot It is a scatter plot of residuals on the y axis and fitted values on the x axis to detect non-linearity, unequal error variances, and outliers.

Characteristics of a well behaved residual vs fitted plot:

The residuals spread randomly around the 0 line indicating that the relationship is linear.
The residuals form an approximate horizontal band around the 0 line indicating homogeneity of error variance.
No one residual is visibly away from the random pattern of the residuals indicating that there are no outliers.

ols_plot_resid_fit(model)

Residual Histogram

Histogram of residuals for detecting violation of normality assumption.

ols_plot_resid_hist(model)

3.6 Prediction

new.marketing <- data.frame(
  youtube = c(12, 19, 24),
  facebook = c(10, 19, 24)
)
predict(model, new.marketing, interval = "confidence")

       fit      lwr      upr
1 5.934320 5.343442 6.525197
2 7.946552 7.418504 8.474599
3 9.115297 8.610423 9.620171

4 Approximation

aprox <- (5/3)*pi*(10.5^3-9.5^3)

r <- runif(100,9.5,10.5)
v <- pi*r^2*5
mean(v)-aprox

[1] -9.238514

qnorm(0.025)

[1] -1.959964

x <- c(1,2,3,5,1,5)
mean(x)

[1] 2.833333

confint(lm(x~1), level = 1-0.05)

                2.5 %   97.5 %
(Intercept) 0.9077786 4.758888

5 Time Series

5.1 Creating Time Series.

x<-3
x[2]<-1.5
for (i in 3:30) {
  x[i]<-x[i-1]-x[i-2]/2+i/5+cos(i)+2
}
set.seed(100)
value <- x + rnorm(30,0,2)

mytsdata <- ts(value, frequency=365, start=c(2014, 6)) # Daily
(mytsdata <- ts(value, frequency = 4, start = c(2010, 2))) # frequency 4 => Quarterly Data

          Qtr1      Qtr2      Qtr3      Qtr4
2010            1.995615  1.763062  1.452173
2011  4.779934  5.718965  8.779271  8.389820
2012 10.365962  5.398546  5.021821  6.601211
2013  8.987069 10.687979 13.110406 10.572172
2014  8.693758  7.936813 11.620355 11.202458
2015 18.759441 12.399895 13.134812 11.559743
2016 14.003463 13.301188 15.671638 14.750984
2017 15.016441 11.195322 14.881908

mytsdata <- ts(value, frequency = 12, start = 1990) # freq 12 => Monthly data. 
(mytsdata <- ts(value, start=c(1980), end=c(2014), frequency=1)) # Yearly Data

Time Series:
Start = 1980 
End = 2014 
Frequency = 1 
 [1]  1.995615  1.763062  1.452173  4.779934  5.718965  8.779271  8.389820
 [8] 10.365962  5.398546  5.021821  6.601211  8.987069 10.687979 13.110406
[15] 10.572172  8.693758  7.936813 11.620355 11.202458 18.759441 12.399895
[22] 13.134812 11.559743 14.003463 13.301188 15.671638 14.750984 15.016441
[29] 11.195322 14.881908  1.995615  1.763062  1.452173  4.779934  5.718965

library(xts) # Alternatively use xts

Loading required package: zoo


Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

dates <- seq(as.Date("2016-01-01"), length = 30, by = "days") # Using xts library
(mytsdata1 <- xts(x = value, order.by = dates))

                [,1]
2016-01-01  1.995615
2016-01-02  1.763062
2016-01-03  1.452173
2016-01-04  4.779934
2016-01-05  5.718965
2016-01-06  8.779271
2016-01-07  8.389820
2016-01-08 10.365962
2016-01-09  5.398546
2016-01-10  5.021821
2016-01-11  6.601211
2016-01-12  8.987069
2016-01-13 10.687979
2016-01-14 13.110406
2016-01-15 10.572172
2016-01-16  8.693758
2016-01-17  7.936813
2016-01-18 11.620355
2016-01-19 11.202458
2016-01-20 18.759441
2016-01-21 12.399895
2016-01-22 13.134812
2016-01-23 11.559743
2016-01-24 14.003463
2016-01-25 13.301188
2016-01-26 15.671638
2016-01-27 14.750984
2016-01-28 15.016441
2016-01-29 11.195322
2016-01-30 14.881908

# library(xtable)
# xtable(ts(value, frequency = 4, start = c(2010, 2)))

5.2 Manipulate Time Series

as.matrix(mytsdata1)
as.Date(time(mytsdata1))
coredata(mytsdata1)
index(mytsdata1)

as.xts(mytsdata)
as.ts(mytsdata1)

first(mytsdata1)
last(mytsdata1)

merge(mytsdata, mytsdata1,join='left',fill=0)

as.Date(time(mytsdata)) # Extract Dates
as.matrix(mytsdata) # Extract Data

data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata))) # Create dataframe

library(zoo) # Infrastructure for Regular and Irregular Time Series
as.zoo(mytsdata)
as.zoo(mytsdata1)

5.3 Visualisation

5.3.1 Usual plot

plot(mytsdata, type="o", ylab="Quarterly Earnings")

5.3.2 ggplot

mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Quarterly Earnings")+
  scale_x_date(date_labels = "%Y %b %d")+
  theme(axis.text.x=element_text(angle=60, hjust=1))
ggplotly(p)

5.3.3 Plotly

data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata))) %>%
plot_ly(x = ~date, y = ~Y, mode = 'lines+markers', type = "scatter", name="Confirm") %>% 
  layout(
    title = "",
    xaxis = list(rangeslider = list(type = "date")),
    yaxis = list(title = "Quarterly Earnings"))

5.3.4 dygraphs

library(dygraphs)
dygraph(mytsdata)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

5.4 Data For Example

x<-0
x[2]<-1
set.seed(1)
for (i in 3:50) {
  #x[i]<-x[i-1]-x[i-2]/2+i/5+3*cos(i)
  x[i] <- x[i-1] - x[i-2]/5 + i/5 + 3*cos(i) + rnorm(1,0,i/4)
  #x[i]<-x[i-1] + rnorm(1,0,i/5)
}
#value <- x + rnorm(50,0,5)
value <- x
mytsdata <- ts(value, frequency = 12, start = c(2010, 2))
dygraph(mytsdata)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

5.4.1 Lag, Difference, and Autocorrelation

lag(mytsdata, 2)

           Jan       Feb       Mar       Apr       May       Jun       Jul
2009                                                                      
2010  1.000000 -1.839818 -3.017105 -1.842691  5.234162  9.841046  8.316776
2011 22.237127 16.637862  3.658659  7.686553 12.333570 17.485469 24.962183
2012 25.156549 26.243030 23.531001 20.708705 11.434616 12.531440 22.490719
2013 24.744799 24.425151 24.882243 36.510455 46.747396 47.942016 47.605089
2014 56.182945                                                            
           Aug       Sep       Oct       Nov       Dec
2009                                          0.000000
2010  6.511892  6.177134  8.671431 11.451401 19.952744
2011 28.333312 28.007451 30.626409 35.790274 38.104631
2012 37.753707 36.636157 36.766465 35.771098 25.376246
2013 56.433489 61.217564 48.260981 35.207093 40.722804
2014

diff(mytsdata, lag = 1, differences=1)

             Jan         Feb         Mar         Apr         May
2010                           1.0000000  -2.8398178  -1.1772875
2011   2.7799700   8.5013429   2.2843828  -5.5992647 -12.9792034
2012   5.1638647   2.3143578 -12.9480828   1.0864811  -2.7120285
2013  -0.9953678 -10.3948518  -0.6314469  -0.3196474   0.4570921
2014 -13.0538878   5.5157105  15.4601410                        
             Jun         Jul         Aug         Sep         Oct
2010   1.1744144   7.0768531   4.6068836  -1.5242693  -1.8048846
2011   4.0278940   4.6470171   5.1518996   7.4767133   3.3711296
2012  -2.8222960  -9.2740888   1.0968231   9.9592793  15.2629881
2013  11.6282120  10.2369407   1.1946202  -0.3369278   8.8284007
2014                                                            
             Nov         Dec
2010  -0.3347581   2.4942974
2011  -0.3258617   2.6189584
2012  -1.1175503   0.1303086
2013   4.7840752 -12.9565837
2014

acf(mytsdata, plot = FALSE)


Autocorrelations of series 'mytsdata', by lag

0.0000 0.0833 0.1667 0.2500 0.3333 0.4167 0.5000 0.5833 0.6667 0.7500 
 1.000  0.859  0.708  0.609  0.519  0.405  0.337  0.325  0.314  0.278 
0.8333 0.9167 1.0000 1.0833 1.1667 1.2500 1.3333 
 0.245  0.223  0.186  0.156  0.122  0.049 -0.016

#acf(mytsdata, lag = 20)
#autoplot(acf(mytsdata, plot = FALSE), conf.int.fill = 'blue', conf.int.value = 0.5, conf.int.type = 'ma', colour="red")

5.4.2 Forcast Package

library(forecast)
ggtsdisplay(mytsdata)

gglagplot(mytsdata, do.lines = FALSE)

gglagchull(mytsdata)

ggAcf(mytsdata)

# Acf(mytsdata, lags = 9)
# acf1(mytsdata)
# lag.plot(mytsdata, lags = 9)

5.4.3 Trend

linearMod <- lm(mytsdata~time(mytsdata))
summary(linearMod)


Call:
lm(formula = mytsdata ~ time(mytsdata))

Residuals:
     Min       1Q   Median       3Q      Max 
-17.0685  -5.6420   0.6023   6.3084  16.8171 

Coefficients:
                  Estimate  Std. Error t value Pr(>|t|)    
(Intercept)    -23966.4357   1958.8331  -12.23 2.30e-16 ***
time(mytsdata)     11.9230      0.9735   12.25 2.22e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.278 on 48 degrees of freedom
Multiple R-squared:  0.7576,    Adjusted R-squared:  0.7525 
F-statistic:   150 on 1 and 48 DF,  p-value: 2.221e-16

anova(linearMod)

Analysis of Variance Table

Response: mytsdata
               Df  Sum Sq Mean Sq F value    Pr(>F)    
time(mytsdata)  1 10279.2 10279.2     150 2.221e-16 ***
Residuals      48  3289.4    68.5                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mytstrend <- linearMod$fitted.values
mytsde_trend <- linearMod$residuals

dataplot<-cbind(mytsdata, mytstrend, mytsde_trend) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")

shapiro.test(linearMod$residuals)


    Shapiro-Wilk normality test

data:  linearMod$residuals
W = 0.98465, p-value = 0.7566

5.4.4 Regression with Lagged Time Series

library(dynlm)
linearMod <- dynlm(mytsdata ~ L(mytsdata,1))
summary(linearMod)


Time series regression with "ts" data:
Start = 2010(3), End = 2014(3)

Call:
dynlm(formula = mytsdata ~ L(mytsdata, 1))

Residuals:
     Min       1Q   Median       3Q      Max 
-14.5796  -3.5823   0.0598   3.2617  15.4817 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     2.72080    1.68965    1.61    0.114    
L(mytsdata, 1)  0.93266    0.05967   15.63   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.675 on 47 degrees of freedom
Multiple R-squared:  0.8387,    Adjusted R-squared:  0.8352 
F-statistic: 244.3 on 1 and 47 DF,  p-value: < 2.2e-16

dataplot<-cbind(mytsdata, trend=linearMod$fitted.values, residual=linearMod$residuals) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

5.4.5 Seasonality

ggtsdisplay(mytsde_trend)

ggtsdisplay(diff(mytsde_trend, lag = 1, differences=6))

library(forecast)
ts.stl <- stl(mytsdata, "periodic")  # decompose the TS
ts.sa  <- seasadj(ts.stl)
autoplot(ts.stl)

dataplot<-cbind(mytsdata, seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2]) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

seasonplot(ts.sa, 5, col=rainbow(5), year.labels=TRUE, main="Seasonal plot: Airpassengers", lwd=3)

5.4.6 Making Stationary

library(tseries)
adf.test(mytsdata) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  mytsdata
Dickey-Fuller = -3.3154, Lag order = 3, p-value = 0.07932
alternative hypothesis: stationary

nsdiffs(mytsdata)  # number for seasonal differencing needed

[1] 0

ndiffs(mytsdata) # number of differences need to make it stationary

[1] 1

stationaryTS <- diff(mytsdata, differences= ndiffs(mytsdata))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  stationaryTS
Dickey-Fuller = -3.9373, Lag order = 3, p-value = 0.01973
alternative hypothesis: stationary

dataplot<-cbind(mytsdata,stationaryTS) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

5.4.7 Forecast

ts.stl %>% forecast() %>%
  autoplot()

fit <- auto.arima(ts.stl$time.series[,3], seasonal=FALSE)
summary(fit)

Series: ts.stl$time.series[, 3] 
ARIMA(2,0,0) with zero mean 

Coefficients:
         ar1      ar2
      0.8735  -0.6074
s.e.  0.1167   0.1158

sigma^2 estimated as 18.39:  log likelihood=-143.36
AIC=292.72   AICc=293.24   BIC=298.45

Training set error measures:
                    ME     RMSE      MAE      MPE    MAPE      MASE
Training set 0.1290637 4.201792 3.438581 124.7528 161.216 0.4287052
                   ACF1
Training set 0.03800343

fit %>% forecast() %>%
  autoplot()

5.4.8 Entir Process

mytsdata %>% stl(s.window='periodic') %>% seasadj() -> mytsdatadj
autoplot(mytsdatadj)

The data are clearly non-stationary, as the series wanders up and down for long periods. Consequently, we will take a first difference of the data.

mytsdatadj %>% diff() %>% ggtsdisplay(main="")

The PACF shown is suggestive of an AR(2) model. So an initial candidate model is an ARIMA(2,1,0). There are no other obvious candidate models.

(fit <- Arima(mytsdatadj, order=c(2,1,0)))

Series: mytsdatadj 
ARIMA(2,1,0) 

Coefficients:
         ar1      ar2
      0.3716  -0.4716
s.e.  0.1341   0.1319

sigma^2 estimated as 28.68:  log likelihood=-151.02
AIC=308.04   AICc=308.57   BIC=313.71

checkresiduals(fit)


    Ljung-Box test

data:  Residuals from ARIMA(2,1,0)
Q* = 7.5718, df = 8, p-value = 0.4764

Model df: 2.   Total lags used: 10

autoplot(forecast(fit))

5.4.9 Seasonal Package

library(seasonal)
m <- seas(mytsdata)
#view(m)

dataplot<-cbind(trend=m[["data"]][,"trend"], seasonaladj=m[["data"]][,"seasonaladj"], irregular=m[["data"]][,"irregular"],  data=mytsdata)
  dygraph(dataplot)%>%
    dyOptions(gridLineColor = "#E1E5EA", axisLineColor = "#303030", drawPoints
=TRUE, strokeWidth = 2, colors = c("blue","green","grey","red"))%>%
    dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
    dyRangeSelector()

5.5 Examples

5.5.1 White Noise

n <- 200
value <-  rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata, plot = FALSE))

Smoothing

values <- stats::filter(value, sides=2, filter=rep(1/5,5))
mytsdata <- ts(values, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata[-c(1,2,199,200)], plot = FALSE))

5.5.2 Trend and White Noise

value <- (1:n)/10 + rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata, plot = FALSE))

5.5.3 Autoregresive

value <- 0
for (i in 2:n) {
  value[i] <- value[i-1] + rnorm(1,0,1)
}
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata, plot = FALSE))

value <- 0
value[2] <- 1
for (i in 3:n) {
  value[i] <- value[i-1] - value[i-2]/9 + rnorm(1,0,1)
}
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata, plot = FALSE))

5.5.4 Seasonality and White Noise

value <- 2*cos(2*pi*1:n/10 +0.6*pi) + rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
  geom_line(alpha=1,  color="#69b3a2", size=2) +
  geom_point(alpha=1, color="red", size=2) +
  xlab("Time")+ylab("Value")
ggplotly(p)

autoplot(acf(mytsdata, plot = FALSE))

5.5.5 Cross-correlation

value <- (1:n)/20 + cos((1:n)/10) + rnorm(n,0,1)
mytsdatax <- ts(value, start = 1, end = n)
value <- (1:n)/20 + rnorm(n,0,2) + 5
mytsdatay <- ts(value, start = 1, end = n)
mytsdata <- cbind(mytsdatax, mytsdatay)
dygraph(mytsdata)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

ggCcf(mytsdatax, mytsdatay)

library(astsa) # Applied Statistical Time Series Analysis


Attaching package: 'astsa'

The following object is masked from 'package:seasonal':

    unemp

The following object is masked from 'package:forecast':

    gas

lag2.plot (mytsdatax, mytsdatay, 11)

5.6 Real Life Examples

data(jj)
plot(jj, type="o", ylab="Quarterly Earnings per Share")

jjdata<-data.frame(Y=as.matrix(jj), date=as.Date(time(jj)))
jjdata %>%
plot_ly(x = ~date, y = ~Y, mode = 'lines+markers', type = "scatter", name="Confirm", width = 850, height = 750)

library(dygraphs)
dygraph(jj)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

dygraph(globtemp)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

dygraph(speech)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

#as.xts(AirPassengers)
dygraph(AirPassengers)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

lungDeaths <- cbind(mdeaths, fdeaths)
dygraph(lungDeaths)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

SoiRec <- cbind(soi, rec) # Southern Oscillation Index and Recruitment
dy_graph <- list(dygraph(soi)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector(),
  dygraph(rec)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector())
htmltools::browsable(htmltools::tagList(dy_graph))

SoiRecdata<-data.frame(as.matrix(SoiRec), date=as.Date(time(SoiRec)))
subplot(SoiRecdata %>%
plot_ly(x = ~date, y = ~soi, mode = 'lines+markers', type = "scatter", name="soi"),
SoiRecdata %>%
plot_ly(x = ~date, y = ~rec, mode = 'lines+markers', type = "scatter", name="rec"), nrows = 2, shareX = TRUE)

dy_graph <- list(
dygraph(fmri1[,2:5])%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector(),

dygraph(fmri1[,6:9])%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector())
htmltools::browsable(htmltools::tagList(dy_graph))

trend <- sin(seq(1,41))+runif(41)
data <- data.frame(
  time=seq(from=Sys.Date()-40, to=Sys.Date(), by=1 ), 
  value1=trend, 
  value2=trend+rnorm(41), 
  value3=trend+rnorm(41), 
  value4=trend+rnorm(41) 
)

# switch to xts format
data <- xts(x = data[,-1], order.by = data$time)

# Plot it
p <- dygraph(data) %>%
  dyCandlestick()%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()
p

5.7 Exercise

n <- 100

x <- 1 + -0.05*(1:n) + cos((1:n)*pi/10) + rnorm(n,0,1)
mytsX <- ts(x, frequency = 4, start = c(1980, 2))

y<-0
set.seed(1)
for (i in 2:n) {
  y[i] <- 0.03 + y[i-1] + rnorm(1,0,1)
}

mytsY <- ts(y, frequency = 4, start = c(1980, 2))
mytsdata <- cbind(mytsX, mytsY)
dygraph(mytsdata)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

ggtsdisplay(mytsdata[,1])

ggtsdisplay(mytsdata[,2])

gglagplot(mytsdata, do.lines = FALSE)

gglagchull(mytsdata)

ggAcf(mytsdata)

ggCcf(mytsdatax, mytsdatay)

linearMod <- lm(mytsdata~time(mytsdata))
summary(linearMod)

Response mytsX :

Call:
lm(formula = mytsX ~ time(mytsdata))

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5583 -0.6055  0.1441  0.8457  2.5433 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    426.38967   32.54121   13.10   <2e-16 ***
time(mytsdata)  -0.21471    0.01633  -13.15   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.179 on 98 degrees of freedom
Multiple R-squared:  0.6382,    Adjusted R-squared:  0.6345 
F-statistic: 172.9 on 1 and 98 DF,  p-value: < 2.2e-16


Response mytsY :

Call:
lm(formula = mytsY ~ time(mytsdata))

Residuals:
    Min      1Q  Median      3Q     Max 
-2.5659 -1.0276 -0.2509  1.0234  3.0951 

Coefficients:
                  Estimate  Std. Error t value Pr(>|t|)    
(Intercept)    -1189.71594    39.87865  -29.83   <2e-16 ***
time(mytsdata)     0.60070     0.02001   30.02   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.444 on 98 degrees of freedom
Multiple R-squared:  0.9019,    Adjusted R-squared:  0.9009 
F-statistic: 900.9 on 1 and 98 DF,  p-value: < 2.2e-16

anova(linearMod)

Analysis of Variance Table

               Df  Pillai approx F num Df den Df    Pr(>F)    
(Intercept)     1 0.96816  1474.56      2     97 < 2.2e-16 ***
time(mytsdata)  1 0.92730   618.61      2     97 < 2.2e-16 ***
Residuals      98                                             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mytstrend <- linearMod$fitted.values
mytsde_trend <- linearMod$residuals

dataplot<-cbind(mytsdata, mytstrend) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

ts.stl <- stl(mytsdata[,1], "periodic")  # decompose the TS
ts.sa  <- seasadj(ts.stl)
autoplot(ts.stl)

dataplot<-cbind(mytsdata[,1], seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2]) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

ts.stl <- stl(mytsdata[,2], "periodic")  # decompose the TS
ts.sa  <- seasadj(ts.stl)
autoplot(ts.stl)

dataplot<-cbind(mytsdata[,2], seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2]) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

adf.test(mytsdata[,1]) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  mytsdata[, 1]
Dickey-Fuller = -3.4149, Lag order = 4, p-value = 0.05601
alternative hypothesis: stationary

nsdiffs(mytsdata[,1])  # number for seasonal differencing needed

[1] 0

ndiffs(mytsdata[,1]) # number of differences need to make it stationary

[1] 1

stationaryTS <- diff(mytsdata[,1], differences= ndiffs(mytsdata[,1]))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  stationaryTS
Dickey-Fuller = -5.7598, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

dataplot<-cbind(mytsdata[,1],stationaryTS) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

adf.test(mytsdata[,2]) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  mytsdata[, 2]
Dickey-Fuller = -2.801, Lag order = 4, p-value = 0.2448
alternative hypothesis: stationary

nsdiffs(mytsdata[,2])  # number for seasonal differencing needed

[1] 0

ndiffs(mytsdata[,2]) # number of differences need to make it stationary

[1] 1

stationaryTS <- diff(mytsdata[,2], differences= ndiffs(mytsdata[,2]))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary


    Augmented Dickey-Fuller Test

data:  stationaryTS
Dickey-Fuller = -5.3845, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary

dataplot<-cbind(mytsdata[,2],stationaryTS) 
dygraph(dataplot)%>% 
  dyOptions(drawPoints = TRUE, pointSize = 4)%>%
  dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>% 
  dyRangeSelector()

mytsdata[,1] %>% stl(s.window='periodic') %>% seasadj() -> mytsdatadj
autoplot(mytsdatadj)