Libraries to use.
library(plyr)
library(ggplot2)
library(plotly)
library(DT)
library(ggExtra)
library(GGally)
library(knitr)
library(measurements)
library(gtools)
library(ggfortify)
library(olsrr)
A dataset containing height and weight information for 10,000 individuals (5,000 male and 5,000 female). Get the data from https://www.kaggle.com/mustafaali96/weight-height and place it in the same folder as the RMarkdown script.
mydata<-read.csv("weight-height.csv")
mydata$Height<-conv_unit(mydata$Height, "inch", "cm") # Use measurements library
mydata$Height<-round(mydata$Height,2)
mydata$Weight<-conv_unit(mydata$Weight, "lbs", "kg")
mydata$Weight<-round(mydata$Weight,2)
summary(mydata)
 Gender       Height        Weight
Female:5000 Min. :137.8 Min. : 29.35
Male :5000 1st Qu.:161.3 1st Qu.: 61.61
Median :168.4 Median : 73.12
Mean :168.6 Mean : 73.23
3rd Qu.:175.7 3rd Qu.: 84.90
Max. :200.7 Max. :122.47
# Use DT library
datatable(mydata, options = list(pageLength = 10,
dom = 'tip',
scrollY = 500,
scroller = TRUE,
scrollX = TRUE,
fixedColumns = TRUE), width = "100%", height = "100%")
The population consists of 10,000 members. The mean of the population height is
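This is simply the average of the Height column; a minimal sketch:
mean(mydata$Height) # population mean height in cm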
[1] 168.5736
Take a simple random sample of size 10.
Find its mean.
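A minimal sketch (the seed for this single draw is not shown, so a rerun will give a different value):
mean(sample(mydata$Height, 10)) # mean of one simple random sample of size 10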
[1] 162.384
Repeat this 1000 times.
heightSM<-0
sampleSize<-10
rep<-1000
set.seed(100)
for (i in 1:rep) {
heightSM[i]<-mean(sample(mydata$Height,sampleSize))
}
heightSM<-as.data.frame(heightSM)
kable(head(heightSM), caption = "A few of the Mean Heights") # Use knitr library
| heightSM |
|---|
| 172.909 |
| 170.725 |
| 163.015 |
| 170.169 |
| 167.784 |
| 172.352 |
Plot a histogram of the repeated resampling procedure.
# Use ggplot2 library
p <- ggplot(data=heightSM, aes(x=heightSM)) +
geom_histogram(aes(y =..density..), position="identity",
breaks=seq(min(heightSM$heightSM), max(heightSM$heightSM),
length.out = 20),
alpha=.5, color="light blue", fill="red") +
geom_density(alpha=.3, fill="brown", col="black") +
geom_vline(xintercept=mean(heightSM$heightSM), linetype="dashed", color = "red", size=1) +
geom_vline(xintercept=mean(mydata$Height), linetype="dashed", color = "blue", size=1) +
annotate(geom="text", x=162, y=0.108, label="Population Mean",
color="blue") +
geom_curve(aes(x = 162.5, y = 0.11, xend = mean(mydata$Height), yend = 0.14), arrow = arrow(length = unit(0.5, "cm")), curvature = -0.2)+
labs(title="Sample Mean Histogram")+
xlab(paste("Mean Height for Each Sample of Size", smapleSize))
p
Take a sample of size n from a vector x.
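A sketch, assuming x is the vector 1:60 and n is 12 (both values are assumptions inferred from the output below):
x <- 1:60
n <- 12
sample(x, n) # n values drawn without replacement from x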
[1] 40 2 24 58 37 48 16 56 13 42 5 23
Produce a histogram of height.
# Use plotly library
p <- ggplot(data=mydata, aes(x=Height)) +
geom_histogram(position="identity",
breaks=seq(min(mydata$Height), max(mydata$Height),
length.out = 20),
alpha=.5, color="light blue", fill="red") +
labs(title="Population Height Histogram")+
xlab("Height")
ggplotly(p)
Histogram for height with density on the y-axis and a smooth curve, adjusted so the histogram’s total area (the area of the boxes) equals one.
p <- ggplot(data=mydata, aes(x=Height)) +
geom_histogram(aes(y =..density..), position="identity",
breaks=seq(min(mydata$Height), max(mydata$Height),
length.out = 20),
alpha=.5, color="light blue", fill="red") +
geom_density(alpha=.3, fill="brown", col="black") +
labs(title="Population Height Histogram")+
xlab("Height")
ggplotly(p)
A stem-and-leaf display of Height, produced with stem():
The decimal point is 1 digit(s) to the right of the |
13 | 899
14 | 011122222233333444444444444444
14 | 55555555555555555666666666666666667777777777777777777777777777888888+73
15 | 00000000000000000000000000000000000000000000000000000000000000111111+462
15 | 55555555555555555555555555555555555555555555555555555555555555555555+1139
16 | 00000000000000000000000000000000000000000000000000000000000000000000+1571
16 | 55555555555555555555555555555555555555555555555555555555555555555555+1695
17 | 00000000000000000000000000000000000000000000000000000000000000000000+1651
17 | 55555555555555555555555555555555555555555555555555555555555555555555+1360
18 | 00000000000000000000000000000000000000000000000000000000000000000000+880
18 | 55555555555555555555555555555555555555555555555555555555555555555555+297
19 | 00000000000000000000000000000000001111111111111111111112222222222222+17
19 | 55555555556666777899
20 | 01
p <- plot_ly(data = mydata, x = ~Height, y = ~Weight, color = ~Gender, type = 'scatter', mode = 'markers', marker = list(size=8))
p
# Use ggExtra library
p<- ggplot(data = mydata, mapping = aes(x = Height, y = Weight, colour = Gender)) +
geom_point() + stat_smooth(method = "lm")
ggMarginal(p, type = c("violin"), margins = c("both"), size = 5,
xparams = list(), yparams = list(), groupColour = TRUE,
groupFill = TRUE)
Load the bears dataset.
br <- read.table("http://media.pearsoncmg.com/cmg/pmmg_mml_shared/mathstatsresources/Akritas/BearsData.txt", header=T)
datatable(br, options = list(pageLength = 10,
dom = 'tip',
scrollY = 500,
scroller = TRUE,
scrollX = TRUE,
fixedColumns = TRUE), width = "100%", height = "100%")
Scatter plots for pairs.
Correlation plots.
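These plots were presumably produced with the GGally library loaded at the top; a sketch under that assumption (the exact variables used are not shown):
# Pairwise scatter plots of the numeric bears variables
ggpairs(br[, sapply(br, is.numeric)])
# Correlation matrix plot
ggcorr(br[, sapply(br, is.numeric)], label = TRUE)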
The Iris flower data set, or Fisher’s Iris data set, is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper (https://en.wikipedia.org/wiki/Iris_flower_data_set). The dataset contains 150 records under five attributes: petal length, petal width, sepal length, sepal width, and species.
datatable(iris, options = list(pageLength = 10,
dom = 'tip',
scrollY = 500,
scroller = TRUE,
scrollX = TRUE,
fixedColumns = TRUE), width = "100%", height = "100%")
p <- ggplot(iris, aes(x = Species, y = Sepal.Length, colour = Species)) + geom_boxplot()
ggplotly(p)
p <- plot_ly(iris) %>%
add_markers(x=~Sepal.Length, y=~Sepal.Width, z=~Petal.Length,
color= ~Species,
mode = 'markers',
name = paste('Species', iris$Species),
text = paste('Petal Width', iris$Petal.Width),
marker = list(size = ~Petal.Width*15)
)
p <- p %>% layout(
  title = 'Iris Data', # the plot title belongs at the layout level, not inside scene
  scene = list(
    camera=list(
      eye = list(x=0.8, y=1.5, z=1)
    ),
    xaxis = list(title = 'Sepal Length'),
    yaxis = list(title = 'Sepal Width'),
    zaxis = list(title = 'Petal Length')),
  autosize = TRUE
)
p
set.seed(1)
x<-sample( c("red", "blue", "green"), 1000, replace = TRUE, prob = c(0.1, 0.6, 0.3))
table(x)
x
blue green red
606 284 110
x
blue green red
0.606 0.284 0.110
y
blue green red
79 12 9
y
blue green red
0.79 0.12 0.09
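The outputs that follow appear to come from computations on a short symmetric sequence; a hedged reconstruction that matches the printed values:
x <- seq(-1, 1, 0.1)
x                     # the sequence itself
mean(x)               # essentially zero, up to floating-point error
var(x)                # sample variance (divides by n - 1)
sd(x)                 # sample standard deviation
mean((x - mean(x))^2) # population variance (divides by n)
all.equal(var(x) * (length(x) - 1) / length(x), mean((x - mean(x))^2)) # TRUE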
[1] -1.0 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1 0.0 0.1 0.2 0.3
[15] 0.4 0.5 0.6 0.7 0.8 0.9 1.0
[1] 5.283162e-17
[1] 0.385
[1] 0.6204837
[1] 0.3666667
[1] TRUE
heightSV<-0
heightPV<-0
sampleSize<-5
rep<-1000
for (i in 1:rep) {
heightSample<-sample(mydata$Height,sampleSize)
heightSV[i]<-var(heightSample)
heightPV[i]<-mean((heightSample-mean(heightSample))^2)
}
heightSPVar<-cbind.data.frame(heightSV, heightPV)
kable(head(heightSPVar), caption = "A few of the Sample/Population Variance Heights")
| heightSV | heightPV |
|---|---|
| 258.72013 | 206.97610 |
| 37.39507 | 29.91606 |
| 158.52195 | 126.81756 |
| 114.71627 | 91.77302 |
| 83.59060 | 66.87248 |
| 87.68842 | 70.15074 |
Plot a histogram of the repeated resampling procedure.
heightSPVarP<-cbind.data.frame(height= c(heightSV, heightPV),
computation=factor(rep(c("Sample Variance", "Population Variance"), each=rep)))
popVari<-mean((mydata$Height-mean(mydata$Height))^2)
mu <- ddply(heightSPVarP, "computation", summarise, grp.mean=mean(height))
mu[3,]<-popVari
mu[,1]<-as.factor(c("Population Variance", "Sample Variance", "True Variance"))
names(mu)[1]<-"computation.mean"
p <- ggplot(data=heightSPVarP, aes(x=height, fill=computation, color=computation)) +
geom_histogram(aes(y =..density..),
position="identity",
breaks=seq(min(heightSPVarP$height),
max(heightSPVarP$height),
length.out = 20),
alpha=.5) +
geom_density(alpha=.2) +
geom_vline(data=mu, aes(xintercept=grp.mean, color=computation.mean),
linetype="dashed", size=1) +
labs(title="Variance Histogram")+
xlab(paste("Variance Height for Each Sample of Size", smapleSize))
ggplotly(p)
The divide-by-n (population) variance formula is biased when applied to samples; this shows up as the red line lagging behind the blue line. Indeed, with n = 5 its expectation is (n-1)/n · σ² = 0.8 × 95.5 ≈ 76.4, which matches the simulation. The true variance and the mean of the sample variances agree almost exactly.
computation.mean grp.mean
1 Population Variance 76.45993
2 Sample Variance 95.57491
3 True Variance 95.49720
set.seed(1)
z<-0
rep<-10000
n<-1000
for (i in 1:rep) {
x<-sample(1:6,n, replace = TRUE)
y<-sample(1:6,n, replace = TRUE)
z[i]<-sum(x==6 & y==6)/n # count double sixes directly; nrow() fails when exactly one row matches
}
dataz<-as.data.frame(z)
# Use ggplot2 library
p <- ggplot(data=dataz, aes(x=z)) +
geom_histogram(aes(y =..density..), position="identity",
breaks=seq(min(dataz$z), max(dataz$z),
length.out = 20),
alpha=.5, color="light blue", fill="red") +
geom_density(alpha=.3, fill="brown", col="black") +
geom_vline(xintercept=mean(dataz$z), linetype="dashed", color = "red", size=1) +
geom_vline(xintercept=1/36, linetype="dashed", color = "blue", size=1) +
annotate(geom="text", x=0.015, y=55, label="Theoretical Probability 1/36",
color="blue") +
geom_curve(aes(x = 0.0155, y = 60, xend = mean(dataz$z), yend = 65), arrow = arrow(length = unit(0.5, "cm")), curvature = -0.2)+
labs(title="Probability Histogram")+
xlab(paste("Probability of getting two sixes"))
p
Factorial.
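The value below is 4!, so it was likely computed as:
factorial(4) # 4! = 24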
[1] 24
Combination of \(k\) units out of \(n\).
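Computed with choose(); a sketch assuming n = 4 and k = 2:
choose(4, 2) # ways to choose 2 units out of 4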
[1] 6
To list the actual combinations and permutations, use the gtools library.
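The listings below can be reproduced as follows (assuming n = 4 and r = 2):
combinations(4, 2) # all 6 unordered pairs from 1:4
permutations(4, 2) # all 12 ordered pairs from 1:4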
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 1 4
[4,] 2 3
[5,] 2 4
[6,] 3 4
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 1 4
[4,] 2 1
[5,] 2 3
[6,] 2 4
[7,] 3 1
[8,] 3 2
[9,] 3 4
[10,] 4 1
[11,] 4 2
[12,] 4 3
For Exercise 10.
[1] 0.001085973
[1] 1 0 0 0 0 0 1 1 0 0
[1] 0.69601752 0.78701285 0.88207892 0.34780817 0.20874892 0.01817447
[7] 0.12665581 0.97636847 0.61139940 0.60878280
| Distribution | Functions |
|---|---|
| Beta | pbeta qbeta dbeta rbeta |
| Binomial | pbinom qbinom dbinom rbinom |
| Cauchy | pcauchy qcauchy dcauchy rcauchy |
| Chi-Square | pchisq qchisq dchisq rchisq |
| Exponential | pexp qexp dexp rexp |
| F | pf qf df rf |
| Gamma | pgamma qgamma dgamma rgamma |
| Geometric | pgeom qgeom dgeom rgeom |
| Hypergeometric | phyper qhyper dhyper rhyper |
| Logistic | plogis qlogis dlogis rlogis |
| Log Normal | plnorm qlnorm dlnorm rlnorm |
| Negative Binomial | pnbinom qnbinom dnbinom rnbinom |
| Normal | pnorm qnorm dnorm rnorm |
| Poisson | ppois qpois dpois rpois |
| Student t | pt qt dt rt |
| Studentized Range | ptukey qtukey dtukey rtukey |
| Uniform | punif qunif dunif runif |
| Weibull | pweibull qweibull dweibull rweibull |
| Wilcoxon Rank Sum Statistic | pwilcox qwilcox dwilcox rwilcox |
| Wilcoxon Signed Rank Statistic | psignrank qsignrank dsignrank rsignrank |
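The five values that follow match exponential probabilities with rate 1; a hedged reconstruction:
pexp(3) - pexp(1.5) # P(1.5 < X < 3) for X ~ Exp(1)
pexp(3)             # P(X <= 3)
1 - exp(-3)         # the same probability, by hand
pexp(1.5)           # P(X <= 1.5)
1 - exp(-1.5)       # the same probability, by hand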
[1] 0.1733431
[1] 0.9502129
[1] 0.9502129
[1] 0.7768698
[1] 0.7768698
n=15
p=0.7
prob=dbinom(1:n, n, prob=p)
#prob=dgeom(1:n, 0.7)
df <- data.frame(x=1:n, prob)
df$csum<-cumsum(df$prob)
p1 <- plot_ly(data = df, x = ~x, y = ~prob, color = ~prob, type = 'scatter', mode = 'markers', marker = list(size=8), name = 'Probability')
p2 <- plot_ly(data = df, x = ~x, y = ~csum, color = ~csum, type = 'scatter', mode = 'markers', marker = list(size=8), name = 'Cumulative')
subplot(p1, p2)
Binomial Bin(15, 0.7)
mu<-0.5
sigmad<-1
x<-seq(mu-4*sigmad , mu+4*sigmad, length=50)
prob<-dnorm(x, mean=mu, sd=sigmad) # Change
df <- data.frame(x, prob)
df$csum<-cumsum(df$prob)
p1 <- plot_ly(data = df, x = ~x, y = ~prob, color = ~prob, type = 'scatter', mode = 'markers+lines', marker = list(size=5), line = list(width = 5, color="cyan"),name = 'Probability')
p2 <- plot_ly(data = df, x = ~x, y = ~csum, color = ~csum, type = 'scatter', mode = 'markers+lines', marker = list(size=5), line = list(width = 5, color="cyan"), name = 'Cumulative')
subplot(p1, p2)
Normal N(0.5, 1)
2.300524 with absolute error < 0.00011
1 with absolute error < 1.1e-14
n <- 200 # True correlation 1
m <- 1
x <- rnorm(n, 5, 1)
sigmac <- 2
noise <- rnorm(n, 0, sigmac)
y <- 10 + m*x + noise
df <- data.frame(x, y)
df%>%
plot_ly(x = ~x) %>%
add_markers(y = ~y, marker = list(size=5), color = ~y, name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(y~x, data = df)),
line = list(width = 4, color="orange"),
name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))[1] 1.090688
[1] 0.4309702
Pearson's product-moment correlation
data: x and y
t = 6.7204, df = 198, p-value = 1.885e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3108137 0.5375682
sample estimates:
cor
0.4309702
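The block above is the standard print of cor.test(); a one-line sketch:
cor.test(x, y) # Pearson correlation test with 95% confidence interval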
# With predictor x, choose the response from y, z, t, or v; with predictor x1 use u
height <- x
weight <- y
df1<-data.frame(height,weight)
linearMod <- lm(weight ~ height, data=df1)
summary(linearMod)
Call:
lm(formula = weight ~ height, data = df1)
Residuals:
Min 1Q Median 3Q Max
-5.6909 -1.3864 0.1708 1.4898 8.3337
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.4157 0.8338 11.29 < 2e-16 ***
height 1.0907 0.1623 6.72 1.89e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.259 on 198 degrees of freedom
Multiple R-squared: 0.1857, Adjusted R-squared: 0.1816
F-statistic: 45.16 on 1 and 198 DF, p-value: 1.885e-10
Analysis of Variance Table
Response: weight
Df Sum Sq Mean Sq F value Pr(>F)
height 1 230.38 230.379 45.164 1.885e-10 ***
Residuals 198 1009.98 5.101
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")
Shapiro-Wilk normality test
data: linearMod$residuals
W = 0.99191, p-value = 0.3324
p <- ggplot(data=data.frame(residuals=linearMod$residuals),
aes(x=residuals)) +
geom_histogram(aes(y =..density..), position="identity",
breaks=seq(min(linearMod$residuals),
max(linearMod$residuals),
length.out = 15),
alpha=.5, color="light blue", fill="red") +
geom_density(alpha=.3, fill="brown", col="black") +
labs(title="Residual Histogram")
ggplotly(p)
-----------------------------------------------
Test Statistic pvalue
-----------------------------------------------
Shapiro-Wilk 0.9919 0.3324
Kolmogorov-Smirnov 0.0407 0.8948
Cramer-von Mises 11.5763 0.0000
Anderson-Darling 0.3397 0.4957
-----------------------------------------------
F Test for Heteroskedasticity
-----------------------------
Ho: Variance is homogenous
Ha: Variance is not homogenous
Variables: fitted values of weight
Test Summary
-------------------------
Num DF = 1
Den DF = 198
F = 2.713106
Prob > F = 0.1011132
Score Test for Heteroskedasticity
---------------------------------
Ho: Variance is homogenous
Ha: Variance is not homogenous
Variables: fitted values of weight
Test Summary
----------------------------
DF = 1
Chi2 = 2.703467
Prob > Chi2 = 0.1001303
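The normality table and the two heteroskedasticity tests above are olsrr output; a sketch of the calls, assuming the model is linearMod (in older olsrr versions these are ols_f_test() and ols_score_test()):
ols_test_normality(linearMod) # normality tests on the residuals
ols_test_f(linearMod)         # F test for heteroskedasticity
ols_test_score(linearMod)     # score test for heteroskedasticity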
In 1973, Frank Anscombe published a set of examples showing situations where the value of r can be misleading (Anscombe FJ, “Graphs in Statistical Analysis,” The American Statistician, 27, 17-21).
The examples involve the analysis of four sets of paired data, all of which give the same value of r. However, the scatterplot of each set reveals how different the relationships between the pairs of variables are.
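R ships Anscombe’s original quartet as the anscombe data frame; a quick check that all four pairs share essentially the same correlation:
sapply(1:4, function(i) cor(anscombe[[paste0("x", i)]], anscombe[[paste0("y", i)]])) # all ≈ 0.816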
n <- 20 # True correlation 1
m <- 1
x <- seq(0, 5, length=n)
sigmac <- 0.5
noise <- rnorm(n, 0, sigmac)
y <- m*x + noise
v <- m*x + noise*x
df <- data.frame(x, y, v); datareg <- df
p1 <- df%>%
plot_ly(x = ~x) %>%
add_markers(y = ~y, marker = list(size=6),
#color = ~y,
name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2))) %>%
layout(xaxis = list(range = c(-1, 6)))
n <- 20 # False correlation, non-linear relationship
x <- seq(0, 5, length=n)
y <- x^2/5
df <- data.frame(x, y); datareg <- cbind.data.frame(datareg, z=y)
p2 <- df%>%
plot_ly(x = ~x) %>%
add_markers(y = ~y, marker = list(size=6),
#color = ~y,
name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>%
layout(xaxis = list(range = c(-1, 6)))
n <- 20 # False correlation, outlier
m <- 0.6
x <- seq(0, 5, length=n)
y <- m*x
y[19] <- y[19]+3
df <- data.frame(x, y) ; datareg <- cbind.data.frame(datareg, t=y)
p3 <- df%>%
plot_ly(x = ~x) %>%
add_markers(y = ~y, marker = list(size=6),
#color = ~y,
name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>%
layout(xaxis = list(range = c(-1, 6)))
n <- 20 # False correlation, outlier
x <- c(rep(1,n-1),5)
y <- c(rnorm(n-1, 0, 0.5) ,5)
df <- data.frame(x, y); datareg <- cbind.data.frame(datareg, x1=x, u=y)
p4 <- df%>%
plot_ly(x = ~x) %>%
add_markers(y = ~y, marker = list(size=6),
#color = ~y,
name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(y~x, data = df)), line = list(width = 4), name = paste("Cor", round(cor(x,y),2), ",", "Cov", round(cov(x,y),2)))%>%
layout(xaxis = list(range = c(-1, 6)))
p5 <- datareg%>%
plot_ly(x = ~x) %>%
add_markers(y = ~v, marker = list(size=6),
#color = ~y,
name = 'y') %>%
add_lines(x = ~x, y = fitted(lm(v~x, data = datareg)), line = list(width = 4), name = paste("Cor", round(cor(datareg$x,v),2), ",", "Cov", round(cov(datareg$x,v),2)))%>%
layout(xaxis = list(range = c(-1, 6)))
Call:
lm(formula = v ~ x, data = datareg)
Residuals:
Min 1Q Median 3Q Max
-2.2332 -0.4799 0.1090 0.6271 1.6003
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.2061 0.4459 -0.462 0.649
x 1.1516 0.1525 7.552 0.000000551 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.035 on 18 degrees of freedom
Multiple R-squared: 0.7601, Adjusted R-squared: 0.7468
F-statistic: 57.04 on 1 and 18 DF, p-value: 0.000000551
Analysis of Variance Table
Response: v
Df Sum Sq Mean Sq F value Pr(>F)
x 1 61.071 61.071 57.037 0.000000551 ***
Residuals 18 19.273 1.071
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")
Shapiro-Wilk normality test
data: linearMod$residuals
W = 0.9782, p-value = 0.9087
[1] 50
speed dist
1 4 2
2 4 10
3 7 4
4 7 22
Call:
lm(formula = dist ~ speed, data = cars)
Residuals:
Min 1Q Median 3Q Max
-29.069 -9.525 -2.272 9.215 43.201
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -17.5791 6.7584 -2.601 0.0123 *
speed 3.9324 0.4155 9.464 1.49e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 15.38 on 48 degrees of freedom
Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
Analysis of Variance Table
Response: dist
Df Sum Sq Mean Sq F value Pr(>F)
speed 1 21186 21185.5 89.567 1.49e-12 ***
Residuals 48 11354 236.5
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p6 <- cars%>%
plot_ly(x = ~speed) %>%
add_markers(y = ~dist, marker = list(size=6),
#color = ~y,
name = 'dist') %>%
add_lines(x = ~speed, y = fitted(lm(dist~speed, data = cars)), line = list(width = 4), name = paste("Cor", round(cor(cars$speed,cars$dist),2), ",", "Cov", round(cov(cars$speed,cars$dist),2)))
p6
p <- ggplot(cars, aes(x = speed, y = dist)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_segment(aes(xend = speed, yend =predict(linearMod) ), alpha = .4) +
# > Color AND size adjustments made here...
geom_point(aes(color = abs(linearMod$residuals), size = abs(linearMod$residuals))) + # size also mapped
scale_color_continuous(low = "black", high = "red") +
guides(color = FALSE, size = FALSE) + # Size legend also removed
geom_point(aes(y = predict(linearMod)), shape = 1)
ggplotly(p, tooltip = c("x","y", "colour", "text", "label"))
p <- ggplot(data=data.frame(residuals=linearMod$residuals),
aes(x=residuals)) +
geom_histogram(aes(y =..density..), position="identity",
breaks=seq(min(linearMod$residuals),
max(linearMod$residuals),
length.out = 15),
alpha=.5, color="light blue", fill="red") +
geom_density(alpha=.3, fill="brown", col="black") +
labs(title="Residual Histogram")
ggplotly(p)
autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")
Shapiro-Wilk normality test
data: linearMod$residuals
W = 0.94509, p-value = 0.02152
youtube facebook newspaper sales
1 276.12 45.36 83.04 26.52
2 53.40 47.16 54.12 12.48
3 20.64 55.08 83.16 11.16
4 181.80 49.56 70.20 22.20
5 216.96 12.96 70.08 15.48
Call:
lm(formula = sales ~ youtube + facebook, data = marketing)
Residuals:
Min 1Q Median 3Q Max
-10.5572 -1.0502 0.2906 1.4049 3.3994
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.50532 0.35339 9.919 <2e-16 ***
youtube 0.04575 0.00139 32.909 <2e-16 ***
facebook 0.18799 0.00804 23.382 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.018 on 197 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
Shapiro-Wilk normality test
data: model$residuals
W = 0.91804, p-value = 0.00000000419
Model Checking with the olsrr Library. olsrr offers tools for detecting violations of standard regression assumptions. Here we take a look at residual diagnostics. The standard regression assumptions about the residuals/errors are the following:
The errors have a normal distribution (normality assumption).
The errors have mean zero.
The errors have the same but unknown variance (homoscedasticity assumption).
The errors are independent of each other (independent-errors assumption).
Graph for detecting violation of normality assumption.
Residual Normality Test: a test for detecting violation of the normality assumption.
-----------------------------------------------
Test Statistic pvalue
-----------------------------------------------
Shapiro-Wilk 0.918 0.0000
Kolmogorov-Smirnov 0.1253 0.0037
Cramer-von Mises 9.459 0.0000
Anderson-Darling 3.6974 0.0000
-----------------------------------------------
Correlation between observed residuals and expected residuals under normality.
[1] 0.9564499
Residual vs Fitted Values Plot: a scatter plot of residuals on the y-axis and fitted values on the x-axis, used to detect non-linearity, unequal error variances, and outliers.
Characteristics of a well-behaved residual vs fitted plot:
The residuals spread randomly around the 0 line, indicating that the relationship is linear.
The residuals form an approximate horizontal band around the 0 line, indicating homogeneity of the error variance.
No single residual stands visibly apart from the random pattern, indicating that there are no outliers.
Residual Histogram
Histogram of residuals for detecting violation of normality assumption.
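olsrr provides ready-made functions for each of the diagnostics described above; a sketch, assuming the fitted model is named model as in the marketing regression:
ols_plot_resid_qq(model)     # QQ plot for the normality assumption
ols_test_normality(model)    # Shapiro-Wilk, Kolmogorov-Smirnov, Cramer-von Mises, Anderson-Darling
ols_test_correlation(model)  # correlation of observed vs expected residuals under normality
ols_plot_resid_fit(model)    # residual vs fitted values plot
ols_plot_resid_hist(model)   # residual histogram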
new.marketing <- data.frame(
youtube = c(12, 19, 24),
facebook = c(10, 19, 24)
)
predict(model, new.marketing, interval = "confidence")
       fit      lwr      upr
1 5.934320 5.343442 6.525197
2 7.946552 7.418504 8.474599
3 9.115297 8.610423 9.620171
[1] -9.238514
[1] -1.959964
[1] 2.833333
2.5 % 97.5 %
(Intercept) 0.9077786 4.758888
x<-3
x[2]<-1.5
for (i in 3:30) {
x[i]<-x[i-1]-x[i-2]/2+i/5+cos(i)+2
}
set.seed(100)
value <- x + rnorm(30,0,2)
mytsdata <- ts(value, frequency=365, start=c(2014, 6)) # Daily
(mytsdata <- ts(value, frequency = 4, start = c(2010, 2))) # frequency 4 => Quarterly Data
          Qtr1      Qtr2      Qtr3      Qtr4
2010                1.995615  1.763062  1.452173
2011 4.779934 5.718965 8.779271 8.389820
2012 10.365962 5.398546 5.021821 6.601211
2013 8.987069 10.687979 13.110406 10.572172
2014 8.693758 7.936813 11.620355 11.202458
2015 18.759441 12.399895 13.134812 11.559743
2016 14.003463 13.301188 15.671638 14.750984
2017 15.016441 11.195322 14.881908
mytsdata <- ts(value, frequency = 12, start = 1990) # freq 12 => Monthly data.
(mytsdata <- ts(value, start=c(1980), end=c(2014), frequency=1)) # Yearly Data
Time Series:
Start = 1980
End = 2014
Frequency = 1
[1] 1.995615 1.763062 1.452173 4.779934 5.718965 8.779271 8.389820
[8] 10.365962 5.398546 5.021821 6.601211 8.987069 10.687979 13.110406
[15] 10.572172 8.693758 7.936813 11.620355 11.202458 18.759441 12.399895
[22] 13.134812 11.559743 14.003463 13.301188 15.671638 14.750984 15.016441
[29] 11.195322 14.881908 1.995615 1.763062 1.452173 4.779934 5.718965
library(xts) # eXtensible Time Series; loads zoo as a dependency
dates <- seq(as.Date("2016-01-01"), length = 30, by = "days") # Using xts library
(mytsdata1 <- xts(x = value, order.by = dates))
                [,1]
2016-01-01 1.995615
2016-01-02 1.763062
2016-01-03 1.452173
2016-01-04 4.779934
2016-01-05 5.718965
2016-01-06 8.779271
2016-01-07 8.389820
2016-01-08 10.365962
2016-01-09 5.398546
2016-01-10 5.021821
2016-01-11 6.601211
2016-01-12 8.987069
2016-01-13 10.687979
2016-01-14 13.110406
2016-01-15 10.572172
2016-01-16 8.693758
2016-01-17 7.936813
2016-01-18 11.620355
2016-01-19 11.202458
2016-01-20 18.759441
2016-01-21 12.399895
2016-01-22 13.134812
2016-01-23 11.559743
2016-01-24 14.003463
2016-01-25 13.301188
2016-01-26 15.671638
2016-01-27 14.750984
2016-01-28 15.016441
2016-01-29 11.195322
2016-01-30 14.881908
as.matrix(mytsdata1)      # extract the data as a matrix
as.Date(time(mytsdata1))  # extract the dates
coredata(mytsdata1)       # core data of the xts object
index(mytsdata1)          # time index of the xts object
as.xts(mytsdata)          # convert a ts object to xts
as.ts(mytsdata1)          # convert an xts object back to ts
first(mytsdata1)          # first observation
last(mytsdata1)           # last observation
merge(mytsdata, mytsdata1, join='left', fill=0) # left join on the time index
as.Date(time(mytsdata)) # Extract Dates
as.matrix(mytsdata) # Extract Data
data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata))) # Create dataframe
library(zoo) # Infrastructure for Regular and Irregular Time Series
as.zoo(mytsdata)  # convert to a zoo series
as.zoo(mytsdata1)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Quarterly Earnings")+
scale_x_date(date_labels = "%Y %b %d")+
theme(axis.text.x=element_text(angle=60, hjust=1))
ggplotly(p)
x<-0
x[2]<-1
set.seed(1)
for (i in 3:50) {
#x[i]<-x[i-1]-x[i-2]/2+i/5+3*cos(i)
x[i] <- x[i-1] - x[i-2]/5 + i/5 + 3*cos(i) + rnorm(1,0,i/4)
#x[i]<-x[i-1] + rnorm(1,0,i/5)
}
#value <- x + rnorm(50,0,5)
value <- x
mytsdata <- ts(value, frequency = 12, start = c(2010, 2))
dygraph(mytsdata)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
            Jan        Feb        Mar        Apr        May        Jun        Jul
2009
2010 1.000000 -1.839818 -3.017105 -1.842691 5.234162 9.841046 8.316776
2011 22.237127 16.637862 3.658659 7.686553 12.333570 17.485469 24.962183
2012 25.156549 26.243030 23.531001 20.708705 11.434616 12.531440 22.490719
2013 24.744799 24.425151 24.882243 36.510455 46.747396 47.942016 47.605089
2014 56.182945
Aug Sep Oct Nov Dec
2009 0.000000
2010 6.511892 6.177134 8.671431 11.451401 19.952744
2011 28.333312 28.007451 30.626409 35.790274 38.104631
2012 37.753707 36.636157 36.766465 35.771098 25.376246
2013 56.433489 61.217564 48.260981 35.207093 40.722804
2014
Jan Feb Mar Apr May
2010 1.0000000 -2.8398178 -1.1772875
2011 2.7799700 8.5013429 2.2843828 -5.5992647 -12.9792034
2012 5.1638647 2.3143578 -12.9480828 1.0864811 -2.7120285
2013 -0.9953678 -10.3948518 -0.6314469 -0.3196474 0.4570921
2014 -13.0538878 5.5157105 15.4601410
Jun Jul Aug Sep Oct
2010 1.1744144 7.0768531 4.6068836 -1.5242693 -1.8048846
2011 4.0278940 4.6470171 5.1518996 7.4767133 3.3711296
2012 -2.8222960 -9.2740888 1.0968231 9.9592793 15.2629881
2013 11.6282120 10.2369407 1.1946202 -0.3369278 8.8284007
2014
Nov Dec
2010 -0.3347581 2.4942974
2011 -0.3258617 2.6189584
2012 -1.1175503 0.1303086
2013 4.7840752 -12.9565837
2014
Autocorrelations of series 'mytsdata', by lag
0.0000 0.0833 0.1667 0.2500 0.3333 0.4167 0.5000 0.5833 0.6667 0.7500
1.000 0.859 0.708 0.609 0.519 0.405 0.337 0.325 0.314 0.278
0.8333 0.9167 1.0000 1.0833 1.1667 1.2500 1.3333
0.245 0.223 0.186 0.156 0.122 0.049 -0.016
Call:
lm(formula = mytsdata ~ time(mytsdata))
Residuals:
Min 1Q Median 3Q Max
-17.0685 -5.6420 0.6023 6.3084 16.8171
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -23966.4357 1958.8331 -12.23 2.30e-16 ***
time(mytsdata) 11.9230 0.9735 12.25 2.22e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.278 on 48 degrees of freedom
Multiple R-squared: 0.7576, Adjusted R-squared: 0.7525
F-statistic: 150 on 1 and 48 DF, p-value: 2.221e-16
Analysis of Variance Table
Response: mytsdata
Df Sum Sq Mean Sq F value Pr(>F)
time(mytsdata) 1 10279.2 10279.2 150 2.221e-16 ***
Residuals 48 3289.4 68.5
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mytstrend <- linearMod$fitted.values
mytsde_trend <- linearMod$residuals
dataplot<-cbind(mytsdata, mytstrend, mytsde_trend)
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
autoplot(linearMod, colour="red", which=1:6, ncol = 2, label.size = 3, title="Model checking plots")
Shapiro-Wilk normality test
data: linearMod$residuals
W = 0.98465, p-value = 0.7566
Time series regression with "ts" data:
Start = 2010(3), End = 2014(3)
Call:
dynlm(formula = mytsdata ~ L(mytsdata, 1))
Residuals:
Min 1Q Median 3Q Max
-14.5796 -3.5823 0.0598 3.2617 15.4817
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.72080 1.68965 1.61 0.114
L(mytsdata, 1) 0.93266 0.05967 15.63 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.675 on 47 degrees of freedom
Multiple R-squared: 0.8387, Adjusted R-squared: 0.8352
F-statistic: 244.3 on 1 and 47 DF, p-value: < 2.2e-16
library(forecast)
ts.stl <- stl(mytsdata, "periodic") # decompose the TS
ts.sa <- seasadj(ts.stl)
autoplot(ts.stl)
dataplot<-cbind(mytsdata, seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2])
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
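The test below is tseries::adf.test(); the two bare values that follow it look like nsdiffs() and ndiffs() from the forecast library (an assumption). A sketch:
library(tseries)
adf.test(mytsdata) # null hypothesis: unit root (non-stationary)
nsdiffs(mytsdata)  # number of seasonal differences suggested
ndiffs(mytsdata)   # number of regular differences suggested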
Augmented Dickey-Fuller Test
data: mytsdata
Dickey-Fuller = -3.3154, Lag order = 3, p-value = 0.07932
alternative hypothesis: stationary
[1] 0
[1] 1
stationaryTS <- diff(mytsdata, differences= ndiffs(mytsdata))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary
Augmented Dickey-Fuller Test
data: stationaryTS
Dickey-Fuller = -3.9373, Lag order = 3, p-value = 0.01973
alternative hypothesis: stationary
Series: ts.stl$time.series[, 3]
ARIMA(2,0,0) with zero mean
Coefficients:
ar1 ar2
0.8735 -0.6074
s.e. 0.1167 0.1158
sigma^2 estimated as 18.39: log likelihood=-143.36
AIC=292.72 AICc=293.24 BIC=298.45
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.1290637 4.201792 3.438581 124.7528 161.216 0.4287052
ACF1
Training set 0.03800343
The data are clearly non-stationary, as the series wanders up and down for long periods. Consequently, we will take a first difference of the data.
The PACF shown is suggestive of an AR(2) model. So an initial candidate model is an ARIMA(2,1,0). There are no other obvious candidate models.
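A sketch of the fit with the forecast library; mytsdatadj is assumed to be the seasonally adjusted series named in the output below:
fit <- Arima(mytsdatadj, order = c(2, 1, 0)) # ARIMA(2,1,0): AR(2) on first differences
summary(fit)
checkresiduals(fit) # residual plots plus the Ljung-Box test shown below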
Series: mytsdatadj
ARIMA(2,1,0)
Coefficients:
ar1 ar2
0.3716 -0.4716
s.e. 0.1341 0.1319
sigma^2 estimated as 28.68: log likelihood=-151.02
AIC=308.04 AICc=308.57 BIC=313.71
Ljung-Box test
data: Residuals from ARIMA(2,1,0)
Q* = 7.5718, df = 8, p-value = 0.4764
Model df: 2. Total lags used: 10
dataplot<-cbind(trend=m[["data"]][,"trend"], seasonaladj=m[["data"]][,"seasonaladj"], irregular=m[["data"]][,"irregular"], data=mytsdata)
dygraph(dataplot)%>%
dyOptions(gridLineColor = "#E1E5EA", axisLineColor = "#303030", drawPoints
=TRUE, strokeWidth = 2, colors = c("blue","green","grey","red"))%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
n <- 200
value <- rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
Smoothing with a centered five-point moving average.
values <- stats::filter(value, sides=2, filter=rep(1/5,5))
mytsdata <- ts(values, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
value <- (1:n)/10 + rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
value <- 0
for (i in 2:n) {
value[i] <- value[i-1] + rnorm(1,0,1)
}
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
value <- 0
value[2] <- 1
for (i in 3:n) {
value[i] <- value[i-1] - value[i-2]/9 + rnorm(1,0,1)
}
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
value <- 2*cos(2*pi*1:n/10 +0.6*pi) + rnorm(n,0,1)
mytsdata <- ts(value, start = 1, end = n)
mytsdatafr<-data.frame(Y=as.matrix(mytsdata), date=as.Date(time(mytsdata)))
p <- ggplot(mytsdatafr, aes(x=date, y=Y)) +
geom_line(alpha=1, color="#69b3a2", size=2) +
geom_point(alpha=1, color="red", size=2) +
xlab("Time")+ylab("Value")
ggplotly(p)
value <- (1:n)/20 + cos((1:n)/10) + rnorm(n,0,1)
mytsdatax <- ts(value, start = 1, end = n)
value <- (1:n)/20 + rnorm(n,0,2) + 5
mytsdatay <- ts(value, start = 1, end = n)
mytsdata <- cbind(mytsdatax, mytsdatay)
dygraph(mytsdata)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
library(astsa) # Applied Statistical Time Series Analysis: provides jj, globtemp, speech, soi, rec, fmri1
data(jj)
plot(jj, type="o", ylab="Quarterly Earnings per Share")
jjdata<-data.frame(Y=as.matrix(jj), date=as.Date(time(jj)))
jjdata %>%
plot_ly(x = ~date, y = ~Y, mode = 'lines+markers', type = "scatter", name="Confirm", width = 850, height = 750)
library(dygraphs)
dygraph(jj)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
dygraph(globtemp)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
dygraph(speech)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
#as.xts(AirPassengers)
dygraph(AirPassengers)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
lungDeaths <- cbind(mdeaths, fdeaths)
dygraph(lungDeaths)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
SoiRec <- cbind(soi, rec) # Southern Oscillation Index and Recruitment
dy_graph <- list(dygraph(soi)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector(),
dygraph(rec)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector())
htmltools::browsable(htmltools::tagList(dy_graph))
SoiRecdata<-data.frame(as.matrix(SoiRec), date=as.Date(time(SoiRec)))
subplot(SoiRecdata %>%
plot_ly(x = ~date, y = ~soi, mode = 'lines+markers', type = "scatter", name="soi"),
SoiRecdata %>%
plot_ly(x = ~date, y = ~rec, mode = 'lines+markers', type = "scatter", name="rec"), nrows = 2, shareX = TRUE)
dy_graph <- list(
dygraph(fmri1[,2:5])%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector(),
dygraph(fmri1[,6:9])%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector())
htmltools::browsable(htmltools::tagList(dy_graph))
trend <- sin(seq(1,41))+runif(41)
data <- data.frame(
time=seq(from=Sys.Date()-40, to=Sys.Date(), by=1 ),
value1=trend,
value2=trend+rnorm(41),
value3=trend+rnorm(41),
value4=trend+rnorm(41)
)
# switch to xts format
data <- xts(x = data[,-1], order.by = data$time)
# Plot it
p <- dygraph(data) %>%
dyCandlestick()%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
p
n <- 100
x <- 1 + -0.05*(1:n) + cos((1:n)*pi/10) + rnorm(n,0,1)
mytsX <- ts(x, frequency = 4, start = c(1980, 2))
y<-0
set.seed(1)
for (i in 2:n) {
y[i] <- 0.03 + y[i-1] + rnorm(1,0,1)
}
mytsY <- ts(y, frequency = 4, start = c(1980, 2))
mytsdata <- cbind(mytsX, mytsY)
dygraph(mytsdata)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
Response mytsX :
Call:
lm(formula = mytsX ~ time(mytsdata))
Residuals:
Min 1Q Median 3Q Max
-3.5583 -0.6055 0.1441 0.8457 2.5433
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 426.38967 32.54121 13.10 <2e-16 ***
time(mytsdata) -0.21471 0.01633 -13.15 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.179 on 98 degrees of freedom
Multiple R-squared: 0.6382, Adjusted R-squared: 0.6345
F-statistic: 172.9 on 1 and 98 DF, p-value: < 2.2e-16
Response mytsY :
Call:
lm(formula = mytsY ~ time(mytsdata))
Residuals:
Min 1Q Median 3Q Max
-2.5659 -1.0276 -0.2509 1.0234 3.0951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1189.71594 39.87865 -29.83 <2e-16 ***
time(mytsdata) 0.60070 0.02001 30.02 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.444 on 98 degrees of freedom
Multiple R-squared: 0.9019, Adjusted R-squared: 0.9009
F-statistic: 900.9 on 1 and 98 DF, p-value: < 2.2e-16
Analysis of Variance Table
Df Pillai approx F num Df den Df Pr(>F)
(Intercept) 1 0.96816 1474.56 2 97 < 2.2e-16 ***
time(mytsdata) 1 0.92730 618.61 2 97 < 2.2e-16 ***
Residuals 98
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mytstrend <- linearMod$fitted.values
mytsde_trend <- linearMod$residuals
dataplot<-cbind(mytsdata, mytstrend)
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
ts.stl <- stl(mytsdata[,1], "periodic") # decompose the TS
ts.sa <- seasadj(ts.stl)
autoplot(ts.stl)
dataplot<-cbind(mytsdata[,1], seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2])
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
ts.stl <- stl(mytsdata[,2], "periodic") # decompose the TS
ts.sa <- seasadj(ts.stl)
autoplot(ts.stl)
dataplot<-cbind(mytsdata[,2], seasonal=ts.stl$time.series[,1], trend=ts.stl$time.series[,2])
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
Augmented Dickey-Fuller Test
data: mytsdata[, 1]
Dickey-Fuller = -3.4149, Lag order = 4, p-value = 0.05601
alternative hypothesis: stationary
[1] 0
[1] 1
stationaryTS <- diff(mytsdata[,1], differences= ndiffs(mytsdata[,1]))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary
Augmented Dickey-Fuller Test
data: stationaryTS
Dickey-Fuller = -5.7598, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
dataplot<-cbind(mytsdata[,1],stationaryTS)
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
Augmented Dickey-Fuller Test
data: mytsdata[, 2]
Dickey-Fuller = -2.801, Lag order = 4, p-value = 0.2448
alternative hypothesis: stationary
[1] 0
[1] 1
stationaryTS <- diff(mytsdata[,2], differences= ndiffs(mytsdata[,2]))
adf.test(stationaryTS) # p-value < 0.05 indicates the TS is stationary
Augmented Dickey-Fuller Test
data: stationaryTS
Dickey-Fuller = -5.3845, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
dataplot<-cbind(mytsdata[,2],stationaryTS)
dygraph(dataplot)%>%
dyOptions(drawPoints = TRUE, pointSize = 4)%>%
dyHighlight(highlightSeriesOpts = list(strokeWidth = 3))%>%
dyRangeSelector()
The data are clearly non-stationary, as the series wanders up and down for long periods. Consequently, we will take a first difference of the data.
The PACF shown is suggestive of an AR(1) model. So an initial candidate model is an ARIMA(1,1,0). There are no other obvious candidate models.
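As before, a sketch of the candidate fit with forecast; the second model below, ARIMA(0,1,1) with drift, was plausibly selected by auto.arima() (an assumption):
fit1 <- Arima(mytsdatadj, order = c(1, 1, 0)) # candidate suggested by the PACF
checkresiduals(fit1)
fit2 <- auto.arima(mytsdatadj) # automatic order selection
checkresiduals(fit2)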
Series: mytsdatadj
ARIMA(1,1,0)
Coefficients:
ar1
-0.5128
s.e. 0.0861
sigma^2 estimated as 1.542: log likelihood=-161.56
AIC=327.12 AICc=327.24 BIC=332.31
Ljung-Box test
data: Residuals from ARIMA(1,1,0)
Q* = 12.871, df = 7, p-value = 0.07533
Model df: 1. Total lags used: 8
Series: mytsdatadj
ARIMA(0,1,1) with drift
Coefficients:
ma1 drift
-0.7453 -0.0545
s.e. 0.1093 0.0305
sigma^2 estimated as 1.362: log likelihood=-155.16
AIC=316.33 AICc=316.58 BIC=324.11
Ljung-Box test
data: Residuals from ARIMA(0,1,1) with drift
Q* = 8.1736, df = 6, p-value = 0.2257
Model df: 2. Total lags used: 8
[1] 67.159
[1] 253.80 748.08 123.19 143.74 68.08 72.23 122.97 252.78 8.36 17.23
[1] 181.0443
[1] 13.45527
[1] 72.848
[1] 104.90 469.24 281.84 312.51 6.56
[1] 293.7633
[1] 17.13953
[1] -3.000000e-01 -2.000000e-01 -1.000000e-01 5.551115e-17 1.000000e-01
[6] 2.000000e-01 3.000000e-01
[1] 0
[1] 0.04666667
[1] 1
[1] 0.04666667
[1] 0
[1] 4.666667
[1] 1
[1] 4.666667
x<-seq(0,0.5,0.01)
y=x/(1-x)
df=cbind.data.frame(x,y)
p<-ggplot(data=df) + geom_line(aes(x=x, y=y)) +
geom_hline(yintercept=1, linetype="dashed", color = "blue", size=1) +
#geom_vline(xintercept =range(0.5,1), linetype="dashed", color = "blue", size=1)+
geom_segment(aes(x=0.5,xend=0.5,y=0,yend=1.2), linetype="dashed", color = "blue", size=1)+
geom_segment(aes(x=1,xend=1,y=0,yend=1.2), linetype="dashed", color = "blue", size=1)+
geom_segment(aes(x = 0.2, y = 1.2, xend = 0.3, yend = 0.9), arrow = arrow(length = unit(0.5, "cm")))+
annotate(geom="text", x=0.2, y=1.25, label="some writing",
color="red") +
annotate(geom="text", x=0.6, y=1.15, label=paste0("u >","frac(x,1-x)"),
color="red", parse = TRUE) +
geom_ribbon(aes(x=x, ymin=1, ymax=y), data=df, fill="purple", alpha = 0.15)+
xlab("x")+ylab("u")+ggtitle("Some Graph")
p
library(AER)
data(Affairs)
summary(lm(affairs ~., data=Affairs))