library(ggplot2)    # plotting
library(ggExtra)    # marginal plots
library(hrbrthemes) # theme_ipsum
library(extrafont)  # extra font families (Times New Roman)
library(gridExtra)  # grid.arrange
library(ggrepel)    # geom_label_repel
library(goftest)    # goodness-of-fit tests
theme_set(theme_ipsum(base_family = "Times New Roman",
                      base_size = 10, axis_title_size = 10))
First, we read the data and build the data frame:
HC.BP = data.frame("H"    = numeric(16000),
                   "C"    = numeric(16000),
                   "Dist" = numeric(16000),
                   "D"    = numeric(16000),
                   "N"    = numeric(16000))
filecsv = read.csv("../Data/HC_series_fk0_16000.csv")
HC.BP$H = filecsv[,1]
HC.BP$C = filecsv[,2]
HC.BP$D = as.factor(filecsv[,3])
# N: 100 replications of each sample size 10000, 20000, ..., 100000,
# repeated 16 times to fill the 16000 rows
HC.BP$N = as.factor(rep(rep(seq(1e+04, 1e+05, by = 1e+04), each = 100), times = 16))
summary(HC.BP)
       H                C                 Dist     D        N        
 Min.   :0.9199   Min.   :0.0002271   Min.   :0   3:4000   10000  :1600
 1st Qu.:0.9869   1st Qu.:0.0041165   1st Qu.:0   4:4000   20000  :1600
 Median :0.9924   Median :0.0075186   Median :0   5:4000   30000  :1600
 Mean   :0.9902   Mean   :0.0096574   Mean   :0   6:4000   40000  :1600
 3rd Qu.:0.9959   3rd Qu.:0.0128926   3rd Qu.:0            50000  :1600
 Max.   :0.9998   Max.   :0.0745442   Max.   :0            60000  :1600
                                                           (Other):6400
ggplot(data = HC.BP, aes(x = H, y = C, color = D)) +
geom_point(size = 3) +
labs(title = "Points in the HxC plane",
x = expression(italic(H)),
y = expression(italic(C))
)
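ggExtra is loaded but not yet used; as a sketch, a variant of this plot with marginal histograms can show the distributions of \(H\) and \(C\) at once (assuming a ggExtra version providing the groupFill argument):
p.HC <- ggplot(HC.BP, aes(x = H, y = C, color = D)) +
    geom_point(size = 1)
# marginal histograms of H and C, filled by embedding dimension
ggExtra::ggMarginal(p.HC, type = "histogram", groupFill = TRUE)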
Now, the same points separated by the embedding dimension \(D\):
ggplot(data=HC.BP, aes(x=H, y=C, color=D)) +
geom_point(size=2) +
labs(title="Points in the HxC plane",
x=expression(italic(H)),
y=expression(italic(C))) +
facet_grid(.~D)
We see a seemingly linear trend, but at this point we have to recall that \(C = H \cdot \mathrm{Dist}\), in which \(\mathrm{Dist}\) is a distance (in this case, the Jensen–Shannon distance) to an equilibrium distribution (in this case, the Uniform law); we write \(\mathrm{Dist}\) rather than \(D\) to avoid confusion with the embedding dimension.
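For concreteness, here is a minimal sketch of how a single \((H, C)\) pair arises from a probability vector; the helper functions are hypothetical, and the customary normalising constant of the Jensen–Shannon term is omitted, so the scale differs from the data above:
shannon <- function(p) { p <- p[p > 0]; -sum(p * log(p)) }
hc_pair <- function(p) {
  k <- length(p)
  u <- rep(1 / k, k)          # equilibrium distribution: uniform law
  H <- shannon(p) / log(k)    # normalised Shannon entropy
  # Jensen-Shannon divergence between p and u (normalising constant omitted)
  JS <- shannon((p + u) / 2) - (shannon(p) + shannon(u)) / 2
  c(H = H, Dist = JS, C = H * JS)
}
hc_pair(c(0.40, 0.25, 0.15, 0.10, 0.06, 0.04))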
In order to see the relationships among the variables we are studying, let’s look at the \(\mathrm{Dist}\times C\) and \(H\times\mathrm{Dist}\) planes.
HC.BP$Dist = HC.BP$C / HC.BP$H  # recover the distance term from C = H * Dist
ggplot(data=HC.BP, aes(x=Dist, y=C, color=D)) +
geom_point(size=2) +
labs(title="Points in the Dist x C plane",
x="Dist",
y=expression(italic(C))) +
facet_grid(.~D)
ggplot(data=HC.BP, aes(x=H, y=Dist, color=D)) +
geom_point(size=2) +
labs(title="Points in the H x Dist plane",
x=expression(italic(H)),
y="Dist") +
facet_grid(.~D)
We drop the derived Dist column and fit a single model in which each embedding dimension \(D\) has its own intercept and slope:
HC.BP.regression.data = HC.BP[,-3]  # column 3 is Dist
lm.HC = lm(data=HC.BP.regression.data, formula = C ~ H * D)
summary(lm.HC)
Call:
lm(formula = C ~ H * D, data = HC.BP.regression.data)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0046003 -0.0000673 -0.0000218  0.0000608  0.0037435 

Coefficients:
              Estimate Std. Error   t value Pr(>|t|)    
(Intercept)  0.9778085  0.0005905  1655.768  < 2e-16 ***
H           -0.9777680  0.0005964 -1639.509  < 2e-16 ***
D4          -0.0024713  0.0008190    -3.018  0.00255 ** 
D5          -0.0018136  0.0008318    -2.180  0.02924 *  
D6          -0.0039134  0.0008092    -4.836 1.34e-06 ***
H:D4         0.0024901  0.0008271     3.011  0.00261 ** 
H:D5         0.0018344  0.0008400     2.184  0.02899 *  
H:D6         0.0039479  0.0008173     4.831 1.37e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.0002963 on 15992 degrees of freedom
Multiple R-squared:  0.9986,    Adjusted R-squared:  0.9986
F-statistic: 1.621e+06 on 7 and 15992 DF,  p-value: < 2.2e-16
hist(lm.HC$residuals, breaks = 200)  # histogram of the residuals
plot(lm.HC)                          # standard lm diagnostic plots
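goftest is loaded but not yet used; as a sketch, one could back the visual diagnostics with an Anderson–Darling test of the residuals (with the normal parameters estimated from the residuals themselves):
res <- residuals(lm.HC)
# Anderson-Darling test of the residuals against a normal law with
# estimated mean and standard deviation
goftest::ad.test(res, null = "pnorm", mean = mean(res), sd = sd(res),
                 estimated = TRUE)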
All the terms involving the embedding dimension are significant at the 5% level, so the above results favour keeping \(D\) as a factor in the linear regression model.
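One way to make this precise is a nested-model F test; a minimal sketch:
lm.H0 <- lm(C ~ H, data = HC.BP.regression.data)  # model without D
anova(lm.H0, lm.HC)                               # F test for all D terms at once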
ggplot(HC.BP.regression.data, aes(x=H, y=C, colour=D)) +
geom_point(size=2) +
geom_smooth(method="lm", formula = y~x) +
labs(title="Complexity explained by Entropy",
x=expression(italic(H)),
y=expression(italic(C)))
Now, let’s see the fitted lines on a semilogarithmic scale, faceted by \(D\).
ggplot(HC.BP.regression.data, aes(x=H, y=C, colour=D)) +
    geom_point(size=2) +
    scale_y_log10() +
    geom_smooth(method="lm", formula = y~x) +
    labs(title="Complexity explained by Entropy",
         x=expression(italic(H)),
         y=expression(italic(C)~"[logarithmic scale]")) +
    facet_grid(.~D)
The model we are fitting is \[ \begin{cases} C^{(3)} = \beta_0^{(3)} + \beta_1^{(3)} H^{(3)},\\ C^{(4)} = \beta_0^{(4)} + \beta_1^{(4)} H^{(4)},\\ C^{(5)} = \beta_0^{(5)} + \beta_1^{(5)} H^{(5)},\\ C^{(6)} = \beta_0^{(6)} + \beta_1^{(6)} H^{(6)}; \end{cases} \] in other words, a different linear relationship for each embedding dimension \(D\).
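Since \(D=3\) is the reference level in lm.HC, the parameters of the other dimensions are sums of coefficients; for instance, for \(D=4\) (a sketch):
b <- coef(lm.HC)
# intercept and slope for D = 4: baseline (D = 3) plus the D4 offsets
c(beta0_4 = unname(b["(Intercept)"] + b["D4"]),
  beta1_4 = unname(b["H"] + b["H:D4"]))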
Let’s save the estimates and their confidence intervals.
# In lm.HC, the parameters for D = 4, 5, 6 are sums of coefficients, so their
# confidence intervals cannot be read off confint(lm.HC) one term at a time.
# Refitting the same model in the cell-means parameterisation (one intercept
# and one slope per dimension, no reference level) makes confint() direct.
lm.HC.cells <- lm(C ~ 0 + D + D:H, data = HC.BP.regression.data)
ci <- confint(lm.HC.cells)
estimates <- data.frame(beta0    = coef(lm.HC.cells)[1:4],
                        beta0inf = ci[1:4, 1],
                        beta0sup = ci[1:4, 2],
                        beta1    = coef(lm.HC.cells)[5:8],
                        beta1inf = ci[5:8, 1],
                        beta1sup = ci[5:8, 2],
                        D        = 3:6)
estimates
The fitted parameters, rounded to three decimals (computed from the coefficients in the summary above), are: \[ \begin{cases} \widehat{\beta}_0^{(3)} = 0.978, & \widehat{\beta}_1^{(3)} = -0.978,\\ \widehat{\beta}_0^{(4)} = 0.975, & \widehat{\beta}_1^{(4)} = -0.975,\\ \widehat{\beta}_0^{(5)} = 0.976, & \widehat{\beta}_1^{(5)} = -0.976,\\ \widehat{\beta}_0^{(6)} = 0.974, & \widehat{\beta}_1^{(6)} = -0.974. \end{cases} \] Note that \(\widehat{\beta}_0^{(d)} \approx -\widehat{\beta}_1^{(d)}\) for every dimension \(d\).
Let’s see how they behave.
plot.beta0 <- ggplot(data=estimates, aes(x=D, y=beta0)) +
geom_point() +
labs(title="Intercept vs. embedding dimension",
x=expression(italic(D)),
y=expression(widehat(beta)[0]))
plot.beta1 <- ggplot(data=estimates, aes(x=D, y=beta1)) +
geom_point() +
labs(title="Slope vs embedding dimension",
x=expression(italic(D)),
y=expression(widehat(beta)[1]))
grid.arrange(plot.beta0, plot.beta1, nrow=1)
And their joint variation:
ggplot(data=estimates, aes(x=beta0, y=beta1)) +
geom_smooth(method = "lm", formula = y~x) +
geom_point(size=2) +
# geom_errorbar(aes(ymin=beta1inf, ymax=beta1sup)) +
# geom_errorbarh(aes(xmin=beta0inf, xmax=beta0sup)) +
geom_label_repel(aes(label=D), direction = "x") +
labs(title="Intercept vs Slope",
x=expression(widehat(beta)[0]),
y=expression(widehat(beta)[1])) +
coord_fixed()
We verify that there is a linear relationship between \(\widehat{\beta}_0\) and \(\widehat{\beta}_1\); as the fitted values above suggest, its slope is close to \(-1\).
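To quantify it, a quick sketch using the estimates data frame:
# slope and intercept of the beta1-vs-beta0 relationship (only four points)
summary(lm(beta1 ~ beta0, data = estimates))$coefficients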