The purpose of this study is to describe the distribution of ties in the decimal expansions of irrational numbers when considering sequences of embedding dimension \(D\) (and time delay \(\tau=1\)). The findings will help to build a model for simulating such occurrences and, subsequently, to assess imputation techniques in a Monte Carlo study. Thus, in this report we carry out an exploratory analysis of the data, examining the percentage of tied sequences for each value of the embedding dimension.

Load packages and sources

# Attach the plotting packages used in this report, installing any that are
# missing. library() performs the final attach so that a package which still
# cannot be loaded stops the script here with a clear error, instead of
# require() returning FALSE and the failure surfacing later as a
# "could not find function" error.
for (pkg in c("ggpubr", "ggplot2", "ggthemes")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Project helpers; formationPattern(), percentual.equalities() and
# binary.equalities() used below are presumably defined in this file.
source('Bandt-Pompe.R')

Reading the numbers

We have produced three files with Mathematica: e.txt, pi.txt, and sqrt2.txt. They contain the decimal expansions of these irrational numbers with \(100,000\) digits.

# Read the raw text of each expansion. unlist() flattens the data frame
# returned by read.table() before the conversion: as.character() applied
# directly to a data frame deparses any column holding more than one element,
# so all three files are now read the same way.
e.data <- as.character(unlist(read.table('../Data/e.txt', stringsAsFactors=FALSE, fileEncoding="latin1")))
pi.data <- as.character(unlist(read.table('../Data/pi.txt', stringsAsFactors=FALSE, fileEncoding="latin1")))
sqrt2.data <- as.character(unlist(read.table('../Data/sqrt2.txt', stringsAsFactors=FALSE, fileEncoding="latin1")))

# Split the text into single characters and convert them to numeric digits.
# The ranges start at 3, which skips the first two characters (presumably the
# integer digit and the decimal point -- confirm against the data files).
# NOTE(review): the upper bound for sqrt2 (100030) differs from the one used
# for e and pi (100011) -- confirm this matches the layout of each file.
e.vector <- as.numeric(unlist(strsplit(e.data, ""))[3:100011])
pi.vector <- as.numeric(unlist(strsplit(pi.data, ""))[3:100011])
sqrt2.vector <- as.numeric(unlist(strsplit(sqrt2.data, ""))[3:100030])

Computing the number of tied sequences for each value of \(D\in\{3,4,5,6\}\)

# Embedding dimension and time delay; Tau is reused by the later chunks.
D <- 3
Tau <- 1

# For each series: the elements produced by formationPattern(), then the
# proportion of them that percentual.equalities() reports as tied.
e.elements.D3 <- formationPattern(e.vector, D, Tau, 1)
e.percent.D3 <- percentual.equalities(e.elements.D3)

pi.elements.D3 <- formationPattern(pi.vector, D, Tau, 1)
pi.percent.D3 <- percentual.equalities(pi.elements.D3)

sqrt2.elements.D3 <- formationPattern(sqrt2.vector, D, Tau, 1)
sqrt2.percent.D3 <- percentual.equalities(sqrt2.elements.D3)

# Print the three proportions as percentages rounded to three decimals.
cat(
  "Number of tied sequences \ne: ", round(e.percent.D3*100, 3),
  "%\npi: ", round(pi.percent.D3*100, 3),
  "%\nsqrt2: ", round(sqrt2.percent.D3*100, 3),
  "%\n"
)
Number of tied sequences 
e:  28.115 %
pi:  27.913 %
sqrt2:  27.845 %
# Same computation with embedding dimension 4 (Tau keeps its earlier value).
D <- 4

# For each series: the elements produced by formationPattern(), then the
# proportion of them that percentual.equalities() reports as tied.
e.elements.D4 <- formationPattern(e.vector, D, Tau, 1)
e.percent.D4 <- percentual.equalities(e.elements.D4)

pi.elements.D4 <- formationPattern(pi.vector, D, Tau, 1)
pi.percent.D4 <- percentual.equalities(pi.elements.D4)

sqrt2.elements.D4 <- formationPattern(sqrt2.vector, D, Tau, 1)
sqrt2.percent.D4 <- percentual.equalities(sqrt2.elements.D4)

# Print the three proportions as percentages rounded to three decimals.
cat(
  "Number of tied sequences \ne: ", round(e.percent.D4*100, 3),
  "%\npi: ", round(pi.percent.D4*100, 3),
  "%\nsqrt2: ", round(sqrt2.percent.D4*100, 3),
  "%\n"
)
Number of tied sequences 
e:  49.622 %
pi:  49.538 %
sqrt2:  49.487 %
# Same computation with embedding dimension 5 (Tau keeps its earlier value).
D <- 5

# For each series: the elements produced by formationPattern(), then the
# proportion of them that percentual.equalities() reports as tied.
e.elements.D5 <- formationPattern(e.vector, D, Tau, 1)
e.percent.D5 <- percentual.equalities(e.elements.D5)

pi.elements.D5 <- formationPattern(pi.vector, D, Tau, 1)
pi.percent.D5 <- percentual.equalities(pi.elements.D5)

sqrt2.elements.D5 <- formationPattern(sqrt2.vector, D, Tau, 1)
sqrt2.percent.D5 <- percentual.equalities(sqrt2.elements.D5)

# Print the three proportions as percentages rounded to three decimals.
cat(
  "Number of tied sequences \ne: ", round(e.percent.D5*100, 3),
  "%\npi: ", round(pi.percent.D5*100, 3),
  "%\nsqrt2: ", round(sqrt2.percent.D5*100, 3),
  "%\n"
)
Number of tied sequences 
e:  69.866 %
pi:  69.92 %
sqrt2:  69.757 %
# Same computation with embedding dimension 6 (Tau keeps its earlier value).
D <- 6

# For each series: the elements produced by formationPattern(), then the
# proportion of them that percentual.equalities() reports as tied.
e.elements.D6 <- formationPattern(e.vector, D, Tau, 1)
e.percent.D6 <- percentual.equalities(e.elements.D6)

pi.elements.D6 <- formationPattern(pi.vector, D, Tau, 1)
pi.percent.D6 <- percentual.equalities(pi.elements.D6)

sqrt2.elements.D6 <- formationPattern(sqrt2.vector, D, Tau, 1)
sqrt2.percent.D6 <- percentual.equalities(sqrt2.elements.D6)

# Print the three proportions as percentages rounded to three decimals.
cat(
  "Number of tied sequences \ne: ", round(e.percent.D6*100, 3),
  "%\npi: ", round(pi.percent.D6*100, 3),
  "%\nsqrt2: ", round(sqrt2.percent.D6*100, 3),
  "%\n"
)
Number of tied sequences 
e:  85.122 %
pi:  85.145 %
sqrt2:  84.956 %

Analyzing the binary vectors with the position of the tied sequences

# Binary vectors for e, one per embedding dimension, as returned by
# binary.equalities(); per the accompanying text, label 0 marks a sequence
# without repeated elements.
e.binary.D3 <- binary.equalities(e.elements.D3)
e.binary.D4 <- binary.equalities(e.elements.D4)
e.binary.D5 <- binary.equalities(e.elements.D5)
e.binary.D6 <- binary.equalities(e.elements.D6)

# Number of leading positions shown in each panel.
n.elements <- 100

# Long-format data: the first n.elements labels of each dimension stacked,
# with the position index and the dimension as a factor.
e.binary.df <- data.frame(
  series = c(
    e.binary.D3[1:n.elements], e.binary.D4[1:n.elements],
    e.binary.D5[1:n.elements], e.binary.D6[1:n.elements]
  ),
  elements = rep(seq_len(n.elements), 4),
  D = as.factor(rep(c(3, 4, 5, 6), each = n.elements))
)

# One panel per dimension. The facet formula is passed positionally because
# the named `facets` argument of facet_grid() is deprecated in ggplot2.
ggplot(e.binary.df, mapping = aes(x = elements, y = series, group = D, color = D)) +
  xlab("") + ylab("") +
  ggtitle("e number") +
  geom_line(position = position_dodge(0.8)) +
  theme_few(base_size = 13, base_family = "serif") +
  facet_grid(D ~ .) +
  scale_y_continuous(breaks = c(0, 1)) +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

# Binary vectors for pi, one per embedding dimension, as returned by
# binary.equalities(); per the accompanying text, label 0 marks a sequence
# without repeated elements.
pi.binary.D3 <- binary.equalities(pi.elements.D3)
pi.binary.D4 <- binary.equalities(pi.elements.D4)
pi.binary.D5 <- binary.equalities(pi.elements.D5)
pi.binary.D6 <- binary.equalities(pi.elements.D6)

# Number of leading positions shown in each panel.
n.elements <- 100

# Long-format data: the first n.elements labels of each dimension stacked,
# with the position index and the dimension as a factor.
pi.binary.df <- data.frame(
  series = c(
    pi.binary.D3[1:n.elements], pi.binary.D4[1:n.elements],
    pi.binary.D5[1:n.elements], pi.binary.D6[1:n.elements]
  ),
  elements = rep(seq_len(n.elements), 4),
  D = as.factor(rep(c(3, 4, 5, 6), each = n.elements))
)

# One panel per dimension. The facet formula is passed positionally because
# the named `facets` argument of facet_grid() is deprecated in ggplot2.
ggplot(pi.binary.df, mapping = aes(x = elements, y = series, group = D, color = D)) +
  xlab("") + ylab("") +
  ggtitle("pi number") +
  geom_line(position = position_dodge(0.8)) +
  theme_few(base_size = 13, base_family = "serif") +
  facet_grid(D ~ .) +
  scale_y_continuous(breaks = c(0, 1)) +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

# Binary vectors for sqrt(2), one per embedding dimension, as returned by
# binary.equalities(); per the accompanying text, label 0 marks a sequence
# without repeated elements.
sqrt2.binary.D3 <- binary.equalities(sqrt2.elements.D3)
sqrt2.binary.D4 <- binary.equalities(sqrt2.elements.D4)
sqrt2.binary.D5 <- binary.equalities(sqrt2.elements.D5)
sqrt2.binary.D6 <- binary.equalities(sqrt2.elements.D6)

# Number of leading positions shown in each panel.
n.elements <- 100

# Long-format data: the first n.elements labels of each dimension stacked,
# with the position index and the dimension as a factor.
sqrt2.binary.df <- data.frame(
  series = c(
    sqrt2.binary.D3[1:n.elements], sqrt2.binary.D4[1:n.elements],
    sqrt2.binary.D5[1:n.elements], sqrt2.binary.D6[1:n.elements]
  ),
  elements = rep(seq_len(n.elements), 4),
  D = as.factor(rep(c(3, 4, 5, 6), each = n.elements))
)

# One panel per dimension. The facet formula is passed positionally because
# the named `facets` argument of facet_grid() is deprecated in ggplot2.
ggplot(sqrt2.binary.df, mapping = aes(x = elements, y = series, group = D, color = D)) +
  xlab("") + ylab("") +
  ggtitle("sqrt2 number") +
  geom_line(position = position_dodge(0.8)) +
  theme_few(base_size = 13, base_family = "serif") +
  facet_grid(D ~ .) +
  scale_y_continuous(breaks = c(0, 1)) +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "none")

As we can see in the graphs above, the larger the embedding dimension, the greater the presence of patterns with repeated elements. When \(D = 3\), sequences with label \(0\) (that is, without repeated elements) predominate. However, as the dimension increases, this behavior is reversed. This occurs because each digit takes one of only ten values, \(\{0, 1, \dots, 9\}\), so the longer the sequence considered, the greater the probability that it contains equal elements. Indeed, if the digits were independent and uniformly distributed, the probability of at least one tie in a sequence of length \(D\) would be \(1 - \frac{10!}{(10-D)!\,10^{D}}\), that is, \(28\%\), \(49.6\%\), \(69.76\%\), and \(84.88\%\) for \(D = 3, 4, 5, 6\), in close agreement with the percentages observed above.

