link : https://rpubs.com/Isaiah-Mireles/1434057
Note :
\[ \rho=\frac{\sigma_{12}}{\sqrt{\sigma_{1}\sigma_{2}}} \]
\[ \implies \]
\[ \sigma_{12}=\rho\sqrt{\sigma_{1}\sigma_{2}} \]
od <- sqrt(1.5*2)*(-.6)
Sigma <-
c(2, od, od, 1.5) |> matrix(nrow=2, ncol=2, byrow=T)
mu <- c(3, 1) |> matrix()
print("mu :")
## [1] "mu :"
mu
## [,1]
## [1,] 3
## [2,] 1
print("Sigma :")
## [1] "Sigma :"
Sigma
## [,1] [,2]
## [1,] 2.00000 -1.03923
## [2,] -1.03923 1.50000
eig <- eigen(Sigma)
eig
## eigen() decomposition
## $values
## [1] 2.8188779 0.6811221
##
## $vectors
## [,1] [,2]
## [1,] -0.7854585 -0.6189143
## [2,] 0.6189143 -0.7854585
Below we see simulated data & the football shape expected of MVN
# simulate MVN
library(MASS)
df <- mvrnorm(n = 1000, mu = mu, Sigma = Sigma)
# center point
mu_hat <- colMeans(df)
# scale vectors by sqrt(eigenvalue)
scale1 <- sqrt(eig$values[1])
scale2 <- sqrt(eig$values[2])
# geneate plt
plot(df, xlab="X1", ylab="X2")
abline(h = mean(df[,2]), col = "red")
abline(v = mean(df[,1]), col = "red")
mdl <- lm(df[,2]~df[,1])
abline(mdl, col="green")
# first eigenvector
arrows(
x0 = mu_hat[1],
y0 = mu_hat[2],
x1 = mu_hat[1] + eig$vectors[1,1] * scale1,
y1 = mu_hat[2] + eig$vectors[2,1] * scale1,
col = "blue",
lwd = 3,
length = 0.12
)
# opposite direction
arrows(
x0 = mu_hat[1],
y0 = mu_hat[2],
x1 = mu_hat[1] - eig$vectors[1,2] * scale2,
y1 = mu_hat[2] - eig$vectors[2,2] * scale2,
col = "purple",
lwd = 3,
length = 0.12
)
Here we draw the regression line (green) onto the data & show the principle components (blue, purple) and triangulate the central value ( \(\bar{x}\), \(\bar{y}\) )
library(ellipse)
e <- ellipse(Sigma, centre = mu, level = .5)
plot(
e,
type = "n",
asp = 1
)
polygon(e, density = 20)
points(mu[1], mu[2], pch = 1, col = "red")
abline(h = mean(df[,2]), col = "red")
abline(v = mean(df[,1]), col = "red")
# first eigenvector
arrows(
x0 = mu_hat[1],
y0 = mu_hat[2],
x1 = mu_hat[1] + eig$vectors[1,1] * scale1,
y1 = mu_hat[2] + eig$vectors[2,1] * scale1,
col = "blue",
lwd = 3,
length = 0.12
)
# opposite direction
arrows(
x0 = mu_hat[1],
y0 = mu_hat[2],
x1 = mu_hat[1] - eig$vectors[1,2] * scale2,
y1 = mu_hat[2] - eig$vectors[2,2] * scale2,
col = "purple",
lwd = 3,
length = 0.12
)
library(car)
dataEllipse(df[,1], df[,2], levels=c(.5), fill = T, pch = 1, col=c("#00000080", "red"), center.pch = F, xlab="X1",ylab="X2")
Notice both are multiplied by a constant, of which we can ignore
\[ C=\frac{1}{(2\pi)^{np/2}|\sum|^{n=2}} \]
We can instead focus on whats in the exponential.
Let :
\[ X^{(i)}-\mu = (X^{(i)}-\bar X)+(\bar X-\mu)\\ \implies \]
\[ \sum_{i=1}^n (X^{(i)}-\mu)^T\Sigma^{-1}(X^{(i)}-\mu) = \sum_{i=1}^n \left[(X^{(i)}-\bar X)+(\bar X-\mu)\right]^T \Sigma^{-1} \left[(X^{(i)}-\bar X)+(\bar X-\mu)\right] \]
expand :
\[ \text{Let } a_i=X^{(i)}-\bar X \]
\[ \text{and} \]
\[ b=\bar X-\mu. \]
\[ X^{(i)}-\mu=a_i+b \]
so:
\[ \sum_{i=1}^n (X^{(i)}-\mu)^T\Sigma^{-1}(X^{(i)}-\mu) = \sum_{i=1}^n (a_i+b)^T\Sigma^{-1}(a_i+b) \]
\[ (a_i+b)^T\Sigma^{-1}(a_i+b) = (a_i^T+b^T)\Sigma^{-1}(a_i+b) \]
\[ (a_i^T+b^T)\Sigma^{-1}(a_i+b) = (a_i^T+b^T) \left(\Sigma^{-1}a_i+\Sigma^{-1}b\right) \]
\[ (a_i^T+b^T) \left(\Sigma^{-1}a_i+\Sigma^{-1}b\right) = a_i^T\Sigma^{-1}a_i + a_i^T\Sigma^{-1}b + b^T\Sigma^{-1}a_i + b^T\Sigma^{-1}b \]
Recall \(\sum^T=\sum\) and, \((\sum^{-1})^T=\sum^{-1}\), therefore :
\[ (a_i^T\Sigma^{-1}b)^{T}=b^{T}\Sigma^{-1}a \]
meaning:
\[ a_i^T\Sigma^{-1}b = b^T\Sigma^{-1}a_i \\ \implies \]
\[ (a_i+b)^T\Sigma^{-1}(a_i+b) = a_i^T\Sigma^{-1}a_i + 2a_i^T\Sigma^{-1}b + b^T\Sigma^{-1}b \]
\[ \begin{aligned} &\left[(X^{(i)}-\bar X)+(\bar X-\mu)\right]^T \Sigma^{-1} \left[(X^{(i)}-\bar X)+(\bar X-\mu)\right] \\ &= (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) + 2(X^{(i)}-\bar X)^T\Sigma^{-1}(\bar X-\mu) + (\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) \end{aligned} \]
Split sums:
\[ \begin{aligned} &\sum_{i=1}^n \left[(X^{(i)}-\bar X)+(\bar X-\mu)\right]^T \Sigma^{-1} \left[(X^{(i)}-\bar X)+(\bar X-\mu)\right] \\ &= \sum_{i=1}^n (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) \\ &\quad+ 2\sum_{i=1}^n (X^{(i)}-\bar X)^T\Sigma^{-1}(\bar X-\mu) \\ &\quad+ \sum_{i=1}^n (\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) \end{aligned} \]
and,
\[ 2\sum_{i=1}^n (X^{(i)}-\bar X)^T\Sigma^{-1}(\bar X-\mu) = 2\left[\sum_{i=1}^n(X^{(i)}-\bar X)\right]^T \Sigma^{-1}(\bar X-\mu) \]
\[ \sum_{i=1}^n(X^{(i)}-\bar X) = \sum_{i=1}^nX^{(i)} - \sum_{i=1}^n\bar X \]
except notice we are summing a scalar n times :
\[ \sum_{i=1}^n\bar X=n\bar X \]
\[ \bar X=\frac1n\sum_{i=1}^nX^{(i)} \]
\[ n\bar X=\sum_{i=1}^nX^{(i)} \]
\[ \sum_{i=1}^n(X^{(i)}-\bar X) = \sum_{i=1}^nX^{(i)} - \sum_{i=1}^nX^{(i)} = 0 \]
\[ 2\left[\sum_{i=1}^n(X^{(i)}-\bar X)\right]^T \Sigma^{-1}(\bar X-\mu) = 0 \]
therefore we can negate those sums which sum to 0. And end with:
\[ \sum_{i=1}^n (\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) = n(\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) \]
\[ \sum_{i=1}^n (X^{(i)}-\mu)^T\Sigma^{-1}(X^{(i)}-\mu) = \sum_{i=1}^n (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) + n(\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) \]
Notice this is a scalar and so the trace of it is still it:
\[ (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) = \operatorname{tr} \left[ (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) \right] \]
We can use the cyclical property of trace :
\[ \operatorname{tr} \left[ (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) \right] = \operatorname{tr} \left[ \Sigma^{-1}(X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] \]
\[ \sum_{i=1}^n (X^{(i)}-\bar X)^T\Sigma^{-1}(X^{(i)}-\bar X) = \sum_{i=1}^n \operatorname{tr} \left[ \Sigma^{-1}(X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] \]
\[ \sum_{i=1}^n \operatorname{tr} \left[ \Sigma^{-1}(X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] = \operatorname{tr} \left[ \sum_{i=1}^n \Sigma^{-1}(X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] \]
\[ \operatorname{tr} \left[ \sum_{i=1}^n \Sigma^{-1}(X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] = \operatorname{tr} \left[ \Sigma^{-1} \sum_{i=1}^n (X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] \]
therefore!
\[ \sum_{i=1}^n (X^{(i)}-\mu)^T\Sigma^{-1}(X^{(i)}-\mu) = \operatorname{tr} \left[ \Sigma^{-1} \sum_{i=1}^n (X^{(i)}-\bar X)(X^{(i)}-\bar X)^T \right] + n(\bar X-\mu)^T\Sigma^{-1}(\bar X-\mu) \]
x1 <- c(1,1,3,3,4,5,5,7,9,10,13)
x2 <- c(17.95, 19, 17.85, 16.5, 14.1, 12.15, 11, 9.35, 6, 4.15, 2.99)
df <- data.frame(x1=x1,x2=x2)
stat_dist <-
mahalanobis(
x = df,
center = colMeans(df),
cov = var(df)
)
stat_dist
## [1] 1.6457872 1.5278722 3.5264271 0.9103886 0.1662493 0.2185317 1.8649088
## [8] 0.2630483 1.2769104 2.3281935 6.2716829
Before we use chi-sqr to determine central 50% lets do some visual inspection :
dataEllipse(df[,1], df[,2], levels=c(.5), fill = T, pch = 1, col=c("#00000080", "red"), center.pch = F, xlab="X1",ylab="X2")
cutoff <- qchisq(0.50, df = ncol(df))
within_50 <- stat_dist <= cutoff
proportion <- mean(within_50)
proportion
## [1] 0.4545455
car::qqPlot(stat_dist, dist = "chisq", df = ncol(df))
## [1] 11 3
site : https://archive.ics.uci.edu/dataset/583/chemical+composition+of+ceramic+samples
Ceramic.Name: name of ceramic types from Longquan and Jindgezhen Part: a binary categorical variable (‘Body’ or ‘Glaze’) Na2O: percentage of Na2O (wt%) MgO: percentage of MgO (wt%) Al2O3: percentage of AI2O3 (wt%) SiO2: percentage of SiO2 (wt%) K2O: percentage of K2O (wt%) CaO: percentage of CaO (wt%) TiO2: percentage of TiO2 (wt%) Fe2O3: percentage of Fe2O3 (wt%) MnO: percentage of MnO (ppm) CuO: percentage of CuO (ppm) ZnO: percentage of ZnO (ppm) PbO2: percentage of PbO2 (ppm) Rb2O: percentage of Rb2O (ppm) SrO: percentage of SrO (ppm) Y2O3: percentage of Y2O3 (ppm) ZrO2: percentage of ZrO2 (ppm) P2O5: percentage of P2O5 (ppm)
Focus-variables :
df <- read.csv("Chemical Composion of Ceramic.csv")
library(dplyr)
num <- df |> select(MnO, CuO, ZrO2, P2O5)
hist(num$MnO)
qqnorm(num$MnO)
qqline(num$MnO, col = "red")
hist(num$CuO)
qqnorm(num$CuO)
qqline(num$CuO, col = "red")
skewwed, non-normal
Closer to normality
hist(num$ZrO2)
qqnorm(num$ZrO2)
qqline(num$ZrO2, col = "red")
skewwed, non-normal
Closer to normality
hist(num$P2O5)
qqnorm(num$P2O5)
qqline(num$P2O5, col = "red")
library(GGally)
ggpairs(num)
Lets take a step back and remember “Statistical Distance” :
\[ (x-\mu)'\Sigma(x-\mu) \]
Which we know, that if \(X\sim N_p(\vec{\mu}, \Sigma)\) then :
\[ (x-\mu)'\Sigma(x-\mu) \sim X^2_{p}(\alpha) \]
xbar <- colMeans(num)
S <- cov(num) # est cov
S_inv <- solve(S) # est of cov-inc
num |> head()
?mahalanobis
stat_dist <- mahalanobis(
x = num,
center = xbar,
cov = S
)
p <- ncol(num)
car::qqPlot(stat_dist, dist = "chisq", df = p)
## [1] 52 43
notice up to the 7th-8th quantile it looks good, then beyond that, its above. Indicating for statistical-distances above about 5, our dist deviates from MVN
outliers <-
cbind(num, stat_dist) |> arrange(desc(stat_dist)) |> mutate(p_val = 1-pchisq(stat_dist, 4)) |> mutate(outlier = p_val<=.05)
outliers |> filter(outlier==T)
new_data <- outliers |> filter(outlier==F)
car::qqPlot(new_data$stat_dist, dist = "chisq", df = p)
## [1] 1 2
this is the new data without the outliers at a 5% significance
Much better improvement
library(GGally)
ggpairs(new_data[,1:4])
library(ICSNP)
data(LASERI)
LASERI <- LASERI |> select(HRT1:HRT4)
LASERI |> head()
ggpairs(LASERI)
stat_dist <-
mahalanobis(
x = LASERI,
center = colMeans(LASERI),
cov = var(LASERI)
)
car::qqPlot(stat_dist, dist = "chisq", df = ncol(LASERI))
## [1] 219 66
cbind(LASERI, stat_dist) |> arrange(desc(stat_dist)) |> mutate(p_val = 1-pchisq(stat_dist, ncol(LASERI))) |> mutate(outlier = p_val<=.05) |> head()
outliers |> filter(outlier==T)
library(ggplot2)
ci_df <- data.frame(
Variable = c("HRT1", "HRT2", "HRT3"),
Mean = c(
mean(LASERI$HRT1),
mean(LASERI$HRT2),
mean(LASERI$HRT3)
),
Lower = c(
t.test(LASERI$HRT1)$conf.int[1],
t.test(LASERI$HRT2)$conf.int[1],
t.test(LASERI$HRT3)$conf.int[1]
),
Upper = c(
t.test(LASERI$HRT1)$conf.int[2],
t.test(LASERI$HRT2)$conf.int[2],
t.test(LASERI$HRT3)$conf.int[2]
)
)
ggplot(ci_df,
aes(x = Variable, y = Mean)) +
geom_point(size = 3) +
geom_errorbar(
aes(ymin = Lower, ymax = Upper),
width = 0.2
) +
labs(
title = "95% CI -- Heart Rate Means",
y = "Heart Rate"
)
hrt_diff <-
data.frame(
d12 = LASERI$HRT1 - LASERI$HRT2,
d13 = LASERI$HRT1 - LASERI$HRT3,
d14 = LASERI$HRT1 - LASERI$HRT4
)
HotellingsT2(hrt_diff, mu = c(0,0,0))
##
## Hotelling's one sample T2-test
##
## data: hrt_diff
## T.2 = 326.19, df1 = 3, df2 = 220, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to c(0,0,0)
Here we take the difference with HRT1, between each and see if they are different from (0,0,0) – two-sided test.