library(readr)
Warning: package 'readr' was built under R version 4.3.3
nutrient<-read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/nutrient.csv")
library(dplyr)
N <-count(nutrient)
mean_cal <- mean(nutrient$calcium)
std_cal <- sd(nutrient$calcium)
min_cal <- min(nutrient$calcium)
max_cal <- max(nutrient$calcium)
summary_df <- data.frame(Variable = "calcium",
N = N,
Mean = mean_cal,
StdDev = std_cal,
Minimum = min_cal,
Maximum = max_cal)
mean_ir <- mean(nutrient$iron)
std_ir <- sd(nutrient$iron)
min_ir <- min(nutrient$iron)
max_ir <- max(nutrient$iron)
summary_df2<- data.frame(Variable = "iron",
N = N,
Mean = mean_ir,
StdDev = std_ir,
Minimum = min_ir,
Maximum = max_ir)
mean_pr <- mean(nutrient$protein)
std_pr <- sd(nutrient$protein)
min_pr <- min(nutrient$protein)
max_pr <- max(nutrient$protein)
summary_df3<- data.frame(Variable = "protein",
N = N,
Mean = mean_pr,
StdDev = std_pr,
Minimum = min_pr,
Maximum = max_pr)
mean_a <- mean(nutrient$vitamin.A)
std_a <- sd(nutrient$vitamin.A)
min_a <- min(nutrient$vitamin.A)
max_a <- max(nutrient$vitamin.A)
summary_df4<- data.frame(Variable = "a",
N = N,
Mean = mean_a,
StdDev = std_a,
Minimum = min_a,
Maximum = max_a)
mean_c <- mean(nutrient$vitamin.C)
std_c <- sd(nutrient$vitamin.C)
min_c <- min(nutrient$vitamin.C)
max_c <- max(nutrient$vitamin.C)
summary_df5<- data.frame(Variable = "c",
N = N,
Mean = mean_c,
StdDev = std_c,
Minimum = min_c,
Maximum = max_c)
rbind(summary_df,summary_df2,summary_df3,summary_df4,summary_df5)
Variable n Mean StdDev Minimum Maximum
1 calcium 737 624.04925 397.27754 7.44 2866.440
2 iron 737 11.12990 5.98419 0.00 58.668
3 protein 737 65.80344 30.57576 0.00 251.012
4 a 737 839.63535 1633.53983 0.00 34434.270
5 c 737 78.92845 73.59527 0.00 433.339
nut <- select(nutrient, calcium, iron, protein, vitamin.A, vitamin.C)
round(cov(nut),1)
calcium iron protein vitamin.A vitamin.C
calcium 157829.4 940.1 6075.8 102411.1 6701.6
iron 940.1 35.8 114.1 2383.2 137.7
protein 6075.8 114.1 934.9 7330.1 477.2
vitamin.A 102411.1 2383.2 7330.1 2668452.4 22063.2
vitamin.C 6701.6 137.7 477.2 22063.2 5416.3
round(cor(nut),3)
calcium iron protein vitamin.A vitamin.C
calcium 1.000 0.395 0.500 0.158 0.229
iron 0.395 1.000 0.623 0.244 0.313
protein 0.500 0.623 1.000 0.147 0.212
vitamin.A 0.158 0.244 0.147 1.000 0.184
vitamin.C 0.229 0.313 0.212 0.184 1.000
df1<-data.frame(Variable = "calcium",
Variance = var(nutrient$calcium))
df2<-data.frame(Variable = "iron",
Variance = var(nutrient$iron))
df3<-data.frame(Variable = "protein",
Variance = var(nutrient$protein))
df4<-data.frame(Variable = "Vitamin A",
Variance = var(nutrient$vitamin.A))
df5<-data.frame(Variable = "Vitamin C",
Variance = var(nutrient$vitamin.C))
df6<-data.frame(Variable = "Total",
Variance = (var(nutrient$calcium)+var(nutrient$iron)+var(nutrient$protein)+var(nutrient$vitamin.A)+var(nutrient$vitamin.C)))
rbind(df1,df2,df3,df4,df5,df6)
Variable Variance
1 calcium 1.578294e+05
2 iron 3.581054e+01
3 protein 9.348769e+02
4 Vitamin A 2.668452e+06
5 Vitamin C 5.416264e+03
6 Total 2.832669e+06
gen_variance <- det(cov(nut))
gen_variance
[1] 2.831042e+19
df1<-data.frame(Variable = "calcium",
Mean = mean(nutrient$calcium))
df2<-data.frame(Variable = "iron",
Mean = mean(nutrient$iron))
df3<-data.frame(Variable = "protein",
Mean = mean(nutrient$protein))
df4<-data.frame(Variable = "Vitamin A",
Mean = mean(nutrient$vitamin.A))
df5<-data.frame(Variable = "Vitamin C",
Mean = mean(nutrient$vitamin.C))
rbind(df1,df2,df3,df4,df5)
Variable Mean
1 calcium 624.04925
2 iron 11.12990
3 protein 65.80344
4 Vitamin A 839.63535
5 Vitamin C 78.92845
round(cov(nut),1)
calcium iron protein vitamin.A vitamin.C
calcium 157829.4 940.1 6075.8 102411.1 6701.6
iron 940.1 35.8 114.1 2383.2 137.7
protein 6075.8 114.1 934.9 7330.1 477.2
vitamin.A 102411.1 2383.2 7330.1 2668452.4 22063.2
vitamin.C 6701.6 137.7 477.2 22063.2 5416.3
\(s_y^2=\)
round(((0.001)^2)*var(nutrient$vitamin.A) + var(nutrient$vitamin.C)+ 2*0.001*cov(nutrient$vitamin.A,nutrient$vitamin.C),1)
[1] 5463.1
L_calciu <-log(nutrient$calcium)
S_calciu <- nutrient$calcium**.5
S_S_calc <- nutrient$calcium**.25
L_iron <- log(nutrient$iron)
S_S_iron <- nutrient$iron**.25
L_prot <- log(nutrient$protein);
S_S_prot <- nutrient$protein**.25;
S_S_a = nutrient$vitamin.A**.25;
S_S_c = nutrient$vitamin.C**.25;
Univariate Cases
hist(nutrient$calcium, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of Calcium")
hist(L_calciu, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of L_calciu")
hist(S_calciu, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of S_calciu")
hist(S_S_calc, breaks = 20, col = "skyblue", xlab = "calcium", ylab = "Percent", main = "Distribution of S_S_calc")
Bivariate Cases
plot(nutrient$iron, nutrient$calcium, main = "Scatter Plot of calcium vs iron", xlab = "iron", ylab = "calcium", col = "blue", pch = 16, xlim = c(0, 60), ylim = c(0, 3000))
plot(S_S_iron,S_S_calc, main = "Scatter Plot of S_S_calc vs S_S_iron", xlab = "S_S_iron", ylab = "S_S_calc", col = "blue", pch = 16, xlim = c(0, 3), ylim = c(1, 8))
Trivariate Cases
library(scatterplot3d)
scatterplot3d(nutrient$iron,nutrient$protein,nutrient$calcium,
main = "3D Scatter Plot of calcium by iron and protein",
xlab = "iron",
ylab = "protein",
zlab = "calcium",
color = "blue",
type = "h",
pch = 16,
xlim = c(0, 60), ylim = c(0, 250), zlim=c(7, 2900)
)
Multivariate Cases
pairs(~S_S_calc + S_S_iron + S_S_prot + S_S_a + S_S_c, data = nutrient,
main = "Scatterplot Matrix",
pch = 19,
col = "blue",
cex = 0.1)
boards <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/boardstiffness.csv", skip = 1, header = FALSE)
names(boards) <- c("x1", "x2", "x3", "x4")
pc_result <- prcomp(boards, scale. = TRUE)
scores <- pc_result$x
mahal_distances <- mahalanobis(scores, colMeans(scores), cov(scores))
n <- nrow(boards)
plot_data <- data.frame(
dist2 = mahal_distances,
prb = (seq_len(n) - 0.5) / n
)
plot_data$chiquant <- qchisq(plot_data$prb, df = 4)
# Using base R
plot(plot_data$chiquant, sort(plot_data$dist2), xlab = "chiquant", ylab = "dist2", main = "SAS Plot of the Mahbalanobis Distance", pch = 3)
## Example 4-4: Wechsler Adult Intelligence Scale
wechsler <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/wechsler.csv")
df1<-data.frame(Variable = "Information",
Mean = mean(wechsler$info))
df2<-data.frame(Variable = "Similarities",
Mean = mean(wechsler$sim))
df3<-data.frame(Variable = "Arithmetic",
Mean = mean(wechsler$arith))
df4<-data.frame(Variable = "Picture Completion",
Mean = mean(wechsler$pict))
rbind(df1,df2,df3,df4)
Variable Mean
1 Information 12.567568
2 Similarities 9.567568
3 Arithmetic 11.486486
4 Picture Completion 7.972973
library(dplyr)
wech <- select(wechsler, "info", "sim", "arith", "pict")
covariance_matrix <- round(cov(wech), 3)
covariance_matrix
info sim arith pict
info 11.474 9.086 6.383 2.071
sim 9.086 12.086 5.938 0.544
arith 6.383 5.938 11.090 1.791
pict 2.071 0.544 1.791 3.694
pca_result <- prcomp(~ info + sim + arith + pict, data = wechsler)
eigenvalues <- pca_result$sdev^2
round(eigenvalues,3)
[1] 26.245 6.255 3.932 1.912
pca_result <- prcomp(~ info + sim + arith + pict, data = wechsler)
eigenvectors <- pca_result$rotation
round(eigenvectors,3)
PC1 PC2 PC3 PC4
info 0.606 -0.218 -0.461 0.611
sim 0.605 -0.496 0.320 -0.535
arith 0.505 0.795 0.335 0.035
pict 0.110 0.274 -0.757 -0.582
conf_level <- 0.95
alpha <- qchisq(conf_level, df = 4)
ellipse_lengths <- sqrt(alpha * eigenvalues)
ellipse_lengths
[1] 15.779990 7.703845 6.107496 4.258778
n <- length(eigenvalues)
volume_hyperellipse <- ((2*pi^2) /(4*gamma(2))) *(9.49^2)* sqrt(26.245*6.255*3.932*1.912)
print(volume_hyperellipse)
[1] 15613.12
alpha <- 0.05
p <- 3
n <- 25
f_quantile <- qf(1 - alpha, df1 = p, df2 = n - p)
print(f_quantile)
[1] 3.049125
multiplier<-sqrt((p*(n-1))/(n-p) * f_quantile)
multiplier
[1] 3.158948
mineral <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/mineral.csv")
df1<-data.frame(Variable = "domradius",
Mean = mean(mineral$domradius),
StdDev = sd(mineral$domradius))
df2<-data.frame(Variable = "domhumerus",
Mean = mean(mineral$domhumerus),
StdDev = sd(mineral$domhumerus))
df3<-data.frame(Variable = "domulna",
Mean = mean(mineral$domulna),
StdDev = sd(mineral$domulna))
rbind(df1,df2,df3)
Variable Mean StdDev
1 domradius 0.84380 0.1140245
2 domhumerus 1.79268 0.2834735
3 domulna 0.70440 0.1075566
One at a time intervals
Variable<-noquote(c('domradius','domhumerus','domulna'))
means <- c(mean(mineral$domradius), mean(mineral$domhumerus),mean(mineral$domulna))
ses <- c(sd(mineral$domradius), sd(mineral$domhumerus), sd(mineral$domulna))
confidence_level <- 0.95
df <- n-1
t_crit <- qt((1 - confidence_level) / 2 + confidence_level, df)
ci_widths <- t_crit * (ses / sqrt(n))
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis
Variable lower_bounds upper_bounds
1 domradius 0.7967330 0.8908670
2 domhumerus 1.6756679 1.9096921
3 domulna 0.6600028 0.7487972
Bonferroni Method
m <- length(means)
confidence_level <- 0.95
alpha_adj <- (1 - confidence_level)/m
df <- df
t_crit <- qt(1-alpha_adj/2, df)
ci_widths <- t_crit * (ses / sqrt(n))
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis
Variable lower_bounds upper_bounds
1 domradius 0.7851084 0.9024916
2 domhumerus 1.6467682 1.9385918
3 domulna 0.6490376 0.7597624
Simultaneous Confidence Region Method
ci_widths <- multiplier * ses/sqrt(n)
lower_bounds <- means - ci_widths
upper_bounds <- means + ci_widths
cis <- data.frame(Variable,lower_bounds, upper_bounds)
cis
Variable lower_bounds upper_bounds
1 domradius 0.7717605 0.9158395
2 domhumerus 1.6135844 1.9717756
3 domulna 0.6364469 0.7723531
Variable<-noquote(c('domhumerus', 'domradius','domulna'))
var<- c( var(mineral$domhumerus),var(mineral$domradius), var(mineral$domulna))
means <- c( mean(mineral$domhumerus),mean(mineral$domradius),mean(mineral$domulna))
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab
Variable n means var t1 tb f loone
1 domhumerus 25 1.79268 0.08035723 2.063899 2.573641 3.049125 1.6756679
2 domradius 25 0.84380 0.01300158 2.063899 2.573641 3.049125 0.7967330
3 domulna 25 0.70440 0.01156842 2.063899 2.573641 3.049125 0.6600028
upone lobon upbon losim upsim
1 1.9096921 1.6467682 1.9385918 1.6135844 1.9717756
2 0.8908670 0.7851084 0.9024916 0.7717605 0.9158395
3 0.7487972 0.6490376 0.7597624 0.6364469 0.7723531
cor_matrix <- cor(wechsler[, c("info", "sim", "arith", "pict")])
cor_matrix
info sim arith pict
info 1.0000000 0.77152974 0.5658260 0.31816370
sim 0.7715297 1.00000000 0.5129451 0.08135234
arith 0.5658260 0.51294508 1.0000000 0.27987766
pict 0.3181637 0.08135234 0.2798777 1.00000000
n<-nrow(wechsler)
test_statistic <- cor(wechsler$info,wechsler$sim) * sqrt(n - 2) / sqrt(1 - cor(wechsler$info,wechsler$sim)^2)
test_statistic
[1] 7.174599
fisher_transformation <- 0.5 * log((1 + cor(wechsler$info,wechsler$sim)) / (1 - cor(wechsler$info,wechsler$sim)))
fisher_transformation
[1] 1.024096
standard_error <- 1 / sqrt(n - 3)
df <- n - 3
t_crit <- qnorm(1-0.025)
margin_of_error <- t_crit * standard_error
lower_bound <- fisher_transformation - margin_of_error
upper_bound <- fisher_transformation + margin_of_error
ci_lower <- (exp(2 * lower_bound) - 1) / (exp(2 * lower_bound) + 1)
ci_upper <- (exp(2 * upper_bound) - 1) / (exp(2 * upper_bound) + 1)
print(paste("95% Confidence Interval:", ci_lower, "to", ci_upper))
[1] "95% Confidence Interval: 0.596673274426047 to 0.876445796058012"
library(ppcor)
Warning: package 'ppcor' was built under R version 4.3.3
fit_lm <- lm(cbind(info, sim) ~ arith + pict, data = wechsler)
residuals <- residuals(fit_lm)
partial_cor <- pcor(residuals)
r<-partial_cor$estimate[1,2]
r
[1] 0.7118787
n<-nrow(wechsler)
test_statistic <- r * sqrt(n - 2-2) / sqrt(1 - r^2)
test_statistic
[1] 5.822892
fisher_transformation <- 0.5 * log((1 + r) / (1 - r))
fisher_transformation
[1] 0.8909825
standard_error <- 1 / sqrt(n - 3 -2)
df <- n - 3-2
t_crit <- qnorm(1-0.025)
margin_of_error <- t_crit * standard_error
lower_bound <- fisher_transformation - margin_of_error
upper_bound <- fisher_transformation + margin_of_error
ci_lower <- (exp(2 * lower_bound) - 1) / (exp(2 * lower_bound) + 1)
ci_upper <- (exp(2 * upper_bound) - 1) / (exp(2 * upper_bound) + 1)
print(paste("95% Confidence Interval:", ci_lower, "to", ci_upper))
[1] "95% Confidence Interval: 0.496391667333479 to 0.844729175475622"
library(MASS)
hotel <- function(x, mu0) {
one <- matrix(1, nrow = nrow(x), ncol = 1)
ident <- diag(nrow(x))
ybar <- t(x) %*% one / nrow(x)
s <- t(x) %*% (ident - one %*% t(one) / nrow(x)) %*% x / (nrow(x) - 1)
t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
df1 <- ncol(x)
df2 <- nrow(x) - ncol(x)
p <- 1 - pf(f, df1, df2)
result <- list(t2 = t2, f = f, df1 = df1, df2 = df2, p = p)
return(result)
}
mu0 <- c(1000, 15, 60, 800, 75)
x <- as.matrix(nutrient[, c("calcium", "iron", "protein", "vitamin.A", "vitamin.C")])
result <- hotel(x, mu0)
rbind(result$t2, result$f, result$df1, result$df2, result$p)
[,1]
[1,] 1758.5413
[2,] 349.7968
[3,] 5.0000
[4,] 732.0000
[5,] 0.0000
calcium <- nutrient$calcium
conf_interval_cal <- t.test(calcium)$conf.int
conf_interval_cal
[1] 595.3201 652.7784
attr(,"conf.level")
[1] 0.95
variable<-noquote(c("Calcium","Iron","Protein","Vitamin A","Vitamin C"))
calcium <- nutrient$calcium
iron<- nutrient$iron
protein<-nutrient$protein
vitamin.A<-nutrient$vitamin.A
vitamin.C<-nutrient$vitamin.C
conf_interval_cal <- t.test(calcium)$conf.int
conf_interval_ir<-t.test(iron)$conf.int
conf_interval_pr<-t.test(protein)$conf.int
conf_interval_a<-t.test(vitamin.A)$conf.int
conf_interval_c<-t.test(vitamin.C)$conf.int
tab<-data.frame(conf_interval_cal,conf_interval_ir,conf_interval_pr,conf_interval_a,conf_interval_c)
tab
conf_interval_cal conf_interval_ir conf_interval_pr conf_interval_a
1 595.3201 10.69715 63.59235 721.5057
2 652.7784 11.56265 68.01453 957.7650
conf_interval_c
1 73.6064
2 84.2505
Simultaneous confidence intervals are given by the columns for “losim” and “upsim” while for Bonferroni intervals are given by “lobon” and “upbon”
Variable<-noquote(c("Calcium","Iron","Protein","Vitamin A","Vitamin C"))
var<-c(var(calcium),var(iron),var(protein),var(vitamin.A),var(vitamin.C))
means <-c(mean(calcium),mean(iron),mean(protein),mean(vitamin.A),mean(vitamin.C))
p<-5
n<-nrow(nutrient)
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab
Variable n means var t1 tb f loone
1 Calcium 737 624.04925 1.578294e+05 1.963192 2.582526 2.22634 595.32008
2 Iron 737 11.12990 3.581054e+01 1.963192 2.582526 2.22634 10.69715
3 Protein 737 65.80344 9.348769e+02 1.963192 2.582526 2.22634 63.59235
4 Vitamin A 737 839.63535 2.668452e+06 1.963192 2.582526 2.22634 721.50572
5 Vitamin C 737 78.92845 5.416264e+03 1.963192 2.582526 2.22634 73.60640
upone lobon upbon losim upsim
1 652.77843 586.25681 661.84169 575.09118 673.00733
2 11.56265 10.56063 11.69917 10.39244 11.86735
3 68.01453 62.89481 68.71207 62.03547 69.57141
4 957.76498 684.23906 995.03163 638.32780 1040.94289
5 84.25050 71.92743 85.92946 69.85901 87.99788
library(MASS)
hotel <- function(x) {
mu0 <- rep(0, ncol(x))
one <- rep(1, nrow(x))
ident <- diag(nrow(x))
ybar <- t(x) %*% one / nrow(x)
s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
print(mu0)
print(ybar)
print(s)
t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
df1 <- ncol(x)
df2 <- nrow(x) - ncol(x)
p <- 1 - pf(f, df1, df2)
print(t2)
print(f)
print(df1)
print(df2)
print(p)
}
spouse <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/spouse.csv")
d1<- spouse$h1 - spouse$w1
d2<- spouse$h2 - spouse$w2
d3<- spouse$h3 - spouse$w3
d4<- spouse$h4 - spouse$w4
diffs <- data.frame(d1, d2, d3, d4)
x <- as.matrix(diffs)
hotel(x)
[1] 0 0 0 0
[,1]
d1 0.06666667
d2 -0.13333333
d3 -0.30000000
d4 -0.13333333
d1 d2 d3 d4
d1 0.82298851 0.07816092 -0.0137931 -0.05977011
d2 0.07816092 0.80919540 -0.2137931 -0.15632184
d3 -0.01379310 -0.21379310 0.5620690 0.51034483
d4 -0.05977011 -0.15632184 0.5103448 0.60229885
[,1]
[1,] 13.12784
[,1]
[1,] 2.942447
[1] 4
[1] 26
[,1]
[1,] 0.03936914
In this output losim and upsim give the lower and upper bounds for the simultaneous intervals, and lobon and upbon give the lower and upper bounds for the Bonferroni interval
Variable<-noquote(c("d1","d2","d3","d4"))
var<-c(var(d1),var(d2),var(d3),var(d4))
means <-c(mean(d1),mean(d2),mean(d3),mean(d4))
p<-4
n<-nrow(diffs)
t1 <- qt(1 - 0.025, n - 1)
tb <- qt(1 - 0.025 / p, n - 1)
f <- qf(0.95, p, n - p)
loone <- means - t1 * sqrt(var / n)
upone <- means + t1 * sqrt(var / n)
lobon <- means - tb * sqrt(var / n)
upbon <- means + tb * sqrt(var / n)
losim <- means - sqrt(p * (n - 1) * f * var / (n - p) / n)
upsim <- means + sqrt(p * (n - 1) * f * var / (n - p) / n)
tab <- data.frame(Variable,n,means,var,t1,tb,f,loone,upone,lobon,upbon,losim,upsim)
tab
Variable n means var t1 tb f loone
1 d1 30 0.06666667 0.8229885 2.04523 2.663196 2.742594 -0.2720826
2 d2 30 -0.13333333 0.8091954 2.04523 2.663196 2.742594 -0.4692319
3 d3 30 -0.30000000 0.5620690 2.04523 2.663196 2.742594 -0.5799473
4 d4 30 -0.13333333 0.6022989 2.04523 2.663196 2.742594 -0.4231261
upone lobon upbon losim upsim
1 0.40541591 -0.3744357 0.50776899 -0.5127078 0.6460411
2 0.20256524 -0.5707236 0.30405698 -0.7078322 0.4411655
3 -0.02005272 -0.6645333 0.06453334 -0.7788034 0.1788034
4 0.15645938 -0.5106869 0.24402025 -0.6289758 0.3623091
library(MASS)
hotel <- function(x) {
mu0 <- rep(0, ncol(x))
one <- rep(1, nrow(x))
ident <- diag(nrow(x))
ybar <- t(x) %*% one / nrow(x)
s <- t(x) %*% ((ident - one %*% t(one) / nrow(x)) %*% x) / (nrow(x) - 1)
print(mu0)
print(ybar)
print(s)
t2 <- nrow(x) * t(ybar - mu0) %*% solve(s) %*% (ybar - mu0)
f <- (nrow(x) - ncol(x)) * t2 / ncol(x) / (nrow(x) - 1)
df1 <- ncol(x)
df2 <- nrow(x) - ncol(x)
p <- 1 - pf(f, df1, df2)
print(t2)
print(f)
print(df1)
print(df2)
print(p)
}
spouse <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/spouse.csv")
d1<- spouse$h1 - spouse$w2
d2<- spouse$h2 - spouse$w1
d3<- spouse$h3 - spouse$w4
d4<- spouse$h4 - spouse$w3
diffs <- data.frame(d1, d2, d3, d4)
x <- as.matrix(diffs)
hotel(x)
[1] 0 0 0 0
[,1]
d1 -0.2000000
d2 0.1333333
d3 -0.2000000
d4 -0.2333333
d1 d2 d3 d4
d1 0.7862069 0.13103448 -0.14482759 -0.18620690
d2 0.1310345 0.74022989 -0.04137931 -0.07126437
d3 -0.1448276 -0.04137931 0.51034483 0.50344828
d4 -0.1862069 -0.07126437 0.50344828 0.66781609
[,1]
[1,] 6.426621
[,1]
[1,] 1.44045
[1] 4
[1] 26
[,1]
[1,] 0.2488886
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
swissgenuine<-swiss%>%
filter(type =="real")
swisscounterfeit<-swiss%>%
filter(type =="fake")
df1<-data.frame(Variable = "Length",
Genuine = mean(swissgenuine$length),
Counterfeit=mean(swisscounterfeit$length))
df2<-data.frame(Variable = "Left Width",
Genuine = mean(swissgenuine$left),
Counterfeit=mean(swisscounterfeit$left))
df3<-data.frame(Variable = "Right Width",
Genuine = mean(swissgenuine$right),
Counterfeit=mean(swisscounterfeit$right))
df4<-data.frame(Variable = "Bottom Margin",
Genuine = mean(swissgenuine$bottom),
Counterfeit=mean(swisscounterfeit$bottom))
df5<-data.frame(Variable = "Top Margin",
Genuine = mean(swissgenuine$top),
Counterfeit=mean(swisscounterfeit$top))
df6<-data.frame(Variable = "Diagonal",
Genuine = mean(swissgenuine$diag),
Counterfeit= mean(swisscounterfeit$diag))
rbind(df1,df2,df3,df4,df5,df6)
Variable Genuine Counterfeit
1 Length 214.969 214.823
2 Left Width 129.943 130.300
3 Right Width 129.720 130.193
4 Bottom Margin 8.305 10.530
5 Top Margin 10.168 11.133
6 Diagonal 141.517 139.450
library(MASS)
hotel2 <- function(x1, x2) {
n1 <- nrow(x1)
n2 <- nrow(x2)
k <- ncol(x1)
one1 <- matrix(1, nrow = n1, ncol = 1)
one2 <- matrix(1, nrow = n2, ncol = 1)
ident1 <- diag(n1)
ident2 <- diag(n2)
ybar1 <- t(x1) %*% one1 / n1
s1 <- t(x1) %*% ((ident1 - one1 %*% t(one1) / n1) %*% x1) / (n1 - 1.0)
print(round(s1,4)) ##sample variance-covariance matrix for the real or genuine notes
ybar2 <- t(x2) %*% one2 / n2
s2 <- t(x2) %*% ((ident2 - one2 %*% t(one2) / n2) %*% x2) / (n2 - 1.0)
print(round(s2,4)) ##sample variance-covariance for the second sample of notes
spool <- ((n1 - 1.0) * s1 + (n2 - 1.0) * s2) / (n1 + n2 - 2.0)
print(round(spool,4)) ##pooled variance-covariance matrix for the two samples.
t2 <- t(ybar1 - ybar2) %*% solve(spool * (1 / n1 + 1 / n2)) %*% (ybar1 - ybar2)
f <- (n1 + n2 - k - 1) * t2 / k / (n1 + n2 - 2)
df1 <- k
df2 <- n1 + n2 - k - 1
p <- 1 - pf(f, df1, df2)
print(c("t2" = t2, "f" = f, "df1" = df1, "df2" = df2, "p" = p))
}
x1 <- as.matrix(swiss[swiss$type == "real", c("length", "left", "right", "bottom", "top", "diag")])
x2 <- as.matrix(swiss[swiss$type == "fake", c("length", "left", "right", "bottom", "top", "diag")])
hotel2(x1, x2)
length left right bottom top diag
length 0.1502 0.0580 0.0573 0.0571 0.0145 0.0055
left 0.0580 0.1326 0.0859 0.0567 0.0491 -0.0431
right 0.0573 0.0859 0.1263 0.0582 0.0306 -0.0238
bottom 0.0571 0.0567 0.0582 0.4132 -0.2635 -0.0002
top 0.0145 0.0491 0.0306 -0.2635 0.4212 -0.0753
diag 0.0055 -0.0431 -0.0238 -0.0002 -0.0753 0.1998
length left right bottom top diag
length 0.1240 0.0315 0.0240 -0.1006 0.0194 0.0116
left 0.0315 0.0651 0.0468 -0.0240 -0.0119 -0.0051
right 0.0240 0.0468 0.0889 -0.0186 0.0001 0.0342
bottom -0.1006 -0.0240 -0.0186 1.2813 -0.4902 0.2385
top 0.0194 -0.0119 0.0001 -0.4902 0.4045 -0.0221
diag 0.0116 -0.0051 0.0342 0.2385 -0.0221 0.3112
length left right bottom top diag
length 0.1371 0.0448 0.0406 -0.0217 0.0169 0.0085
left 0.0448 0.0988 0.0663 0.0163 0.0186 -0.0241
right 0.0406 0.0663 0.1076 0.0198 0.0154 0.0052
bottom -0.0217 0.0163 0.0198 0.8473 -0.3768 0.1191
top 0.0169 0.0186 0.0154 -0.3768 0.4128 -0.0487
diag 0.0085 -0.0241 0.0052 0.1191 -0.0487 0.2555
t2 f df1 df2 p
2412.4507 391.9217 6.0000 193.0000 0.0000
The bounds of the simultaneous confidence intervals are given in columns losim and upsim. The Bonferroni Corrected (1 - α) x 100% Confidence Intervals are given in columns lobon and upbon
library(dplyr)
library(tidyverse)
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
pop1 <- swiss %>%
filter(type == "real") %>%
pivot_longer(cols = c(length, left, right, bottom, top, diag),
names_to = "variable",
values_to = "x")%>%
group_by(variable) %>%
summarize(n1 = n(),
xbar1 = mean(x),
s21 = var(x))
fake <- swiss %>%
filter(type == "fake") %>%
pivot_longer(cols = c(length, left, right, bottom, top, diag),
names_to = "variable",
values_to = "x")
pop2 <- fake %>%
group_by(variable) %>%
summarize(n2 = n(),
xbar2 = mean(x),
s22 = var(x))
combine <- merge(pop1, pop2, by = "variable")
p <- 6
f <- qf(0.95, p, combine$n1 + combine$n2 - p - 1)
t <- qt(1 - 0.025 / p, combine$n1 + combine$n2 - 2)
sp <- ((combine$n1 - 1) * combine$s21 + (combine$n2 - 1) * combine$s22) / (combine$n1 + combine$n2 - 2)
combine$losim <- combine$xbar1 - combine$xbar2 - sqrt(p * (combine$n1 + combine$n2 - 2) * f * (1/combine$n1 + 1/combine$n2) * sp / (combine$n1 + combine$n2 - p - 1))
combine$upsim <- combine$xbar1 - combine$xbar2 + sqrt(p * (combine$n1 + combine$n2 - 2) * f * (1/combine$n1 + 1/combine$n2) * sp / (combine$n1 + combine$n2 - p - 1))
combine$lobon <- combine$xbar1 - combine$xbar2 - t * sqrt((1/combine$n1 + 1/combine$n2) * sp)
combine$upbon <- combine$xbar1 - combine$xbar2 + t * sqrt((1/combine$n1 + 1/combine$n2) * sp)
cbind(combine, f,t,sp)
variable n1 xbar1 s21 n2 xbar2 s22 losim upsim
1 bottom 100 8.305 0.4132071 100 10.530 1.28131313 -2.69809429 -1.7519057
2 diag 100 141.517 0.1998091 100 139.450 0.31121212 1.80719722 2.3268028
3 left 100 129.943 0.1325768 100 130.300 0.06505051 -0.51856518 -0.1954348
4 length 100 214.969 0.1502414 100 214.823 0.12401111 -0.04432667 0.3363267
5 right 100 129.720 0.1262626 100 130.193 0.08894040 -0.64159649 -0.3044035
6 top 100 10.168 0.4211879 100 11.133 0.40445556 -1.29523310 -0.6347669
lobon upbon f t sp
1 -2.571916424 -1.8780836 2.145801 2.665026 0.84726010
2 1.876488608 2.2575114 2.145801 2.665026 0.25551061
3 -0.475474511 -0.2385255 2.145801 2.665026 0.09881364
4 0.006434909 0.2855651 2.145801 2.665026 0.13712626
5 -0.596630515 -0.3493695 2.145801 2.665026 0.10760152
6 -1.207157404 -0.7228426 2.145801 2.665026 0.41282172
library(dplyr)
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv") # Skip the first row
swiss <- swiss %>%
mutate(
length = length - 215.0,
left = left - 130.0,
right = right - 130.0,
bottom = bottom - 9.0,
top = top - 10.0,
diag = diag - 141.354
) %>%
pivot_longer(
cols = c(length, left, right, bottom, top, diag),
names_to = "variable",
values_to = "x"
)
swiss <- swiss %>%
arrange(type, variable)
a <- swiss %>%
group_by(type, variable) %>%
summarize(xbar = mean(x))
library(ggplot2)
# Create the line plot
ggplot(a, aes(x = variable, y = xbar, group = type, color = type)) +
geom_line() +
labs(title = "Profile Plots", x = "Variable", y = "Mean") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(biotools)
Warning: package 'biotools' was built under R version 4.3.3
swiss <- read_csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
result <- boxM(data = swiss[, -1], grouping = swiss$type)
result
Box's M-test for Homogeneity of Covariance Matrices
data: swiss[, -1]
Chi-Sq (approx.) = 121.9, df = 21, p-value = 3.198e-16
library(Hotelling)
swiss <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/swiss3.csv")
x1 <- subset(swiss, type == "real", select = c(length, left, right, bottom, top, diag))
x2 <- subset(swiss, type == "fake", select = c(length, left, right, bottom, top, diag))
n1 <- nrow(x1)
n2 <- nrow(x2)
k <- ncol(x1)
ybar1 <- colMeans(x1)
ybar2 <- colMeans(x2)
s1 <- cov(x1)
s2 <- cov(x2)
st <- s1 / n1 + s2 / n2
t2 <- t(ybar1 - ybar2) %*% solve(st) %*% (ybar1 - ybar2)
df1 <- k
df2 <- n1 + n2 - k - 1
p_value <- 1 - pf(((n1 + n2 - k - 1) * t2) / (k * (n1 + n2 - 2)), df1, df2)
print(paste("Test statistic:", t2))
[1] "Test statistic: 2412.45068552327"
print(paste("Degrees of freedom:", df1, df2))
[1] "Degrees of freedom: 6 193"
print(paste("p-value:", p_value))
[1] "p-value: 0"
pottery<- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv")
models <- list()
resids <- data.frame(site = pottery$site)
for(response in c("al", "fe", "mg", "ca", "na")) {
formula <- as.formula(paste(response, "~ site"))
# Fit the model
model <- lm(formula, data = pottery)
models[[response]] <- model
resids[[paste0("r", response)]] <- residuals(model)
}
print(head(resids))
site ral rfe rmg rca rna
1 L 1.835714 0.627857143 -0.5264286 -0.052142857 0.25928571
2 L 1.235714 0.707857143 -1.3964286 -0.082142857 -0.08071429
3 L 2.035714 0.717857143 -0.9464286 -0.072142857 -0.05071429
4 L -1.064286 -0.002142857 0.8135714 -0.042142857 -0.11071429
5 L 1.235714 0.687857143 0.5135714 -0.002142857 -0.05071429
6 L -1.664286 -0.112142857 -1.3564286 -0.032142857 -0.03071429
for(variable in c("al", "fe", "mg", "ca", "na")) {
hist(pottery[[variable]],
main = paste("Histogram of", variable),
xlab = variable,
col = "lightblue",
border = "darkblue")
}
pairs(~al + fe + mg + ca + na, data = pottery,
main = "Scatterplot Matrix",
pch = 19,
col = "blue",
cex = 1)
p <- 5
g <- length(unique(pottery$site))
df <- 0.5 * p * (p + 1) * (g - 1)
cov_matrices <- list()
for (site in unique(pottery$site)) {
cov_matrices[[site]] <- cov(subset(pottery, site == site)[, c("al", "fe", "mg", "ca", "na")])
}
l_statistic <- 0
for (i in 1:(g - 1)) {
for (j in (i + 1):g) {
l_statistic <- l_statistic + (sum(diag((cov_matrices[[i]] + cov_matrices[[j]]) %*% solve(cov_matrices[[i]] + cov_matrices[[j]]))) - p)
}
}
l_statistic <- l_statistic * 2
alpha <- 0.05
critical_value <- qchisq(1 - alpha, df)
cat("Test statistic:", l_statistic, "\n")
Test statistic: -2.131628e-14
cat("Degrees of freedom:", df, "\n")
Degrees of freedom: 45
cat("Critical value (", 1 - alpha, "):", critical_value, "\n")
Critical value ( 0.95 ): 61.65623
if (l_statistic > critical_value) {
cat("Reject the null hypothesis: Covariance matrices are not homogeneous.\n")
} else {
cat("Fail to reject the null hypothesis: Covariance matrices are homogeneous.\n")
}
Fail to reject the null hypothesis: Covariance matrices are homogeneous.
outcome <- cbind(pottery$al, pottery$fe, pottery$mg, pottery$ca, pottery$na)
model<- manova(outcome ~ site, data = pottery)
summary(model, intercept = TRUE, test = "Wilks")
Df Wilks approx F num Df den Df Pr(>F)
(Intercept) 1 0.0068354 523.07 5 18.000 < 2.2e-16 ***
site 3 0.0123009 13.09 15 50.091 1.84e-12 ***
Residuals 22
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pottery <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv", stringsAsFactors = TRUE)
library(tidyverse)
pottery_long <- pottery %>%
gather(key = "chemical", value = "amount", al:na)
means <- pottery_long %>%
group_by(site, chemical) %>%
summarize(mean = mean(amount))
library(ggplot2)
ggplot(means, aes(x = chemical, y = mean, group = site, color = site)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Profile Plot for Pottery Data", x = "Chemical", y = "Mean Amount") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# For specific univariate ANOVA results
al<-summary(lm(al ~ site, data = pottery))
fe<-summary(lm(fe ~ site, data = pottery))
mg<-summary(lm(mg ~ site, data = pottery))
ca<-summary(lm(ca ~ site, data = pottery))
na<-summary(lm(na ~ site, data = pottery))
df1<-data.frame(Variable = "Al",
F_value = al$fstatistic[1])
df2<-data.frame(Variable = "Fe",
F_value = fe$fstatistic[1])
df3<-data.frame(Variable = "Mg",
F_value = mg$fstatistic[1])
df4<-data.frame(Variable = "Ca",
F_value = ca$fstatistic[1])
df5<-data.frame(Variable = "Na",
F_value = na$fstatistic[1])
rbind(df1,df2,df3,df4,df5)
Variable F_value
value Al 26.669259
value1 Fe 89.882725
value2 Mg 49.120088
value3 Ca 29.156699
value4 Na 9.502604
library(car)
pottery <- read.csv("D:/SCHOOL/MULTIVARIATE ANALYSIS/DATA/pottery.csv")
pottery$site <- factor(pottery$site)
site_contrasts <- matrix(c(8, -2, 8, -14, # C+L-A-I
1, 0, -1, 0, # A vs I
0, 1, 0, -1), # C vs L
nrow = 3, byrow = TRUE)
rownames(site_contrasts) <- c("C+L-A-I", "A vs I", "C vs L")
colnames(site_contrasts) <- c("C", "L", "A", "I")
print(site_contrasts)
C L A I
C+L-A-I 8 -2 8 -14
A vs I 1 0 -1 0
C vs L 0 1 0 -1