Website :
https://rpubs.com/Isaiah-Mireles/1436183
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
library(faraway)
data(punting)
m1 <- lm(Distance~RStr+LStr+RFlex+LFlex, data=punting)
print("Coef")
## [1] "Coef"
m1$coefficients
## (Intercept) RStr LStr RFlex LFlex
## -79.6236456 0.5116373 -0.1861996 2.3745010 -0.5277338
print("--")
## [1] "--"
print("Covariance Matrix")
## [1] "Covariance Matrix"
summary(m1)$cov.unscaled
## (Intercept) RStr LStr RFlex
## (Intercept) 16.1396579363 0.0002123785 0.0378939380 -0.2123867649
## RStr 0.0002123785 0.0008844524 -0.0006380244 -0.0007371097
## LStr 0.0378939380 -0.0006380244 0.0009870667 -0.0010113169
## RFlex -0.2123867649 -0.0007371097 -0.0010113169 0.0077499755
## LFlex -0.0133860443 0.0003449921 0.0001219639 -0.0030130930
## LFlex
## (Intercept) -0.0133860443
## RStr 0.0003449921
## LStr 0.0001219639
## RFlex -0.0030130930
## LFlex 0.0025563651
confint(m1, level=.8)
## 10 % 90 %
## (Intercept) -171.2456955 11.9984043
## RStr -0.1666131 1.1898877
## LStr -0.9027160 0.5303167
## RFlex 0.3667817 4.3822204
## LFlex -1.6808266 0.6253591
Intercept seems significantly different
from 0 and RFlex is the only other which doesnt contain
0.punting
library(car)
linearHypothesis(m1, "RFlex = LFlex")
\[ H_o : \beta_L=\beta_R \\ H_a : \beta_L\ne\beta_R \]
# doing it manually :
X <- model.matrix(m1)
y <- punting$Distance
bhat <- solve(t(X) %*% X) %*% t(X) %*% y
A <- matrix(c(0, 0, 0, 1, -1), nrow = 1) # b3=b4 or b3-b4 = 0
sigma2hat <- sum(resid(m1)^2) / df.residual(m1)
Fstat <- t(A %*% bhat) %*%
solve(A %*% solve(t(X) %*% X) %*% t(A)) %*%
(A %*% bhat) / (1 * sigma2hat)
Fstat
## [,1]
## [1,] 1.934568
# p-val
pf(Fstat, df1 = 1, df2 = df.residual(m1), lower.tail = FALSE)
## [,1]
## [1,] 0.2017235
citation : Sec. 4.4.2, p. 116, Introduction to Regression Modeling (B. Abraham and J. Ledolter)
paste0("F-Stat = ",summary(m1)$fstatistic[1]|>round(2))
## [1] "F-Stat = 5.59"
Residual standard error: 16.33 on 8 degrees of freedom
Multiple R-squared: 0.7365, Adjusted R-squared: 0.6047
F-statistic: 5.59 on 4 and 8 DF, p-value: 0.01902
p-value: 0.01902)b <- coef(m1)[c("RFlex", "LFlex")]
b
## RFlex LFlex
## 2.3745010 -0.5277338
S <- vcov(m1)[c("RFlex", "LFlex"),
c("RFlex", "LFlex")]
S
## RFlex LFlex
## RFlex 2.0659892 -0.8032306
## LFlex -0.8032306 0.6814760
n <- nrow(punting)
p <- length(coef(m1))
cval <- 2 * qf(0.80, df1 = 2, df2 = n - p)
theta <- seq(0, 2*pi, length.out = 200)
eig <- eigen(S)
A <- eig$vectors %*%
diag(sqrt(eig$values * cval))
circle <- rbind(cos(theta), sin(theta))
ellipse <- t(b + A %*% circle)
ellipse <- as.data.frame(ellipse)
names(ellipse) <- c("RFlex", "LFlex")
library(ggplot2)
library(grid)
# semi-axis lengths
axis_lengths <- sqrt(eig$values * cval)
ggplot(ellipse, aes(RFlex, LFlex)) +
# ellipse
geom_path(color = "blue", linewidth = 1) +
# center point
annotate("point",
x = b[1],
y = b[2],
color = "red",
size = 3) +
# first eigenvector
annotate("segment",
x = b[1],
y = b[2],
xend = b[1] + axis_lengths[1] * eig$vectors[1,1],
yend = b[2] + axis_lengths[1] * eig$vectors[2,1],
color = "darkgreen",
linewidth = 1.2,
arrow = arrow(length = unit(0.2, "cm"))) +
# second eigenvector
annotate("segment",
x = b[1],
y = b[2],
xend = b[1] + axis_lengths[2] * eig$vectors[1,2],
yend = b[2] + axis_lengths[2] * eig$vectors[2,2],
color = "purple",
linewidth = 1.2,
arrow = arrow(length = unit(0.2, "cm"))) +
labs(
title = "80% Confidence Region for RFlex and LFlex",
subtitle = "Eigenvector directions shown",
x = "RFlex coefficient",
y = "LFlex coefficient"
) +
theme_minimal()
cancer <- read.table("Downloads/cancer.txt", header=T)
library(dplyr)
s <- cancer |> select(where(is.numeric)) |> apply(2, sd)
s
## All.cancers Lung Colon Melanoma F.breast Pancreas
## 47.857634 14.724588 7.262243 5.245611 7.888396 1.482258
## Leukemia Ovarian Cervix Prostate Liver
## 2.255171 1.358915 1.421959 16.858436 2.241009
s |> as.data.frame() |> dplyr::arrange(desc(s))
range(s)
## [1] 1.358915 47.857634
numeric_cancer <- cancer |> select(where(is.numeric))
# cheating
pca <-
prcomp(numeric_cancer, center=F, scale=F)
wt_S <- cov(numeric_cancer)
eigen(wt_S)
## eigen() decomposition
## $values
## [1] 2538.0316069 242.2089744 75.0597300 45.8760799 25.7666346
## [6] 12.0841259 6.0599825 2.0559860 1.5425573 0.7040127
## [11] 0.6408904
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.948911183 0.048606436 0.20935669 0.10770752 0.115235991
## [2,] 0.173707714 0.668729611 -0.56080328 -0.41636726 -0.079471933
## [3,] 0.098867058 0.127018189 -0.01199414 0.54795323 -0.397823583
## [4,] 0.029715889 -0.112583098 0.14293336 -0.28359794 0.694490359
## [5,] 0.077227953 -0.127033747 0.46441765 -0.63576945 -0.555746943
## [6,] 0.015969632 0.005649558 0.06769941 0.02884992 0.032789987
## [7,] 0.019995381 -0.039163522 0.04279231 0.08030527 -0.028433176
## [8,] 0.004948948 -0.004370774 0.07143164 -0.02340922 -0.057920669
## [9,] 0.014052752 0.035246938 -0.01491943 0.05488954 0.006994686
## [10,] 0.227792155 -0.708933677 -0.62503708 -0.12892118 -0.147295540
## [11,] 0.002286687 0.008824253 0.05478220 0.05187759 -0.063138084
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.151902582 0.025821125 -0.05163051 0.03245476 -0.03184170
## [2,] 0.118220768 -0.012290183 0.08765643 -0.04797850 0.05448703
## [3,] 0.711305149 -0.086208795 0.01615325 0.01060887 0.00232018
## [4,] 0.605596140 -0.113265129 0.14449875 -0.01166668 -0.01237297
## [5,] 0.199046073 -0.015777078 0.03263704 0.05340703 0.02882697
## [6,] -0.009420867 -0.187574322 -0.18120002 -0.48984821 0.80727359
## [7,] -0.024805801 0.572854039 0.75927739 -0.08114244 0.26637074
## [8,] -0.016496386 0.008520542 0.01949652 -0.83868297 -0.43471333
## [9,] -0.026794542 -0.209819414 0.32218462 -0.17376917 -0.27853322
## [10,] 0.094279052 -0.057460006 0.03201328 -0.05044051 0.02786371
## [11,] -0.200114261 -0.753610008 0.50289278 0.10430398 0.07107011
## [,11]
## [1,] -0.0045249774
## [2,] -0.0369717911
## [3,] -0.0408123799
## [4,] -0.0502259945
## [5,] 0.0673818868
## [6,] 0.1829592658
## [7,] -0.0784928893
## [8,] -0.3128281937
## [9,] 0.8596244870
## [10,] -0.0003258794
## [11,] -0.3367980425
eigen(wt_S)$values
## [1] 2538.0316069 242.2089744 75.0597300 45.8760799 25.7666346
## [6] 12.0841259 6.0599825 2.0559860 1.5425573 0.7040127
## [11] 0.6408904
Lets consider the correlation between the score and raw data to see where each maps onto
score <- numeric_cancer |> as.matrix() %*% eigen(wt_S)$vectors
cor(score, numeric_cancer)[1:2,] |> as.data.frame(row.names = c("PC1", "PC2"))
here we see PC1 maps strongly (cor > .7) onto
All.cancers , and almost
Colon + Prostate
here we see PC2 maps strongly (cor > .7) onto
Lung , and almost
Prostate
pca$rotation[, 1:2]
## PC1 PC2
## All.cancers -0.92713211 -0.1530569566
## Lung -0.11384304 -0.6815012161
## Colon -0.08992133 -0.1367690498
## Melanoma -0.04067026 0.1121880804
## F.breast -0.20590064 0.2517498719
## Pancreas -0.02201104 0.0001258049
## Leukemia -0.02615877 0.0390810036
## Ovarian -0.02037864 0.0204077046
## Cervix -0.01256432 -0.0355731175
## Prostate -0.27084794 0.6436035596
## Liver -0.01517559 0.0049920437
s[!(names(s) %in% c("All.cancers", "Colon", "Prostate", "Lung"))] |> round(1)
## Melanoma F.breast Pancreas Leukemia Ovarian Cervix Liver
## 5.2 7.9 1.5 2.3 1.4 1.4 2.2
s[c("All.cancers", "Colon", "Prostate", "Lung")] |> round(1)
## All.cancers Colon Prostate Lung
## 47.9 7.3 16.9 14.7
scale_df <- numeric_cancer |> scale() |> as.data.frame()
scale_cov <- cov(scale_df)
scale_cov
## All.cancers Lung Colon Melanoma F.breast
## All.cancers 1.00000000 0.58875621 0.68945026 0.28681733 0.49437002
## Lung 0.58875621 1.00000000 0.52406545 -0.08097882 0.06450011
## Colon 0.68945026 0.52406545 1.00000000 -0.13422455 0.11322753
## Melanoma 0.28681733 -0.08097882 -0.13422455 1.00000000 0.34006107
## F.breast 0.49437002 0.06450011 0.11322753 0.34006107 1.00000000
## Pancreas 0.56098838 0.20702708 0.41884934 0.25593370 0.33972785
## Leukemia 0.45145940 -0.02192680 0.34181185 0.09585910 0.25929143
## Ovarian 0.19528851 -0.04672923 0.09083837 0.06103151 0.46420489
## Cervix 0.50393937 0.54862565 0.56130273 -0.08429720 -0.04964454
## Prostate 0.65589071 0.05971904 0.28605314 0.33457097 0.38179845
## Liver 0.06285748 -0.02625419 0.08898474 -0.17648425 0.06207339
## Pancreas Leukemia Ovarian Cervix Prostate
## All.cancers 0.56098838 0.45145940 0.19528851 0.50393937 0.65589071
## Lung 0.20702708 -0.02192680 -0.04672923 0.54862565 0.05971904
## Colon 0.41884934 0.34181185 0.09083837 0.56130273 0.28605314
## Melanoma 0.25593370 0.09585910 0.06103151 -0.08429720 0.33457097
## F.breast 0.33972785 0.25929143 0.46420489 -0.04964454 0.38179845
## Pancreas 1.00000000 0.09837914 0.40341323 0.38709200 0.19561036
## Leukemia 0.09837914 1.00000000 0.19003994 0.07386321 0.41411563
## Ovarian 0.40341323 0.19003994 1.00000000 0.06983663 0.02859465
## Cervix 0.38709200 0.07386321 0.06983663 1.00000000 0.10398098
## Prostate 0.19561036 0.41411563 0.02859465 0.10398098 1.00000000
## Liver 0.30483692 -0.25885521 0.09312273 0.42160105 -0.07330917
## Liver
## All.cancers 0.06285748
## Lung -0.02625419
## Colon 0.08898474
## Melanoma -0.17648425
## F.breast 0.06207339
## Pancreas 0.30483692
## Leukemia -0.25885521
## Ovarian 0.09312273
## Cervix 0.42160105
## Prostate -0.07330917
## Liver 1.00000000
cor(numeric_cancer)
## All.cancers Lung Colon Melanoma F.breast
## All.cancers 1.00000000 0.58875621 0.68945026 0.28681733 0.49437002
## Lung 0.58875621 1.00000000 0.52406545 -0.08097882 0.06450011
## Colon 0.68945026 0.52406545 1.00000000 -0.13422455 0.11322753
## Melanoma 0.28681733 -0.08097882 -0.13422455 1.00000000 0.34006107
## F.breast 0.49437002 0.06450011 0.11322753 0.34006107 1.00000000
## Pancreas 0.56098838 0.20702708 0.41884934 0.25593370 0.33972785
## Leukemia 0.45145940 -0.02192680 0.34181185 0.09585910 0.25929143
## Ovarian 0.19528851 -0.04672923 0.09083837 0.06103151 0.46420489
## Cervix 0.50393937 0.54862565 0.56130273 -0.08429720 -0.04964454
## Prostate 0.65589071 0.05971904 0.28605314 0.33457097 0.38179845
## Liver 0.06285748 -0.02625419 0.08898474 -0.17648425 0.06207339
## Pancreas Leukemia Ovarian Cervix Prostate
## All.cancers 0.56098838 0.45145940 0.19528851 0.50393937 0.65589071
## Lung 0.20702708 -0.02192680 -0.04672923 0.54862565 0.05971904
## Colon 0.41884934 0.34181185 0.09083837 0.56130273 0.28605314
## Melanoma 0.25593370 0.09585910 0.06103151 -0.08429720 0.33457097
## F.breast 0.33972785 0.25929143 0.46420489 -0.04964454 0.38179845
## Pancreas 1.00000000 0.09837914 0.40341323 0.38709200 0.19561036
## Leukemia 0.09837914 1.00000000 0.19003994 0.07386321 0.41411563
## Ovarian 0.40341323 0.19003994 1.00000000 0.06983663 0.02859465
## Cervix 0.38709200 0.07386321 0.06983663 1.00000000 0.10398098
## Prostate 0.19561036 0.41411563 0.02859465 0.10398098 1.00000000
## Liver 0.30483692 -0.25885521 0.09312273 0.42160105 -0.07330917
## Liver
## All.cancers 0.06285748
## Lung -0.02625419
## Colon 0.08898474
## Melanoma -0.17648425
## F.breast 0.06207339
## Pancreas 0.30483692
## Leukemia -0.25885521
## Ovarian 0.09312273
## Cervix 0.42160105
## Prostate -0.07330917
## Liver 1.00000000
eigen(scale_cov)
## eigen() decomposition
## $values
## [1] 3.64050939 2.01901973 1.48934946 1.02774399 0.85217841 0.64292122
## [7] 0.45622412 0.38436246 0.26127071 0.18558497 0.04083554
##
## $vectors
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.5010584 -0.03752827 0.13564906 0.10200653 0.00661866 -0.10905402
## [2,] 0.2867743 0.34463886 0.24975314 0.12961783 0.53167942 -0.28217876
## [3,] 0.3905015 0.24854728 0.20719239 -0.20988812 -0.03157362 0.11291776
## [4,] 0.1316843 -0.41857218 -0.05859752 0.60913595 0.15614063 0.38162735
## [5,] 0.2764510 -0.37218720 -0.27254464 -0.03686353 0.14210375 -0.57513296
## [6,] 0.3555025 0.02591798 -0.38107453 0.09964642 0.07993039 0.45035470
## [7,] 0.2406797 -0.28184171 0.30247121 -0.48821395 -0.28970563 0.27410672
## [8,] 0.1834695 -0.18665306 -0.48476086 -0.49457994 0.29556288 0.06044586
## [9,] 0.3183348 0.43674873 -0.03254542 0.07805234 -0.10765380 0.18024636
## [10,] 0.3061206 -0.30469450 0.22752557 0.19475514 -0.44400656 -0.24994785
## [11,] 0.0861796 0.32523824 -0.52350067 0.13936291 -0.53413119 -0.20674709
## [,7] [,8] [,9] [,10] [,11]
## [1,] 0.04374209 0.006792296 0.11185970 -0.18508041 0.811210603
## [2,] -0.14537839 0.007252804 0.29034347 -0.33824243 -0.370950840
## [3,] 0.37193300 0.131962267 -0.70550259 -0.07120336 -0.162640106
## [4,] -0.30791519 0.098058275 -0.31623819 -0.21670726 -0.107522730
## [5,] -0.04175042 0.414550768 -0.10494771 0.40179558 -0.111317074
## [6,] 0.50502008 0.098916420 0.43699136 0.16737802 -0.157430047
## [7,] -0.33568094 0.364797251 0.27142205 -0.15902722 -0.170336907
## [8,] -0.19263901 -0.503971154 -0.13826954 -0.22490690 0.009368924
## [9,] -0.54255118 -0.168849994 -0.04772674 0.57564539 0.012902563
## [10,] 0.15577866 -0.581436598 0.09082973 0.03419142 -0.303946738
## [11,] -0.13637156 0.193887091 -0.01248343 -0.44882159 -0.086554294
score <- scale_df |> as.matrix() %*% eigen(scale_cov)$vectors
cor(score, scale_df)[1:2, ] |> as.data.frame(row.names = c("PC1", "PC2"))
library(dplyr)
df <- read.csv("Downloads/farmers.csv")
df <- df |> select(Family, DistRD, Cattle)
plot(df$Family, df$DistRD)
Remove outliers
df <- df |> arrange(desc(Family)) |> slice(-1) |> arrange(desc(DistRD)) |> slice(-(1:2))
plot(df$Family, df$DistRD)
plot(df$DistRD, df$Cattle)
df <- df |> arrange(desc(Cattle)) |> slice(-1)
plot(df$DistRD, df$Cattle)
# standardize data -- center 0, sd 1
df <- scale(df) |> as.data.frame()
# verify
lapply(df, sd)
## $Family
## [1] 1
##
## $DistRD
## [1] 1
##
## $Cattle
## [1] 1
print("-")
## [1] "-"
lapply(df, mean)
## $Family
## [1] 8.516779e-17
##
## $DistRD
## [1] -2.037944e-16
##
## $Cattle
## [1] 2.433366e-17
recall
\[ y=\Gamma(x-\mu) \]except we’ve already centered our data so \(\mu=0\)
S <- cov(df)
eigen(S)$values
## [1] 1.5550096 1.0068188 0.4381716
sqrt(eigen(S)$values)
## [1] 1.2470003 1.0034036 0.6619453
eigen(S)$vectors
## [,1] [,2] [,3]
## [1,] 0.70557081 -0.10127783 0.7013648
## [2,] 0.02986413 0.99310480 0.1133622
## [3,] 0.70800986 0.05903939 -0.7037303
Verify :
prcomp(df)
## Standard deviations (1, .., p=3):
## [1] 1.2470003 1.0034036 0.6619453
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## Family -0.70557081 -0.10127783 -0.7013648
## DistRD -0.02986413 0.99310480 -0.1133622
## Cattle -0.70800986 0.05903939 0.7037303
prcomp(df) |> summary()
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.2470 1.0034 0.6619
## Proportion of Variance 0.5183 0.3356 0.1461
## Cumulative Proportion 0.5183 0.8539 1.0000
library(tidyverse)
library(plotly)
df <-
read.csv("Downloads/farmers.csv") |>
select(Family, DistRD, Cattle) |>
arrange(desc(Family)) |> slice(-1) |>
arrange(desc(DistRD)) |> slice(-c(1:2)) |>
arrange(desc(Cattle)) |> slice(-1)
plot_ly(
data = df,
x = ~Family,
y = ~DistRD,
z = ~Cattle,
type = "scatter3d",
mode = "markers"
) |>
layout(
scene = list(
xaxis = list(title = "Family"),
yaxis = list(title = "DistRD"),
zaxis = list(title = "Cattle")
)
)
okay here we dont standardize the data
we can see there are 2 clusters
most of the variation appears in the DistRD direction and Family direction and little in cattle
Lets see if this obs remains consistent with loadings
Before this lets investigate covariance matrix
cov(df)
## Family DistRD Cattle
## Family 385.68113 -15.865202 69.557268
## DistRD -15.86520 575.979547 8.730784
## Cattle 69.55727 8.730784 40.796804
DistRD with the largest variance
and Family with less but comparable amt of variation—and
Cattle with relatively little. Therefore, we should expect
the linear combination of the \(X_i \in X \in
R^{3x1}\) to have the large variance directions dominate.# center data
xbar <- colMeans(df) |> as.matrix()
Xbar <- xbar %*% rep(1, nrow(df)) |> as.matrix() |> t()
center <- df - Xbar
pc_comp <- prcomp(center)$rotation
pc_comp
## PC1 PC2 PC3
## Family 0.080418438 0.97823884 0.19126327
## DistRD -0.996744351 0.07780729 0.02113583
## Cattle -0.005794211 0.19234030 -0.98131118
lambda <- prcomp(center)$sdev^2
Lets verify properties of our PCs :
t(pc_comp[,1]) %*% pc_comp[,2]
## [,1]
## [1,] 1.415412e-18
t(pc_comp[,1]) %*% pc_comp[,3]
## [,1]
## [1,] -5.189965e-17
t(pc_comp[,2]) %*% pc_comp[,3]
## [,1]
## [1,] 1.856125e-17
t(pc_comp) %*% pc_comp
## PC1 PC2 PC3
## PC1 1.000000e+00 1.415412e-18 -5.189965e-17
## PC2 1.415412e-18 1.000000e+00 1.856125e-17
## PC3 -5.189965e-17 1.856125e-17 1.000000e+00
colSums(pc_comp^2)
## PC1 PC2 PC3
## 1 1 1
PC1 <- pc_comp[,1]
PC2 <- pc_comp[,2]
PC3 <- pc_comp[,3]
# scale by eigen values (most to least variance)
s1 <- lambda[1] * PC1
s2 <- lambda[2] * PC2
s3 <- lambda[3] * PC3
plot_ly(
data = center,
x = ~Family,
y = ~DistRD,
z = ~Cattle,
type = "scatter3d",
mode = "markers"
) |>
add_trace(
x = c(-s1[1], s1[1]),
y = c(-s1[2], s1[2]),
z = c(-s1[3], s1[3]),
type = "scatter3d",
mode = "lines",
name = "PC1"
) |>
add_trace(
x = c(-s2[1], s2[1]),
y = c(-s2[2], s2[2]),
z = c(-s2[3], s2[3]),
type = "scatter3d",
mode = "lines",
name = "PC2"
) |>
add_trace(
x = c(-s3[1], s3[1]),
y = c(-s3[2], s3[2]),
z = c(-s3[3], s3[3]),
type = "scatter3d",
mode = "lines",
name = "PC3"
) |>
layout(
scene = list(
xaxis = list(title = "Family"),
yaxis = list(title = "DistRD"),
zaxis = list(title = "Cattle")
)
)
center_scale <- scale(center) |> as.data.frame()
pc_comp <- prcomp(center_scale)$rotation
pc_comp
## PC1 PC2 PC3
## Family -0.70557081 -0.10127783 -0.7013648
## DistRD -0.02986413 0.99310480 -0.1133622
## Cattle -0.70800986 0.05903939 0.7037303
lambda <- prcomp(center_scale)$sdev^2
PC1 <- pc_comp[,1]
PC2 <- pc_comp[,2]
PC3 <- pc_comp[,3]
# scale by eigen values (most to least variance)
s1 <- lambda[1] * PC1
s2 <- lambda[2] * PC2
s3 <- lambda[3] * PC3
plot_ly(
data = center_scale,
x = ~Family,
y = ~DistRD,
z = ~Cattle,
type = "scatter3d",
mode = "markers"
) |>
add_trace(
x = c(-s1[1], s1[1]),
y = c(-s1[2], s1[2]),
z = c(-s1[3], s1[3]),
type = "scatter3d",
mode = "lines",
name = "PC1"
) |>
add_trace(
x = c(-s2[1], s2[1]),
y = c(-s2[2], s2[2]),
z = c(-s2[3], s2[3]),
type = "scatter3d",
mode = "lines",
name = "PC2"
) |>
add_trace(
x = c(-s3[1], s3[1]),
y = c(-s3[2], s3[2]),
z = c(-s3[3], s3[3]),
type = "scatter3d",
mode = "lines",
name = "PC3"
) |>
layout(
scene = list(
xaxis = list(title = "Family"),
yaxis = list(title = "DistRD"),
zaxis = list(title = "Cattle")
)
)
itm <- c("A", "B", "C", "D")
x1 <- c(5, 1, -1, 3)
x2 <- c(4,-2,1,1)
df <- data.frame(itm, x1, x2)
plot(x1, x2)
text(x1, x2, labels = itm, pos = 4)
stats package^ (colored by initial
clustering)# manually
df <- cbind(df, cluster=rep(c("C1","C2"), each=2))
df
plot(x1, x2, col = c("red", "blue")[factor(df$cluster)])
text(x1, x2, labels = itm, pos = 2)
points(c(3, 1),
c(1, 1),
pch = 8,
cex = 2,
col=c("red", "blue")[factor(df$cluster)])
df$cluster <- c("C1", rep("C2", 3))
plot(df$x1, df$x2,
col = c("red", "blue")[factor(df$cluster)],
pch = 19)
text(df$x1, df$x2,
labels = df$itm,
pos = 2)
# centroids
points(c(5, 1),
c(4, 0),
pch = 8,
cex = 2,
col = c("red", "blue"))
df$cluster <- rep(c("C1", "C2"), 2)
df
plot(df$x1, df$x2,
col = c("red", "blue")[factor(df$cluster)],
pch = 19)
text(df$x1, df$x2,
labels = df$itm,
pos = 2)
# centroids
points(c(2, 2),
c(2.5, -0.5),
pch = 8,
cex = 2,
col = c("red", "blue"))
groups = {(A, C), (B, D)}
the results are not the same – neither the centroids nor membership is the same.
here we see equivalent “tie” in distances, which i take as staying with the same membership. Meaning both pts {C, D} are equally distant from the centroids of {C1, C2}
df$cluster <- rep(c("C1","C2"), each = 2)
plot(df$x1, df$x2,
col = c("red", "blue")[factor(df$cluster)],
pch = 19)
text(df$x1, df$x2,
labels = df$itm,
pos = 2)
# centroids
points(c(3, 1),
c(1, 1),
pch = 8,
cex = 2,
col = c("red", "blue"))
df$cluster <- c("C1", "C1", "C2", "C1")
plot(df$x1, df$x2,
col = c("red", "blue")[factor(df$cluster)],
pch = 19,
xlab = "x1",
ylab = "x2")
text(df$x1, df$x2,
labels = df$itm,
pos = 2)
# centroids
points(c(3, -1),
c(1, 1),
pch = 8,
cex = 2,
col = c("red", "blue"))
text(c(3, -1),
c(1, 1),
labels = c("m1", "m2"),
pos = 4)
the solutions are not the same.
we notice different ending centroids and a switch between membership
notice that the groups switched
indicating that the initial conditions matter a lot for this algorithm to work
X <- df[,-c(1,4)]
km <- kmeans(X, centers = 2, algorithm="Lloyd")
km
## K-means clustering with 2 clusters of sizes 2, 2
##
## Cluster means:
## x1 x2
## 1 4 2.5
## 2 0 -0.5
##
## Clustering vector:
## [1] 1 2 2 1
##
## Within cluster sum of squares by cluster:
## [1] 6.5 6.5
## (between_SS / total_SS = 65.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
df$cluster <- c("C1", "C2", "C2", "C1")
plot(df$x1, df$x2,
col = c("red", "blue")[factor(df$cluster)],
pch = 19,
xlab = "x1",
ylab = "x2")
text(df$x1, df$x2,
labels = df$itm,
pos = 2)
# centroids
points(c(4, 0),
c(2.5, -0.5),
pch = 8,
cex = 2,
col = c("red", "blue"))
text(c(4, 0),
c(2.5, -0.5),
labels = c("m1", "m2"),
pos = 4)
x <- c(5, 9, 8, 4, 7) # observed number of heads
m <- 10 # number of flips per experiment
K <- 2 # two latent coins: A and B
# initial guesses for:
# pi = mixing probabilities
# theta = coin biases
pi <- c(0.5, 0.5)
theta <- c(0.6, 0.5)
# keep track of parameter history
history <- data.frame()
for(iter in 1:100){
# -------------------
# E-step
# -------------------
# Compute q_ik:
# probability observation i came from coin k
q <- sapply(1:K, function(k){
pi[k] *
dbinom(
x,
size = m,
prob = theta[k]
)
})
# normalize rows so probabilities sum to 1
q <- q / rowSums(q)
# -------------------
# M-step
# -------------------
# Update mixing probabilities
pi_new <- colMeans(q)
# Update coin biases
theta_new <- sapply(1:K, function(k){
sum(q[, k] * x) /
sum(q[, k] * m)
})
# save iteration results
history <- rbind(
history,
data.frame(
iter = iter,
pi1 = pi_new[1],
pi2 = pi_new[2],
theta1 = theta_new[1],
theta2 = theta_new[2]
)
)
# largest parameter change
delta <-
max(
abs(
c(pi_new, theta_new) -
c(pi, theta)
)
)
# stop when parameters stop changing
if(delta < 1e-6){
cat("Converged after", iter, "iterations\n")
break
}
# replace old parameters with updated values
pi <- pi_new
theta <- theta_new
}
## Converged after 52 iterations
# final estimates
pi
## [1] 0.5227561 0.4772439
theta
## [1] 0.7933666 0.5139149
# final responsibilities
q
## [,1] [,2]
## [1,] 0.1176410 0.88235904
## [2,] 0.9586597 0.04134027
## [3,] 0.8646000 0.13539998
## [4,] 0.0354128 0.96458720
## [5,] 0.6374627 0.36253725
# parameter path
history
plot(
history$iter,
history$theta1,
type = "b",
ylim = range(history[, c("theta1", "theta2")]),
ylab = expression(theta),
xlab = "Iteration",
col="red"
)
lines(
history$iter,
history$theta2,
type = "b",
lty = 2,
col="blue"
)
plot(
history$iter,
history$pi1,
type = "b",
ylim = range(history[, c("pi1", "pi2")]),
ylab = expression(pi),
xlab = "Iteration",
col = "red",
pch = 19
)
lines(
history$iter,
history$pi2,
type = "b",
lty = 2,
col = "blue",
pch = 19
)
rm(list=ls())
set.seed(123)
n <- 100
data <- rnorm(n, mean=10, sd = 2)
intercept <- rep(1, n)
signal <- data*4 + intercept
xbar <- mean(signal); xbar
## [1] 41.72325
sd_signal <- sd(signal); sd_signal
## [1] 7.302527
# simulated data -- w/ just signal
hist(signal,
probability = TRUE,
col = "lightgray")
# true underlyinh model
curve(
dnorm(x, mean = 4*10 + 1, sd = 8),
add = TRUE,
col = "red",
lwd = 3
)
abline(v= 4*10 + 1, col="red")
abline(v= 4*10 + 1 + 8 , col="blue")
abline(v= 4*10 + 1 - 8 , col="blue")
# choose variable noise is
noise_var <- 5
noise <- rnorm(n, mean=0, sd = noise_var)
real_data <- signal+noise
df <-
data.frame(x=data, signal, noise, y=real_data)
df[,c("x", "y")] |> plot()
df[,c("x", "y")] |> var()
## x y
## x 3.332931 12.89451
## y 12.894515 73.20579
df[,c("x", "y")] |> cov() |> eigen(symmetric=T)
## eigen() decomposition
## $values
## [1] 75.509431 1.029294
##
## $vectors
## [,1] [,2]
## [1,] 0.1758680 -0.9844138
## [2,] 0.9844138 0.1758680
df[,c("x", "y")] |> cor()
## x y
## x 1.0000000 0.8255038
## y 0.8255038 1.0000000
plot(df$x, df$signal)
lm(df$x ~ df$signal)
##
## Call:
## lm(formula = df$x ~ df$signal)
##
## Coefficients:
## (Intercept) df$signal
## -0.25 0.25
mdl <- lm(df$x ~ df$y)
mdl
##
## Call:
## lm(formula = df$x ~ df$y)
##
## Coefficients:
## (Intercept) df$y
## 2.9264 0.1761
plot(x ~ y, data=df)
abline(mdl, col="red")
xy <- df[,c("x", "y")]
mns <- xy |> colMeans() |> as.matrix() |> t()
mns <- mns[rep(1,n),]
centered <- xy-mns
eig <- eigen(cov(centered))
eig_val <- eig$values
eig_v <- eig$vectors
scale <- sqrt(eig_val)
# ellipse
theta <- seq(0, 2*pi, length.out = 200)
# unit circle
circle <- rbind(cos(theta), sin(theta))
# transform by covariance eigendecomposition -- scale by sd of eig
ellipse <- eig_v %*% diag(sqrt(eig_val)) %*% circle
plot(centered$x, centered$y, col="grey",xlab="x",ylab="y", asp = 1)
points(0, 0, pch = 16, cex = 1, col = "red")
arrows(0, 0,
scale[1] * eig_v[1,1],
scale[1] * eig_v[2,1],
col = "red", lwd = 2, length = 0.1)
arrows(0, 0,
scale[2] * eig_v[1,2],
scale[2] * eig_v[2,2],
col = "blue", lwd = 2, length = 0.1)
lines(ellipse[1, ], ellipse[2, ], col = "darkgreen", lwd = 2)
generate_linear_data()generate_linear_data <- function(
n,
sigma_x,
mu_x,
y = "4*x+1",
sigma_n,
b_int = 0,
mu_n = 0
) {
# generate x
x <- rnorm(n, mean = mu_x, sd = sigma_x)
# make x available inside eval()
x_env <- list2env(list(x = x))
# evaluate signal expression
signal <- eval(parse(text = y), envir = x_env)
# optional intercept adjustment
signal <- signal + b_int
# generate noise
noise <- rnorm(n, mean = mu_n, sd = sigma_n)
# observed response
y_obs <- signal + noise
data.frame(
x = x,
signal = signal,
noise = noise,
y = y_obs
)
}
set.seed(123)
# test original example
df <- generate_linear_data(
n = 100,
sigma_x = 2,
mu_x = 10,
y = "4*x+1",
sigma_n = 5
)
plot_density_ellipse()plot_density_ellipse <- function(df) {
xy <- df[, c("x", "y")]
centered <- sweep(
xy,
2,
colMeans(xy),
"-"
)
eig <- eigen(cov(centered))
eig_val <- eig$values
eig_v <- eig$vectors
scale <- sqrt(eig_val)
theta <- seq(
0,
2*pi,
length.out = 200
)
circle <- rbind(
cos(theta),
sin(theta)
)
ellipse <-
eig_v %*%
diag(scale) %*%
circle
plot(
centered$x,
centered$y,
col = "grey",
xlab = "x",
ylab = "y",
asp = 1
)
points(
0,
0,
pch = 16,
col = "red"
)
arrows(
0,0,
scale[1]*eig_v[1,1],
scale[1]*eig_v[2,1],
col="red",
lwd=2,
length=.1
)
arrows(
0,0,
scale[2]*eig_v[1,2],
scale[2]*eig_v[2,2],
col="blue",
lwd=2,
length=.1
)
lines(
ellipse[1,],
ellipse[2,],
col="darkgreen",
lwd=2
)
invisible(
list(
centered = centered,
eig_values = eig_val,
eig_vectors = eig_v
)
)
}
plot_density_ellipse(df)