Website :

https://rpubs.com/Isaiah-Mireles/1436183

i highly suggest using the website version as its much easier to navigate and allows you to play with 3D graphics

knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)

1 Q1) Real World Application

library(faraway)
data(punting)

1.1 a)

m1 <- lm(Distance~RStr+LStr+RFlex+LFlex, data=punting)
print("Coef")

## [1] "Coef"

m1$coefficients

## (Intercept)        RStr        LStr       RFlex       LFlex 
## -79.6236456   0.5116373  -0.1861996   2.3745010  -0.5277338

print("--")

## [1] "--"

print("Covariance Matrix")

## [1] "Covariance Matrix"

summary(m1)$cov.unscaled

##               (Intercept)          RStr          LStr         RFlex
## (Intercept) 16.1396579363  0.0002123785  0.0378939380 -0.2123867649
## RStr         0.0002123785  0.0008844524 -0.0006380244 -0.0007371097
## LStr         0.0378939380 -0.0006380244  0.0009870667 -0.0010113169
## RFlex       -0.2123867649 -0.0007371097 -0.0010113169  0.0077499755
## LFlex       -0.0133860443  0.0003449921  0.0001219639 -0.0030130930
##                     LFlex
## (Intercept) -0.0133860443
## RStr         0.0003449921
## LStr         0.0001219639
## RFlex       -0.0030130930
## LFlex        0.0025563651

1.2 b)

confint(m1, level=.8)

##                     10 %       90 %
## (Intercept) -171.2456955 11.9984043
## RStr          -0.1666131  1.1898877
## LStr          -0.9027160  0.5303167
## RFlex          0.3667817  4.3822204
## LFlex         -1.6808266  0.6253591

the Intercept seems significantly different from 0 and RFlex is the only other which doesnt contain 0.

1.3 c)

punting

library(car)
linearHypothesis(m1, "RFlex = LFlex")

After accounting for right and left strength, there is not enough evidence that right flexibility and left flexibility have different effects on punt distance.

\[ H_o : \beta_L=\beta_R \\ H_a : \beta_L\ne\beta_R \]

# doing it manually  : 
X <- model.matrix(m1)
y <- punting$Distance

bhat <- solve(t(X) %*% X) %*% t(X) %*% y
A <- matrix(c(0, 0, 0, 1, -1), nrow = 1) # b3=b4 or b3-b4 = 0

sigma2hat <- sum(resid(m1)^2) / df.residual(m1)

Fstat <- t(A %*% bhat) %*%
  solve(A %*% solve(t(X) %*% X) %*% t(A)) %*%
  (A %*% bhat) / (1 * sigma2hat)
Fstat

##          [,1]
## [1,] 1.934568

# p-val
pf(Fstat, df1 = 1, df2 = df.residual(m1), lower.tail = FALSE)

##           [,1]
## [1,] 0.2017235

1.4 d)

citation : Sec. 4.4.2, p. 116, Introduction to Regression Modeling (B. Abraham and J. Ledolter)

paste0("F-Stat = ",summary(m1)$fstatistic[1]|>round(2))

## [1] "F-Stat = 5.59"

Residual standard error: 16.33 on 8 degrees of freedom
Multiple R-squared:  0.7365,    Adjusted R-squared:  0.6047 
F-statistic:  5.59 on 4 and 8 DF,  p-value: 0.01902

as we can see, atleast one of the coef is NOT 0 (p-value: 0.01902)

1.5 e)

b <- coef(m1)[c("RFlex", "LFlex")]
b

##      RFlex      LFlex 
##  2.3745010 -0.5277338

S <- vcov(m1)[c("RFlex", "LFlex"),
              c("RFlex", "LFlex")]
S

##            RFlex      LFlex
## RFlex  2.0659892 -0.8032306
## LFlex -0.8032306  0.6814760

n <- nrow(punting)
p <- length(coef(m1))

cval <- 2 * qf(0.80, df1 = 2, df2 = n - p)

theta <- seq(0, 2*pi, length.out = 200)

eig <- eigen(S)

A <- eig$vectors %*%
     diag(sqrt(eig$values * cval))

circle <- rbind(cos(theta), sin(theta))

ellipse <- t(b + A %*% circle)

ellipse <- as.data.frame(ellipse)
names(ellipse) <- c("RFlex", "LFlex")

library(ggplot2)
library(grid)

# semi-axis lengths
axis_lengths <- sqrt(eig$values * cval)

ggplot(ellipse, aes(RFlex, LFlex)) +

  # ellipse
  geom_path(color = "blue", linewidth = 1) +

  # center point
  annotate("point",
           x = b[1],
           y = b[2],
           color = "red",
           size = 3) +

  # first eigenvector
  annotate("segment",
           x = b[1],
           y = b[2],
           xend = b[1] + axis_lengths[1] * eig$vectors[1,1],
           yend = b[2] + axis_lengths[1] * eig$vectors[2,1],
           color = "darkgreen",
           linewidth = 1.2,
           arrow = arrow(length = unit(0.2, "cm"))) +

  # second eigenvector
  annotate("segment",
           x = b[1],
           y = b[2],
           xend = b[1] + axis_lengths[2] * eig$vectors[1,2],
           yend = b[2] + axis_lengths[2] * eig$vectors[2,2],
           color = "purple",
           linewidth = 1.2,
           arrow = arrow(length = unit(0.2, "cm"))) +

  labs(
    title = "80% Confidence Region for RFlex and LFlex",
    subtitle = "Eigenvector directions shown",
    x = "RFlex coefficient",
    y = "LFlex coefficient"
  ) +

  theme_minimal()

2 Q2) Real World Application

cancer <- read.table("Downloads/cancer.txt", header=T)

2.1 a)

library(dplyr)
s <- cancer |> select(where(is.numeric)) |> apply(2, sd)
s

## All.cancers        Lung       Colon    Melanoma    F.breast    Pancreas 
##   47.857634   14.724588    7.262243    5.245611    7.888396    1.482258 
##    Leukemia     Ovarian      Cervix    Prostate       Liver 
##    2.255171    1.358915    1.421959   16.858436    2.241009

s  |> as.data.frame() |> dplyr::arrange(desc(s))

range(s)

## [1]  1.358915 47.857634

notice that the variation of the data is highly variet with a range of
1.358915, 47.857634
suggesting non-scaled var may dominate directions

2.2 b)

numeric_cancer <- cancer |> select(where(is.numeric))

# cheating
pca <- 
  prcomp(numeric_cancer, center=F, scale=F)

2.2.1 Manually

wt_S <- cov(numeric_cancer)
eigen(wt_S)

## eigen() decomposition
## $values
##  [1] 2538.0316069  242.2089744   75.0597300   45.8760799   25.7666346
##  [6]   12.0841259    6.0599825    2.0559860    1.5425573    0.7040127
## [11]    0.6408904
## 
## $vectors
##              [,1]         [,2]        [,3]        [,4]         [,5]
##  [1,] 0.948911183  0.048606436  0.20935669  0.10770752  0.115235991
##  [2,] 0.173707714  0.668729611 -0.56080328 -0.41636726 -0.079471933
##  [3,] 0.098867058  0.127018189 -0.01199414  0.54795323 -0.397823583
##  [4,] 0.029715889 -0.112583098  0.14293336 -0.28359794  0.694490359
##  [5,] 0.077227953 -0.127033747  0.46441765 -0.63576945 -0.555746943
##  [6,] 0.015969632  0.005649558  0.06769941  0.02884992  0.032789987
##  [7,] 0.019995381 -0.039163522  0.04279231  0.08030527 -0.028433176
##  [8,] 0.004948948 -0.004370774  0.07143164 -0.02340922 -0.057920669
##  [9,] 0.014052752  0.035246938 -0.01491943  0.05488954  0.006994686
## [10,] 0.227792155 -0.708933677 -0.62503708 -0.12892118 -0.147295540
## [11,] 0.002286687  0.008824253  0.05478220  0.05187759 -0.063138084
##               [,6]         [,7]        [,8]        [,9]       [,10]
##  [1,] -0.151902582  0.025821125 -0.05163051  0.03245476 -0.03184170
##  [2,]  0.118220768 -0.012290183  0.08765643 -0.04797850  0.05448703
##  [3,]  0.711305149 -0.086208795  0.01615325  0.01060887  0.00232018
##  [4,]  0.605596140 -0.113265129  0.14449875 -0.01166668 -0.01237297
##  [5,]  0.199046073 -0.015777078  0.03263704  0.05340703  0.02882697
##  [6,] -0.009420867 -0.187574322 -0.18120002 -0.48984821  0.80727359
##  [7,] -0.024805801  0.572854039  0.75927739 -0.08114244  0.26637074
##  [8,] -0.016496386  0.008520542  0.01949652 -0.83868297 -0.43471333
##  [9,] -0.026794542 -0.209819414  0.32218462 -0.17376917 -0.27853322
## [10,]  0.094279052 -0.057460006  0.03201328 -0.05044051  0.02786371
## [11,] -0.200114261 -0.753610008  0.50289278  0.10430398  0.07107011
##               [,11]
##  [1,] -0.0045249774
##  [2,] -0.0369717911
##  [3,] -0.0408123799
##  [4,] -0.0502259945
##  [5,]  0.0673818868
##  [6,]  0.1829592658
##  [7,] -0.0784928893
##  [8,] -0.3128281937
##  [9,]  0.8596244870
## [10,] -0.0003258794
## [11,] -0.3367980425

here we see 11 PCs, whereby the variation

eigen(wt_S)$values

##  [1] 2538.0316069  242.2089744   75.0597300   45.8760799   25.7666346
##  [6]   12.0841259    6.0599825    2.0559860    1.5425573    0.7040127
## [11]    0.6408904

here we dramatically different variations

Lets consider the correlation between the score and raw data to see where each maps onto

score <- numeric_cancer |> as.matrix() %*% eigen(wt_S)$vectors
cor(score, numeric_cancer)[1:2,] |> as.data.frame(row.names = c("PC1", "PC2"))

here we see PC1 maps strongly (cor > .7) onto All.cancers , and almost Colon + Prostate
here we see PC2 maps strongly (cor > .7) onto Lung , and almost Prostate

pca$rotation[, 1:2]

##                     PC1           PC2
## All.cancers -0.92713211 -0.1530569566
## Lung        -0.11384304 -0.6815012161
## Colon       -0.08992133 -0.1367690498
## Melanoma    -0.04067026  0.1121880804
## F.breast    -0.20590064  0.2517498719
## Pancreas    -0.02201104  0.0001258049
## Leukemia    -0.02615877  0.0390810036
## Ovarian     -0.02037864  0.0204077046
## Cervix      -0.01256432 -0.0355731175
## Prostate    -0.27084794  0.6436035596
## Liver       -0.01517559  0.0049920437

s[!(names(s) %in% c("All.cancers", "Colon", "Prostate", "Lung"))] |> round(1)

## Melanoma F.breast Pancreas Leukemia  Ovarian   Cervix    Liver 
##      5.2      7.9      1.5      2.3      1.4      1.4      2.2

s[c("All.cancers", "Colon", "Prostate", "Lung")] |> round(1)

## All.cancers       Colon    Prostate        Lung 
##        47.9         7.3        16.9        14.7

and as we can see, the ones it maps onto are normally the ones with large sd – showing how scale messing things up.

2.3 c)

scale_df <- numeric_cancer |> scale() |> as.data.frame()
scale_cov <- cov(scale_df) 
scale_cov

##             All.cancers        Lung       Colon    Melanoma    F.breast
## All.cancers  1.00000000  0.58875621  0.68945026  0.28681733  0.49437002
## Lung         0.58875621  1.00000000  0.52406545 -0.08097882  0.06450011
## Colon        0.68945026  0.52406545  1.00000000 -0.13422455  0.11322753
## Melanoma     0.28681733 -0.08097882 -0.13422455  1.00000000  0.34006107
## F.breast     0.49437002  0.06450011  0.11322753  0.34006107  1.00000000
## Pancreas     0.56098838  0.20702708  0.41884934  0.25593370  0.33972785
## Leukemia     0.45145940 -0.02192680  0.34181185  0.09585910  0.25929143
## Ovarian      0.19528851 -0.04672923  0.09083837  0.06103151  0.46420489
## Cervix       0.50393937  0.54862565  0.56130273 -0.08429720 -0.04964454
## Prostate     0.65589071  0.05971904  0.28605314  0.33457097  0.38179845
## Liver        0.06285748 -0.02625419  0.08898474 -0.17648425  0.06207339
##               Pancreas    Leukemia     Ovarian      Cervix    Prostate
## All.cancers 0.56098838  0.45145940  0.19528851  0.50393937  0.65589071
## Lung        0.20702708 -0.02192680 -0.04672923  0.54862565  0.05971904
## Colon       0.41884934  0.34181185  0.09083837  0.56130273  0.28605314
## Melanoma    0.25593370  0.09585910  0.06103151 -0.08429720  0.33457097
## F.breast    0.33972785  0.25929143  0.46420489 -0.04964454  0.38179845
## Pancreas    1.00000000  0.09837914  0.40341323  0.38709200  0.19561036
## Leukemia    0.09837914  1.00000000  0.19003994  0.07386321  0.41411563
## Ovarian     0.40341323  0.19003994  1.00000000  0.06983663  0.02859465
## Cervix      0.38709200  0.07386321  0.06983663  1.00000000  0.10398098
## Prostate    0.19561036  0.41411563  0.02859465  0.10398098  1.00000000
## Liver       0.30483692 -0.25885521  0.09312273  0.42160105 -0.07330917
##                   Liver
## All.cancers  0.06285748
## Lung        -0.02625419
## Colon        0.08898474
## Melanoma    -0.17648425
## F.breast     0.06207339
## Pancreas     0.30483692
## Leukemia    -0.25885521
## Ovarian      0.09312273
## Cervix       0.42160105
## Prostate    -0.07330917
## Liver        1.00000000

scaled covariance matrix (ie correlation matrix – seen below)

cor(numeric_cancer)

##             All.cancers        Lung       Colon    Melanoma    F.breast
## All.cancers  1.00000000  0.58875621  0.68945026  0.28681733  0.49437002
## Lung         0.58875621  1.00000000  0.52406545 -0.08097882  0.06450011
## Colon        0.68945026  0.52406545  1.00000000 -0.13422455  0.11322753
## Melanoma     0.28681733 -0.08097882 -0.13422455  1.00000000  0.34006107
## F.breast     0.49437002  0.06450011  0.11322753  0.34006107  1.00000000
## Pancreas     0.56098838  0.20702708  0.41884934  0.25593370  0.33972785
## Leukemia     0.45145940 -0.02192680  0.34181185  0.09585910  0.25929143
## Ovarian      0.19528851 -0.04672923  0.09083837  0.06103151  0.46420489
## Cervix       0.50393937  0.54862565  0.56130273 -0.08429720 -0.04964454
## Prostate     0.65589071  0.05971904  0.28605314  0.33457097  0.38179845
## Liver        0.06285748 -0.02625419  0.08898474 -0.17648425  0.06207339
##               Pancreas    Leukemia     Ovarian      Cervix    Prostate
## All.cancers 0.56098838  0.45145940  0.19528851  0.50393937  0.65589071
## Lung        0.20702708 -0.02192680 -0.04672923  0.54862565  0.05971904
## Colon       0.41884934  0.34181185  0.09083837  0.56130273  0.28605314
## Melanoma    0.25593370  0.09585910  0.06103151 -0.08429720  0.33457097
## F.breast    0.33972785  0.25929143  0.46420489 -0.04964454  0.38179845
## Pancreas    1.00000000  0.09837914  0.40341323  0.38709200  0.19561036
## Leukemia    0.09837914  1.00000000  0.19003994  0.07386321  0.41411563
## Ovarian     0.40341323  0.19003994  1.00000000  0.06983663  0.02859465
## Cervix      0.38709200  0.07386321  0.06983663  1.00000000  0.10398098
## Prostate    0.19561036  0.41411563  0.02859465  0.10398098  1.00000000
## Liver       0.30483692 -0.25885521  0.09312273  0.42160105 -0.07330917
##                   Liver
## All.cancers  0.06285748
## Lung        -0.02625419
## Colon        0.08898474
## Melanoma    -0.17648425
## F.breast     0.06207339
## Pancreas     0.30483692
## Leukemia    -0.25885521
## Ovarian      0.09312273
## Cervix       0.42160105
## Prostate    -0.07330917
## Liver        1.00000000

eigen(scale_cov)

## eigen() decomposition
## $values
##  [1] 3.64050939 2.01901973 1.48934946 1.02774399 0.85217841 0.64292122
##  [7] 0.45622412 0.38436246 0.26127071 0.18558497 0.04083554
## 
## $vectors
##            [,1]        [,2]        [,3]        [,4]        [,5]        [,6]
##  [1,] 0.5010584 -0.03752827  0.13564906  0.10200653  0.00661866 -0.10905402
##  [2,] 0.2867743  0.34463886  0.24975314  0.12961783  0.53167942 -0.28217876
##  [3,] 0.3905015  0.24854728  0.20719239 -0.20988812 -0.03157362  0.11291776
##  [4,] 0.1316843 -0.41857218 -0.05859752  0.60913595  0.15614063  0.38162735
##  [5,] 0.2764510 -0.37218720 -0.27254464 -0.03686353  0.14210375 -0.57513296
##  [6,] 0.3555025  0.02591798 -0.38107453  0.09964642  0.07993039  0.45035470
##  [7,] 0.2406797 -0.28184171  0.30247121 -0.48821395 -0.28970563  0.27410672
##  [8,] 0.1834695 -0.18665306 -0.48476086 -0.49457994  0.29556288  0.06044586
##  [9,] 0.3183348  0.43674873 -0.03254542  0.07805234 -0.10765380  0.18024636
## [10,] 0.3061206 -0.30469450  0.22752557  0.19475514 -0.44400656 -0.24994785
## [11,] 0.0861796  0.32523824 -0.52350067  0.13936291 -0.53413119 -0.20674709
##              [,7]         [,8]        [,9]       [,10]        [,11]
##  [1,]  0.04374209  0.006792296  0.11185970 -0.18508041  0.811210603
##  [2,] -0.14537839  0.007252804  0.29034347 -0.33824243 -0.370950840
##  [3,]  0.37193300  0.131962267 -0.70550259 -0.07120336 -0.162640106
##  [4,] -0.30791519  0.098058275 -0.31623819 -0.21670726 -0.107522730
##  [5,] -0.04175042  0.414550768 -0.10494771  0.40179558 -0.111317074
##  [6,]  0.50502008  0.098916420  0.43699136  0.16737802 -0.157430047
##  [7,] -0.33568094  0.364797251  0.27142205 -0.15902722 -0.170336907
##  [8,] -0.19263901 -0.503971154 -0.13826954 -0.22490690  0.009368924
##  [9,] -0.54255118 -0.168849994 -0.04772674  0.57564539  0.012902563
## [10,]  0.15577866 -0.581436598  0.09082973  0.03419142 -0.303946738
## [11,] -0.13637156  0.193887091 -0.01248343 -0.44882159 -0.086554294

as we can see, after using the correlation matrix vs. the cov matrix, our variance of each comp is lower and more similar to one another.

2.3.1 investigate the loadings :

score <- scale_df |> as.matrix() %*% eigen(scale_cov)$vectors
cor(score, scale_df)[1:2, ] |> as.data.frame(row.names = c("PC1", "PC2"))

2.4 d)

scaled as scaling account for fake variability – ie. using standardized units (Z-score) and therefore using the the covariance of those (Correlation matrix) – we can utilize Z-score as a unitless measure to compare these in relative terms of how many sd away they are from their mean after centering at 0.

3 Q3) Real World Application : PCA

library(dplyr)
df <- read.csv("Downloads/farmers.csv")

3.1 a) Scatter Plots & Outliers

df <- df |> select(Family, DistRD, Cattle)

3.2 Family vs. DistRD

plot(df$Family, df$DistRD)

3 obvious outliers

Remove outliers

df <- df |> arrange(desc(Family)) |> slice(-1) |> arrange(desc(DistRD)) |> slice(-(1:2))
plot(df$Family, df$DistRD)

3.3 DistRD vs Cattle

plot(df$DistRD, df$Cattle)

there appears to be 3 groups and 1 outlier

df <- df |> arrange(desc(Cattle)) |> slice(-1)

plot(df$DistRD, df$Cattle)

3.4 b) Perform PCA

# standardize data -- center 0, sd 1
df <- scale(df) |> as.data.frame()

# verify 
lapply(df, sd)

## $Family
## [1] 1
## 
## $DistRD
## [1] 1
## 
## $Cattle
## [1] 1

print("-")

## [1] "-"

lapply(df, mean)

## $Family
## [1] 8.516779e-17
## 
## $DistRD
## [1] -2.037944e-16
## 
## $Cattle
## [1] 2.433366e-17

here we standardize so the squared distance dont dominate the PC direction and allow eachother to be compared in releative terms.

recall

\[ y=\Gamma(x-\mu) \]except we’ve already centered our data so \(\mu=0\)

S <- cov(df)
eigen(S)$values

## [1] 1.5550096 1.0068188 0.4381716

these are the variances, so these are the SD

sqrt(eigen(S)$values)

## [1] 1.2470003 1.0034036 0.6619453

eigen(S)$vectors

##            [,1]        [,2]       [,3]
## [1,] 0.70557081 -0.10127783  0.7013648
## [2,] 0.02986413  0.99310480  0.1133622
## [3,] 0.70800986  0.05903939 -0.7037303

these are our principle components as linear combinations of our original, centered data

Verify :

prcomp(df)

## Standard deviations (1, .., p=3):
## [1] 1.2470003 1.0034036 0.6619453
## 
## Rotation (n x k) = (3 x 3):
##                PC1         PC2        PC3
## Family -0.70557081 -0.10127783 -0.7013648
## DistRD -0.02986413  0.99310480 -0.1133622
## Cattle -0.70800986  0.05903939  0.7037303

as we can see, both match

3.4.1 Decide the number of PCS

prcomp(df) |> summary()

## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.2470 1.0034 0.6619
## Proportion of Variance 0.5183 0.3356 0.1461
## Cumulative Proportion  0.5183 0.8539 1.0000

we can see with just 2 PCs, we get “0.8539” of the variation, we would like to reduce the dimensions and keep a good amt of the variation therefore I would suggest keeping 2 PCs

3.5 c)

3.6 Engaging in my own curiosity

library(tidyverse)
library(plotly)

df <- 
  read.csv("Downloads/farmers.csv") |>
  select(Family, DistRD, Cattle) |> 
  arrange(desc(Family)) |> slice(-1) |> 
  arrange(desc(DistRD)) |> slice(-c(1:2)) |> 
  arrange(desc(Cattle)) |> slice(-1)



plot_ly(
  data = df,
  x = ~Family,
  y = ~DistRD,
  z = ~Cattle,
  type = "scatter3d",
  mode = "markers"
) |>
  layout(
    scene = list(
      xaxis = list(title = "Family"),
      yaxis = list(title = "DistRD"),
      zaxis = list(title = "Cattle")
    )
  )

okay here we dont standardize the data
we can see there are 2 clusters
most of the variation appears in the DistRD direction and Family direction and little in cattle
- Lets see if this obs remains consistent with loadings
- Before this lets investigate covariance matrix

cov(df)

##           Family     DistRD    Cattle
## Family 385.68113 -15.865202 69.557268
## DistRD -15.86520 575.979547  8.730784
## Cattle  69.55727   8.730784 40.796804

As said above, we see DistRD with the largest variance and Family with less but comparable amt of variation—and Cattle with relatively little. Therefore, we should expect the linear combination of the \(X_i \in X \in R^{3x1}\) to have the large variance directions dominate.

# center data 
xbar <- colMeans(df) |> as.matrix() 
Xbar <- xbar %*% rep(1, nrow(df)) |> as.matrix() |> t()
center <- df - Xbar

pc_comp <-  prcomp(center)$rotation
pc_comp

##                 PC1        PC2         PC3
## Family  0.080418438 0.97823884  0.19126327
## DistRD -0.996744351 0.07780729  0.02113583
## Cattle -0.005794211 0.19234030 -0.98131118

lambda <- prcomp(center)$sdev^2

notice our data is centered now

Lets verify properties of our PCs :

t(pc_comp[,1]) %*% pc_comp[,2]

##              [,1]
## [1,] 1.415412e-18

t(pc_comp[,1]) %*% pc_comp[,3]

##               [,1]
## [1,] -5.189965e-17

t(pc_comp[,2]) %*% pc_comp[,3]

##              [,1]
## [1,] 1.856125e-17

here we see each is about 0 for dot product indicating each PC is orthogonal as it should be as we are decomposing our symmetric centered-covariance matrix.

t(pc_comp) %*% pc_comp

##               PC1          PC2           PC3
## PC1  1.000000e+00 1.415412e-18 -5.189965e-17
## PC2  1.415412e-18 1.000000e+00  1.856125e-17
## PC3 -5.189965e-17 1.856125e-17  1.000000e+00

these values are re-see in the off diagnols and we see that as we should get with our orthonormal principle component matrix, \(\Gamma'=\Gamma^{-1}\) suggests, \(\Gamma'\Gamma=I\) as we see above^

colSums(pc_comp^2)

## PC1 PC2 PC3 
##   1   1   1

further, as we see each principle component is unit length as it should be

PC1 <- pc_comp[,1]
PC2 <- pc_comp[,2]
PC3 <- pc_comp[,3]

# scale by eigen values (most to least variance)
s1 <- lambda[1] * PC1
s2 <- lambda[2] * PC2
s3 <- lambda[3] * PC3

plot_ly(
  data = center,
  x = ~Family,
  y = ~DistRD,
  z = ~Cattle,
  type = "scatter3d",
  mode = "markers"
) |>

  add_trace(
    x = c(-s1[1], s1[1]),
    y = c(-s1[2], s1[2]),
    z = c(-s1[3], s1[3]),
    type = "scatter3d",
    mode = "lines",
    name = "PC1"
  ) |>

  add_trace(
  x = c(-s2[1], s2[1]),
  y = c(-s2[2], s2[2]),
  z = c(-s2[3], s2[3]),
  type = "scatter3d",
  mode = "lines",
  name = "PC2"
) |>

add_trace(
  x = c(-s3[1], s3[1]),
  y = c(-s3[2], s3[2]),
  z = c(-s3[3], s3[3]),
  type = "scatter3d",
  mode = "lines",
  name = "PC3"
) |>

  layout(
    scene = list(
      xaxis = list(title = "Family"),
      yaxis = list(title = "DistRD"),
      zaxis = list(title = "Cattle")
    )
  )

as we can see, our unscaled but centered data has most of its in the direction of greatest variation.

center_scale <- scale(center) |> as.data.frame() 
pc_comp <-  prcomp(center_scale)$rotation
pc_comp

##                PC1         PC2        PC3
## Family -0.70557081 -0.10127783 -0.7013648
## DistRD -0.02986413  0.99310480 -0.1133622
## Cattle -0.70800986  0.05903939  0.7037303

lambda <- prcomp(center_scale)$sdev^2


PC1 <- pc_comp[,1]
PC2 <- pc_comp[,2]
PC3 <- pc_comp[,3]

# scale by eigen values (most to least variance)
s1 <- lambda[1] * PC1
s2 <- lambda[2] * PC2
s3 <- lambda[3] * PC3

plot_ly(
  data = center_scale,
  x = ~Family,
  y = ~DistRD,
  z = ~Cattle,
  type = "scatter3d",
  mode = "markers"
) |>

  add_trace(
    x = c(-s1[1], s1[1]),
    y = c(-s1[2], s1[2]),
    z = c(-s1[3], s1[3]),
    type = "scatter3d",
    mode = "lines",
    name = "PC1"
  ) |>

  add_trace(
  x = c(-s2[1], s2[1]),
  y = c(-s2[2], s2[2]),
  z = c(-s2[3], s2[3]),
  type = "scatter3d",
  mode = "lines",
  name = "PC2"
) |>

add_trace(
  x = c(-s3[1], s3[1]),
  y = c(-s3[2], s3[2]),
  z = c(-s3[3], s3[3]),
  type = "scatter3d",
  mode = "lines",
  name = "PC3"
) |>

  layout(
    scene = list(
      xaxis = list(title = "Family"),
      yaxis = list(title = "DistRD"),
      zaxis = list(title = "Cattle")
    )
  )

as we can see after standardizing measures, shape remains consistent

4 Q4) Theory : PCA

4.1 a)

4.2 b)

4.3 c)

4.4 d)

4.5 e)

4.6 f)

5 Q5) Application : K-Means

itm <- c("A", "B", "C", "D")
x1 <- c(5, 1, -1, 3)
x2 <- c(4,-2,1,1)

df <- data.frame(itm, x1, x2)
plot(x1, x2)
text(x1, x2, labels = itm, pos = 4)

5.1 a)

using the stats package^ (colored by initial clustering)
this is what we should end up with using MacQueen (sequential) update rule
- ie MacQueen updates by each point vs. lloyd adjusts centroids after we consider cluster-membership of all points

# manually
df <- cbind(df, cluster=rep(c("C1","C2"), each=2))
df

plot(x1, x2,  col = c("red", "blue")[factor(df$cluster)])
text(x1, x2, labels = itm, pos = 2) 
points(c(3, 1),
       c(1, 1),
       pch = 8,
       cex = 2, 
       col=c("red", "blue")[factor(df$cluster)])

df$cluster <- c("C1", rep("C2", 3))

plot(df$x1, df$x2,
     col = c("red", "blue")[factor(df$cluster)],
     pch = 19)

text(df$x1, df$x2,
     labels = df$itm,
     pos = 2)

# centroids
points(c(5, 1),
       c(4, 0),
       pch = 8,
       cex = 2,
       col = c("red", "blue"))

5.2 b)

df$cluster <- rep(c("C1", "C2"), 2)
df

plot(df$x1, df$x2,
     col = c("red", "blue")[factor(df$cluster)],
     pch = 19)

text(df$x1, df$x2,
     labels = df$itm,
     pos = 2)

# centroids
points(c(2, 2),
       c(2.5, -0.5),
       pch = 8,
       cex = 2,
       col = c("red", "blue"))

groups = {(A, C), (B, D)}
the results are not the same – neither the centroids nor membership is the same.
here we see equivalent “tie” in distances, which i take as staying with the same membership. Meaning both pts {C, D} are equally distant from the centroids of {C1, C2}

5.3 c)

df$cluster <- rep(c("C1","C2"), each = 2)

plot(df$x1, df$x2,
     col = c("red", "blue")[factor(df$cluster)],
     pch = 19)

text(df$x1, df$x2,
     labels = df$itm,
     pos = 2)

# centroids
points(c(3, 1),
       c(1, 1),
       pch = 8,
       cex = 2,
       col = c("red", "blue"))

df$cluster <- c("C1", "C1", "C2", "C1")
plot(df$x1, df$x2,
     col = c("red", "blue")[factor(df$cluster)],
     pch = 19,
     xlab = "x1",
     ylab = "x2")

text(df$x1, df$x2,
     labels = df$itm,
     pos = 2)

# centroids
points(c(3, -1),
       c(1, 1),
       pch = 8,
       cex = 2,
       col = c("red", "blue"))

text(c(3, -1),
     c(1, 1),
     labels = c("m1", "m2"),
     pos = 4)

the solutions are not the same.
we notice different ending centroids and a switch between membership
notice that the groups switched
indicating that the initial conditions matter a lot for this algorithm to work

5.4 Engaging in my own curiosity

Notice that the previous results were more like MacQueen’s (change as you go) algorithm while, above its more like Lloyd where we first consider the distance of all pts and then change clusters. I believe this result is much, much better clustering.

X <- df[,-c(1,4)]
km <- kmeans(X, centers = 2, algorithm="Lloyd")
km

## K-means clustering with 2 clusters of sizes 2, 2
## 
## Cluster means:
##   x1   x2
## 1  4  2.5
## 2  0 -0.5
## 
## Clustering vector:
## [1] 1 2 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 6.5 6.5
##  (between_SS / total_SS =  65.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

df$cluster <- c("C1", "C2", "C2", "C1")

plot(df$x1, df$x2,
     col = c("red", "blue")[factor(df$cluster)],
     pch = 19,
     xlab = "x1",
     ylab = "x2")

text(df$x1, df$x2,
     labels = df$itm,
     pos = 2)

# centroids
points(c(4, 0),
       c(2.5, -0.5),
       pch = 8,
       cex = 2,
       col = c("red", "blue"))

text(c(4, 0),
     c(2.5, -0.5),
     labels = c("m1", "m2"),
     pos = 4)

as we can see these results cluster our points much better.

6 Q6) EM-Algorithm

6.1 a)

6.2 b)

x <- c(5, 9, 8, 4, 7) # observed number of heads
m <- 10               # number of flips per experiment
K <- 2                # two latent coins: A and B

# initial guesses for:
# pi    = mixing probabilities
# theta = coin biases
pi <- c(0.5, 0.5)
theta <- c(0.6, 0.5)

# keep track of parameter history
history <- data.frame()

for(iter in 1:100){

  # -------------------
  # E-step
  # -------------------
  # Compute q_ik:
  # probability observation i came from coin k

  q <- sapply(1:K, function(k){

    pi[k] *
      dbinom(
        x,
        size = m,
        prob = theta[k]
      )

  })

  # normalize rows so probabilities sum to 1
  q <- q / rowSums(q)

  # -------------------
  # M-step
  # -------------------
  # Update mixing probabilities

  pi_new <- colMeans(q)

  # Update coin biases
  theta_new <- sapply(1:K, function(k){

    sum(q[, k] * x) /
      sum(q[, k] * m)

  })

  # save iteration results
  history <- rbind(
    history,
    data.frame(
      iter = iter,
      pi1 = pi_new[1],
      pi2 = pi_new[2],
      theta1 = theta_new[1],
      theta2 = theta_new[2]
    )
  )

  # largest parameter change
  delta <-
    max(
      abs(
        c(pi_new, theta_new) -
          c(pi, theta)
      )
    )

  # stop when parameters stop changing
  if(delta < 1e-6){
    cat("Converged after", iter, "iterations\n")
    break
  }

  # replace old parameters with updated values
  pi <- pi_new
  theta <- theta_new
}

## Converged after 52 iterations

# final estimates
pi

## [1] 0.5227561 0.4772439

theta

## [1] 0.7933666 0.5139149

# final responsibilities
q

##           [,1]       [,2]
## [1,] 0.1176410 0.88235904
## [2,] 0.9586597 0.04134027
## [3,] 0.8646000 0.13539998
## [4,] 0.0354128 0.96458720
## [5,] 0.6374627 0.36253725

# parameter path
history

plot(
  history$iter,
  history$theta1,
  type = "b",
  ylim = range(history[, c("theta1", "theta2")]),
  ylab = expression(theta),
  xlab = "Iteration",
  col="red"
)

lines(
  history$iter,
  history$theta2,
  type = "b",
  lty = 2,
  col="blue"
)

as we can see from our intial guess to the 50+ iteration our prob for each coin changes steeply at first then stabilizes

plot(
  history$iter,
  history$pi1,
  type = "b",
  ylim = range(history[, c("pi1", "pi2")]),
  ylab = expression(pi),
  xlab = "Iteration",
  col = "red",
  pch = 19
)

lines(
  history$iter,
  history$pi2,
  type = "b",
  lty = 2,
  col = "blue",
  pch = 19
)

as we can see from our intial guess to the 50+ iteration our responsibility for observation i belonging to each coin changes steeply at first then stabilizes

7 Personal Exploration of Concepts

rm(list=ls())

7.1 Sampling Variability in Slope

here we see that the underlying true regression model is est. with a model (red) with some bias, our confidence ban includes true model

7.2 Samp. Var. in \(\hat{w}\) and SVD decomp. \(\Sigma_{X}\)

set.seed(123)
n <- 100
data <- rnorm(n, mean=10, sd = 2)
intercept <- rep(1, n)
signal <- data*4 + intercept
xbar <- mean(signal); xbar

## [1] 41.72325

sd_signal <- sd(signal); sd_signal

## [1] 7.302527

# simulated data -- w/ just signal
hist(signal,
     probability = TRUE,
     col = "lightgray")

# true underlyinh model
curve(
  dnorm(x, mean = 4*10 + 1, sd = 8),
  add = TRUE,
  col = "red",
  lwd = 3
)

abline(v= 4*10 + 1, col="red")
abline(v= 4*10 + 1 + 8 , col="blue")
abline(v= 4*10 + 1 - 8 , col="blue")

# choose variable noise is
noise_var <- 5
noise <- rnorm(n, mean=0, sd = noise_var)
real_data <- signal+noise

df <-
  data.frame(x=data, signal, noise, y=real_data)

df[,c("x", "y")] |> plot()

df[,c("x", "y")] |> var()

##           x        y
## x  3.332931 12.89451
## y 12.894515 73.20579

df[,c("x", "y")] |> cov() |> eigen(symmetric=T)

## eigen() decomposition
## $values
## [1] 75.509431  1.029294
## 
## $vectors
##           [,1]       [,2]
## [1,] 0.1758680 -0.9844138
## [2,] 0.9844138  0.1758680

df[,c("x", "y")] |> cor()

##           x         y
## x 1.0000000 0.8255038
## y 0.8255038 1.0000000

plot(df$x, df$signal)

lm(df$x ~ df$signal)

## 
## Call:
## lm(formula = df$x ~ df$signal)
## 
## Coefficients:
## (Intercept)    df$signal  
##       -0.25         0.25

mdl <- lm(df$x ~ df$y)
mdl

## 
## Call:
## lm(formula = df$x ~ df$y)
## 
## Coefficients:
## (Intercept)         df$y  
##      2.9264       0.1761

plot(x ~ y, data=df)
abline(mdl, col="red")

7.2.1 Center Data

xy <- df[,c("x", "y")] 
mns <- xy |> colMeans() |> as.matrix() |> t()
mns <- mns[rep(1,n),]
centered <- xy-mns 
eig <- eigen(cov(centered))
eig_val <- eig$values
eig_v <- eig$vectors
scale <- sqrt(eig_val)

# ellipse
theta <- seq(0, 2*pi, length.out = 200)

# unit circle
circle <- rbind(cos(theta), sin(theta))

# transform by covariance eigendecomposition -- scale by sd of eig
ellipse <- eig_v %*% diag(sqrt(eig_val)) %*% circle

plot(centered$x, centered$y, col="grey",xlab="x",ylab="y", asp = 1)
points(0, 0, pch = 16, cex = 1, col = "red")

arrows(0, 0,
       scale[1] * eig_v[1,1],
       scale[1] * eig_v[2,1],
       col = "red", lwd = 2, length = 0.1)

arrows(0, 0,
       scale[2] * eig_v[1,2],
       scale[2] * eig_v[2,2],
       col = "blue", lwd = 2, length = 0.1)

lines(ellipse[1, ], ellipse[2, ], col = "darkgreen", lwd = 2)

7.2.1.1 Arbitrary Function

7.2.1.1.1 `generate_linear_data()`

generate_linear_data <- function(
    n,
    sigma_x,
    mu_x,
    y = "4*x+1",
    sigma_n,
    b_int = 0,
    mu_n = 0
) {

  # generate x
  x <- rnorm(n, mean = mu_x, sd = sigma_x)

  # make x available inside eval()
  x_env <- list2env(list(x = x))

  # evaluate signal expression
  signal <- eval(parse(text = y), envir = x_env)

  # optional intercept adjustment
  signal <- signal + b_int

  # generate noise
  noise <- rnorm(n, mean = mu_n, sd = sigma_n)

  # observed response
  y_obs <- signal + noise

  data.frame(
    x = x,
    signal = signal,
    noise = noise,
    y = y_obs
  )
}

set.seed(123)

# test original example
df <- generate_linear_data(
  n = 100,
  sigma_x = 2,
  mu_x = 10,
  y = "4*x+1",
  sigma_n = 5
)

7.2.1.1.2 `plot_density_ellipse()`

plot_density_ellipse <- function(df) {

  xy <- df[, c("x", "y")]

  centered <- sweep(
    xy,
    2,
    colMeans(xy),
    "-"
  )

  eig <- eigen(cov(centered))

  eig_val <- eig$values
  eig_v   <- eig$vectors

  scale <- sqrt(eig_val)

  theta <- seq(
    0,
    2*pi,
    length.out = 200
  )

  circle <- rbind(
    cos(theta),
    sin(theta)
  )

  ellipse <-
    eig_v %*%
    diag(scale) %*%
    circle

  plot(
    centered$x,
    centered$y,
    col = "grey",
    xlab = "x",
    ylab = "y",
    asp = 1
  )

  points(
    0,
    0,
    pch = 16,
    col = "red"
  )

  arrows(
    0,0,
    scale[1]*eig_v[1,1],
    scale[1]*eig_v[2,1],
    col="red",
    lwd=2,
    length=.1
  )

  arrows(
    0,0,
    scale[2]*eig_v[1,2],
    scale[2]*eig_v[2,2],
    col="blue",
    lwd=2,
    length=.1
  )

  lines(
    ellipse[1,],
    ellipse[2,],
    col="darkgreen",
    lwd=2
  )

  invisible(
    list(
      centered = centered,
      eig_values = eig_val,
      eig_vectors = eig_v
    )
  )
}

plot_density_ellipse(df)

7.3 Further Investigating \(\Sigma^{-1}_x\)