1 Q1) Real World Application

library(faraway)
data(punting)

1.1 a)

m1 <- lm(Distance~RStr+LStr+RFlex+LFlex, data=punting)
print("Coef")
## [1] "Coef"
m1$coefficients
## (Intercept)        RStr        LStr       RFlex       LFlex 
## -79.6236456   0.5116373  -0.1861996   2.3745010  -0.5277338
print("--")
## [1] "--"
print("Covariance Matrix")
## [1] "Covariance Matrix"
summary(m1)$cov.unscaled
##               (Intercept)          RStr          LStr         RFlex
## (Intercept) 16.1396579363  0.0002123785  0.0378939380 -0.2123867649
## RStr         0.0002123785  0.0008844524 -0.0006380244 -0.0007371097
## LStr         0.0378939380 -0.0006380244  0.0009870667 -0.0010113169
## RFlex       -0.2123867649 -0.0007371097 -0.0010113169  0.0077499755
## LFlex       -0.0133860443  0.0003449921  0.0001219639 -0.0030130930
##                     LFlex
## (Intercept) -0.0133860443
## RStr         0.0003449921
## LStr         0.0001219639
## RFlex       -0.0030130930
## LFlex        0.0025563651

1.2 b)

confint(m1, level=.8)
##                     10 %       90 %
## (Intercept) -171.2456955 11.9984043
## RStr          -0.1666131  1.1898877
## LStr          -0.9027160  0.5303167
## RFlex          0.3667817  4.3822204
## LFlex         -1.6808266  0.6253591
  • the Intercept seems significantly different from 0 and RFlex is the only other which doesnt contain 0.

1.3 c) *

1.4 d)

citation : Sec. 4.4.2, p. 116, Introduction to Regression Modeling (B. Abraham and J. Ledolter)

paste0("F-Stat = ",summary(m1)$fstatistic[1]|>round(2))
## [1] "F-Stat = 5.59"
Residual standard error: 16.33 on 8 degrees of freedom
Multiple R-squared:  0.7365,    Adjusted R-squared:  0.6047 
F-statistic:  5.59 on 4 and 8 DF,  p-value: 0.01902
  • as we can see, atleast one of the coef is NOT 0 (p-value: 0.01902)

1.5 e)

b <- coef(m1)[c("RFlex", "LFlex")]
b
##      RFlex      LFlex 
##  2.3745010 -0.5277338
S <- vcov(m1)[c("RFlex", "LFlex"),
              c("RFlex", "LFlex")]
S
##            RFlex      LFlex
## RFlex  2.0659892 -0.8032306
## LFlex -0.8032306  0.6814760
n <- nrow(punting)
p <- length(coef(m1))

cval <- 2 * qf(0.80, df1 = 2, df2 = n - p)
theta <- seq(0, 2*pi, length.out = 200)

eig <- eigen(S)

A <- eig$vectors %*%
     diag(sqrt(eig$values * cval))

circle <- rbind(cos(theta), sin(theta))

ellipse <- t(b + A %*% circle)

ellipse <- as.data.frame(ellipse)
names(ellipse) <- c("RFlex", "LFlex")
library(ggplot2)
library(grid)

# semi-axis lengths
axis_lengths <- sqrt(eig$values * cval)

ggplot(ellipse, aes(RFlex, LFlex)) +

  # ellipse
  geom_path(color = "blue", linewidth = 1) +

  # center point
  annotate("point",
           x = b[1],
           y = b[2],
           color = "red",
           size = 3) +

  # first eigenvector
  annotate("segment",
           x = b[1],
           y = b[2],
           xend = b[1] + axis_lengths[1] * eig$vectors[1,1],
           yend = b[2] + axis_lengths[1] * eig$vectors[2,1],
           color = "darkgreen",
           linewidth = 1.2,
           arrow = arrow(length = unit(0.2, "cm"))) +

  # second eigenvector
  annotate("segment",
           x = b[1],
           y = b[2],
           xend = b[1] + axis_lengths[2] * eig$vectors[1,2],
           yend = b[2] + axis_lengths[2] * eig$vectors[2,2],
           color = "purple",
           linewidth = 1.2,
           arrow = arrow(length = unit(0.2, "cm"))) +

  labs(
    title = "80% Confidence Region for RFlex and LFlex",
    subtitle = "Eigenvector directions shown",
    x = "RFlex coefficient",
    y = "LFlex coefficient"
  ) +

  theme_minimal()

2 Q2) Real World Application

cancer <- read.table("Downloads/cancer.txt", header=T)

2.1 a)

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
s <- cancer |> select(where(is.numeric)) |> apply(2, sd)
s
## All.cancers        Lung       Colon    Melanoma    F.breast    Pancreas 
##   47.857634   14.724588    7.262243    5.245611    7.888396    1.482258 
##    Leukemia     Ovarian      Cervix    Prostate       Liver 
##    2.255171    1.358915    1.421959   16.858436    2.241009
numeric_cancer <- cancer |>
  select(where(is.numeric))

for (col in names(numeric_cancer)) {
  hist(
    numeric_cancer[[col]],
    main = paste("Histogram of", col),
    xlab = col
  )
}

  • also included hist of each

2.2 b)

pca <- 
  prcomp(numeric_cancer,
              center = TRUE,
              scale. = TRUE)

2.2.1 Manually

X <- as.matrix(numeric_cancer)

col_means <- colMeans(X)

centered <- X - matrix(
  col_means,
  nrow = nrow(X),
  ncol = ncol(X),
  byrow = TRUE
)

s <- apply(X, 2, sd)

centered_scaled <- centered / matrix(
  s,
  nrow = nrow(centered),
  ncol = ncol(centered),
  byrow = TRUE
)

Sigma_centered_scaled <- cov(centered_scaled)

eig <- eigen(Sigma_centered_scaled)
X <- as.matrix(numeric_cancer)

col_means <- colMeans(X)

centered <- X - matrix(
  col_means,
  nrow = nrow(X),
  ncol = ncol(X),
  byrow = TRUE
)

s <- apply(X, 2, sd)

centered_scaled <- centered / matrix(
  s,
  nrow = nrow(centered),
  ncol = ncol(centered),
  byrow = TRUE
)

# covariance matrix of centered, scaled data 
Sigma_centered_scaled <- cov(centered_scaled)

eig <- eigen(Sigma_centered_scaled)

Verify results :

eig$vectors[,1]
##  [1] -0.5010584 -0.2867743 -0.3905015 -0.1316843 -0.2764510 -0.3555025
##  [7] -0.2406797 -0.1834695 -0.3183348 -0.3061206 -0.0861796
print("-")
## [1] "-"
pca$rotation[,1]
## All.cancers        Lung       Colon    Melanoma    F.breast    Pancreas 
##  -0.5010584  -0.2867743  -0.3905015  -0.1316843  -0.2764510  -0.3555025 
##    Leukemia     Ovarian      Cervix    Prostate       Liver 
##  -0.2406797  -0.1834695  -0.3183348  -0.3061206  -0.0861796
scores <- centered_scaled %*% eig$vectors
var_explained <- eig$values / sum(eig$values)
plot(
  scores[, 1],
  scores[, 2],
  xlab = paste0("PC1 (", round(100 * var_explained[1], 1), "%)"),
  ylab = paste0("PC2 (", round(100 * var_explained[2], 1), "%)"),
  main = "PCA: First Two Principal Components",
  pch = 19
)

Verify plot

plot(pca$x[, 1], pca$x[, 2])