library(faraway)
data(punting)
m1 <- lm(Distance~RStr+LStr+RFlex+LFlex, data=punting)
print("Coef")
## [1] "Coef"
m1$coefficients
## (Intercept) RStr LStr RFlex LFlex
## -79.6236456 0.5116373 -0.1861996 2.3745010 -0.5277338
print("--")
## [1] "--"
print("Covariance Matrix")
## [1] "Covariance Matrix"
summary(m1)$cov.unscaled
## (Intercept) RStr LStr RFlex
## (Intercept) 16.1396579363 0.0002123785 0.0378939380 -0.2123867649
## RStr 0.0002123785 0.0008844524 -0.0006380244 -0.0007371097
## LStr 0.0378939380 -0.0006380244 0.0009870667 -0.0010113169
## RFlex -0.2123867649 -0.0007371097 -0.0010113169 0.0077499755
## LFlex -0.0133860443 0.0003449921 0.0001219639 -0.0030130930
## LFlex
## (Intercept) -0.0133860443
## RStr 0.0003449921
## LStr 0.0001219639
## RFlex -0.0030130930
## LFlex 0.0025563651
confint(m1, level=.8)
## 10 % 90 %
## (Intercept) -171.2456955 11.9984043
## RStr -0.1666131 1.1898877
## LStr -0.9027160 0.5303167
## RFlex 0.3667817 4.3822204
## LFlex -1.6808266 0.6253591
Intercept seems significantly different
from 0 and RFlex is the only other which doesnt contain
0.citation : Sec. 4.4.2, p. 116, Introduction to Regression Modeling (B. Abraham and J. Ledolter)
paste0("F-Stat = ",summary(m1)$fstatistic[1]|>round(2))
## [1] "F-Stat = 5.59"
Residual standard error: 16.33 on 8 degrees of freedom
Multiple R-squared: 0.7365, Adjusted R-squared: 0.6047
F-statistic: 5.59 on 4 and 8 DF, p-value: 0.01902
p-value: 0.01902)b <- coef(m1)[c("RFlex", "LFlex")]
b
## RFlex LFlex
## 2.3745010 -0.5277338
S <- vcov(m1)[c("RFlex", "LFlex"),
c("RFlex", "LFlex")]
S
## RFlex LFlex
## RFlex 2.0659892 -0.8032306
## LFlex -0.8032306 0.6814760
n <- nrow(punting)
p <- length(coef(m1))
cval <- 2 * qf(0.80, df1 = 2, df2 = n - p)
theta <- seq(0, 2*pi, length.out = 200)
eig <- eigen(S)
A <- eig$vectors %*%
diag(sqrt(eig$values * cval))
circle <- rbind(cos(theta), sin(theta))
ellipse <- t(b + A %*% circle)
ellipse <- as.data.frame(ellipse)
names(ellipse) <- c("RFlex", "LFlex")
library(ggplot2)
library(grid)
# semi-axis lengths
axis_lengths <- sqrt(eig$values * cval)
ggplot(ellipse, aes(RFlex, LFlex)) +
# ellipse
geom_path(color = "blue", linewidth = 1) +
# center point
annotate("point",
x = b[1],
y = b[2],
color = "red",
size = 3) +
# first eigenvector
annotate("segment",
x = b[1],
y = b[2],
xend = b[1] + axis_lengths[1] * eig$vectors[1,1],
yend = b[2] + axis_lengths[1] * eig$vectors[2,1],
color = "darkgreen",
linewidth = 1.2,
arrow = arrow(length = unit(0.2, "cm"))) +
# second eigenvector
annotate("segment",
x = b[1],
y = b[2],
xend = b[1] + axis_lengths[2] * eig$vectors[1,2],
yend = b[2] + axis_lengths[2] * eig$vectors[2,2],
color = "purple",
linewidth = 1.2,
arrow = arrow(length = unit(0.2, "cm"))) +
labs(
title = "80% Confidence Region for RFlex and LFlex",
subtitle = "Eigenvector directions shown",
x = "RFlex coefficient",
y = "LFlex coefficient"
) +
theme_minimal()
cancer <- read.table("Downloads/cancer.txt", header=T)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
s <- cancer |> select(where(is.numeric)) |> apply(2, sd)
s
## All.cancers Lung Colon Melanoma F.breast Pancreas
## 47.857634 14.724588 7.262243 5.245611 7.888396 1.482258
## Leukemia Ovarian Cervix Prostate Liver
## 2.255171 1.358915 1.421959 16.858436 2.241009
numeric_cancer <- cancer |>
select(where(is.numeric))
for (col in names(numeric_cancer)) {
hist(
numeric_cancer[[col]],
main = paste("Histogram of", col),
xlab = col
)
}
pca <-
prcomp(numeric_cancer,
center = TRUE,
scale. = TRUE)
X <- as.matrix(numeric_cancer)
col_means <- colMeans(X)
centered <- X - matrix(
col_means,
nrow = nrow(X),
ncol = ncol(X),
byrow = TRUE
)
s <- apply(X, 2, sd)
centered_scaled <- centered / matrix(
s,
nrow = nrow(centered),
ncol = ncol(centered),
byrow = TRUE
)
Sigma_centered_scaled <- cov(centered_scaled)
eig <- eigen(Sigma_centered_scaled)
X <- as.matrix(numeric_cancer)
col_means <- colMeans(X)
centered <- X - matrix(
col_means,
nrow = nrow(X),
ncol = ncol(X),
byrow = TRUE
)
s <- apply(X, 2, sd)
centered_scaled <- centered / matrix(
s,
nrow = nrow(centered),
ncol = ncol(centered),
byrow = TRUE
)
# covariance matrix of centered, scaled data
Sigma_centered_scaled <- cov(centered_scaled)
eig <- eigen(Sigma_centered_scaled)
Verify results :
eig$vectors[,1]
## [1] -0.5010584 -0.2867743 -0.3905015 -0.1316843 -0.2764510 -0.3555025
## [7] -0.2406797 -0.1834695 -0.3183348 -0.3061206 -0.0861796
print("-")
## [1] "-"
pca$rotation[,1]
## All.cancers Lung Colon Melanoma F.breast Pancreas
## -0.5010584 -0.2867743 -0.3905015 -0.1316843 -0.2764510 -0.3555025
## Leukemia Ovarian Cervix Prostate Liver
## -0.2406797 -0.1834695 -0.3183348 -0.3061206 -0.0861796
scores <- centered_scaled %*% eig$vectors
var_explained <- eig$values / sum(eig$values)
plot(
scores[, 1],
scores[, 2],
xlab = paste0("PC1 (", round(100 * var_explained[1], 1), "%)"),
ylab = paste0("PC2 (", round(100 * var_explained[2], 1), "%)"),
main = "PCA: First Two Principal Components",
pch = 19
)
Verify plot
plot(pca$x[, 1], pca$x[, 2])