rm(list = ls())
getwd()
## [1] "C:/Users/ASUS/Desktop/RMIT S2 2025/MATH2472, R, Python, Statistical Data Science/Week 4"
setwd("C:/Users/ASUS/Desktop/RMIT S2 2025/MATH2472, R, Python, Statistical Data Science/Week 4")
# Week 1
library(ggplot2)
library(patchwork) # serveral plots in the same frame
library(MASS) # for parallel coordinate plot
# Week2
library(GGally) # ggpairs()
library(dplyr) # mutate
library(tidyverse) # mutate part
library(plot3D)
# Week3
# Week4
library(broom) # glance()
library(pls)
library(MVN) # detect outlier
We consider the HDLSS Breast Tumour Gene Expression data of van’t Veer et.al. which we met in Lecture 3. We observed that one of the 78 observations appears to be an outlier. By restricting the number of variablesto smaller subsets, we want to work out which groups of variables show up this observation as an outlier. We treat the genes as variables as in the lecture slides
As indroduced in week3 notes, tumour data set: n = 78, d = 4751
tumour <- read.table("tumour.csv", header = FALSE, sep = "")
tumour <- t(as.matrix(tumour))
dim(tumour) # make sure is 78 4751
## [1] 78 4751
pca_raw <- prcomp(tumour, center = TRUE, scale. = FALSE)
PC_score_raw <- pca_raw$x
p1 <- ggplot(PC_score_raw, aes(x = PC1, y = PC2)) +
geom_point() +
labs(title = "PC1 vs PC2 scatter plot")
print(p1)
# The point at bottom left is outlier. which with very small PC1 and PC2
PC1_PC2 <- as.data.frame(PC_score_raw[,1:2])
the_outlier <- which.min(PC1_PC2$PC1 + PC1_PC2$PC2)
the_outlier
## [1] 54
The outlier is the 54th observation.
We consider the full data with all variables and two subset of the variables: all 4751 variables as in the analysis in the lecture, the first 2376 and the remaining 2375 variables . You may want to create separate datasets/frames for each subset of variables.
For each of the three groups of variables and all 78 observations, conduct a PCA. Using the outcome of the PCA, find the rank of the sample covariance matrix in each case, display the eigenvalue plots, the cumulative contribution to total variance plots and the PC score plots analogous to those shown on p22 and p23 respectively of the lecture slides.
dim(tumour)
## [1] 78 4751
tumour_1 <- as.data.frame(tumour[,1:2376 ]) # fist 2376
tumour_2 <- as.data.frame(tumour[,2377:4751 ]) # last 2375
# Note: pca_raw is pca for whole data set.
pca_raw_1 <- prcomp(tumour_1, center = TRUE, scale. = FALSE)
pca_raw_2 <- prcomp(tumour_2, center = TRUE, scale. = FALSE)
# Find the eigenvalues vector "Lambda"
lambda_raw <- diag(pca_raw$sdev^2)
lambda_raw_1 <- diag(pca_raw_1$sdev^2)
lambda_raw_2 <- diag(pca_raw_2$sdev^2)
# Define eigenvector matrix
Gamma_raw <- pca_raw$rotation
Gamma_raw_1 <- pca_raw_2$rotation
Gamma_raw_2 <- pca_raw_2$rotation
# Sample Covariance matrix sigma
Sigma_raw <- Gamma_raw %*% lambda_raw %*% t(Gamma_raw)
Sigma_raw_1 <- Gamma_raw_1 %*% lambda_raw_1 %*% t(Gamma_raw_1)
Sigma_raw_2 <- Gamma_raw_2 %*% lambda_raw_2 %*% t(Gamma_raw_2)
#Calculate rank, the operation here took more than 5 mins, Not sure if any other method can be applied ?
#rank_of_raw <- qr(Sigma_raw) $ rank
#rank_of_raw_1 <- qr(Sigma_raw_1) $ rank
#rank_of_raw_2 <- qr(Sigma_raw_2) $ rank
#rank_of_raw #77
#rank_of_raw_1 #77
#rank_of_raw_2 #77
#Eigenvalue plot
Eigen_raw <- pca_raw$sdev^2
Eigen_raw_1 <- pca_raw_1$sdev^2
Eigen_raw_2 <- pca_raw_2$sdev^2
index <- seq_along(Eigen_raw)
plot(index, Eigen_raw,
main = "Eigenvalue plot of raw data",
type = "b", pch = 16)
plot(index, Eigen_raw_1,
main = "Eigenvalue plot of 1st part raw data",
type = "b", pch = 16)
plot(index, Eigen_raw_2,
main = "Eigenvalue plot of 2nd part raw data",
type = "b", pch = 16)
# Cumulative variance
cum_var_raw <- cumsum(Eigen_raw/sum(Eigen_raw))
plot(cum_var_raw, type = "b", pch = 16,
main = "Cumulative contribution to total variance (raw)")
cum_var_raw_1 <- cumsum(Eigen_raw_1/sum(Eigen_raw_1))
plot(cum_var_raw, type = "b", pch = 16,
main = "Cumulative contribution to total variance (raw_1)")
cum_var_raw_2 <- cumsum(Eigen_raw_2/sum(Eigen_raw_2))
plot(cum_var_raw, type = "b", pch = 16,
main = "Cumulative contribution to total variance (raw_2)")
# PC score plot
score_raw <- as.data.frame(PC_score_raw[,1:4])
score_raw$obs <- 1:nrow(score_raw)
outlier_id <- 54
score_raw$IsOutlier <- ifelse(score_raw$obs == outlier_id, "Outlier", "Normal")
PC_score_plot_raw_1 <- ggplot(score_raw, aes(x = PC1, y = PC2, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_raw_2 <- ggplot(score_raw, aes(x = PC1, y = PC3, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_raw_3 <- ggplot(score_raw, aes(x = PC1, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_raw_4 <- ggplot(score_raw, aes(x = PC2, y = PC3, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_raw_5 <- ggplot(score_raw, aes(x = PC2, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_raw_6 <- ggplot(score_raw, aes(x = PC3, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
print((PC_score_plot_raw_1|PC_score_plot_raw_2|PC_score_plot_raw_3)/
(PC_score_plot_raw_4|PC_score_plot_raw_5|PC_score_plot_raw_6))
Compare the results of the different groups with the results shown in the lecture slides and comment. What happened to the outlier? Did it move closer to the centre of the PC1/PC2 data?
Yes the outlier move closer tot the center of the PC1/PC2 data.
What do you expect to change if you leave out the outlying observations from the calculations of part(b)
The leading eigenvalues will decrease, the total variance will be more evenly distributed across several PCs.
Work with the full data of Q1 (we do not use the subsets of variables here). Scale the data and repeat part (b) of Q1 on the scaled data. Compare the results with those of Q1, with particular reference to the outlier.You will need to use scale. = TRUE in prcomp
pca_scale <- prcomp(tumour, center = TRUE, scale. = TRUE)
lambda_scaled <- diag(pca_scale$sdev^2)
eigenvalue <- pca_scale$sdev^2
Gamma_scaled <- pca_scale$rotation
Sigma_scaled <- Gamma_scaled %*% lambda_scaled%*% t(Gamma_scaled)
index <- seq_along(eigenvalue)
plot(index, eigenvalue,
main = "Eigenvalue plot of scaled data",
type = "b", pch = 16)
cum_var_scaled <- cumsum(eigenvalue/sum(eigenvalue))
plot(cum_var_scaled, type = "b", pch = 16,
main = "Cumulative contribution to total variance (scaled)")
PC_score_scaled<- pca_scale$x
score_scaled <- as.data.frame(PC_score_scaled[,1:4])
score_scaled$obs <- 1:nrow(score_scaled)
score_scaled$IsOutlier <- ifelse(score_scaled$obs == outlier_id, "Outlier", "Normal")
PC_score_plot_scaled_1 <- ggplot(score_scaled, aes(x = PC1, y = PC2, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_scaled_2 <- ggplot(score_scaled, aes(x = PC1, y = PC3, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_scaled_3 <- ggplot(score_scaled, aes(x = PC1, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_scaled_4 <- ggplot(score_scaled, aes(x = PC2, y = PC3, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_scaled_5 <- ggplot(score_scaled, aes(x = PC2, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
PC_score_plot_scaled_6 <- ggplot(score_scaled, aes(x = PC3, y = PC4, colour = IsOutlier)) +
scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
geom_point(size = 2, alpha = 0.8)
print(
(PC_score_plot_scaled_1|PC_score_plot_scaled_2|PC_score_plot_scaled_3)/(PC_score_plot_scaled_4|PC_score_plot_scaled_5|PC_score_plot_scaled_6))
The outlier is more sloser to the main cluster in each plot
For the abalone data we want to compare the performance of linear regression and PCR. We do this by calculating the residual standard deviation, and we study how this depends on the number of predictors for linear regression and the number of PC scores for PCR. Regard the variable Rings as the response, and all other variables apart from Sex as predictors. For \(p = 1\) to 7 calculate the residual standard deviation in the following cases:
coln = c(
"Sex", # nominal M, F, and I (infant)
"Length", # continuous mm Longest shell measurement
"Diameter", # continuous mm perpendicular to length
"Height", # continuous mm with meat in shell
"Whole_weight", # continuous grams whole abalone
"Shucked_weight", # continuous grams weight of meat
"Viscera_weight", # continuous grams gut weight (after bleeding)
"Shell_weight", # continuous grams after being dried
"Rings" # integer +1.5 gives the age in years
)
abalone = read_csv( file = "abalone.csv", col_names = coln )
## Rows: 4177 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sex
## dbl (8): Length, Diameter, Height, Whole_weight, Shucked_weight, Viscera_wei...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary( abalone )
## Sex Length Diameter Height
## Length:4177 Min. :0.075 Min. :0.0550 Min. :0.0000
## Class :character 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## Mode :character Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## Whole_weight Shucked_weight Viscera_weight Shell_weight
## Min. :0.0020 Min. :0.0010 Min. :0.0005 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.0935 1st Qu.:0.1300
## Median :0.7995 Median :0.3360 Median :0.1710 Median :0.2340
## Mean :0.8287 Mean :0.3594 Mean :0.1806 Mean :0.2388
## 3rd Qu.:1.1530 3rd Qu.:0.5020 3rd Qu.:0.2530 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.934
## 3rd Qu.:11.000
## Max. :29.000
summary( factor( abalone$Sex ) )
## F I M
## 1307 1342 1528
d = 9-1 n = 4177
Note: add1() is for forward selection. where fm.fwd is the null model, big.lm is the biggest possible model.
# Linear Regression
big.lm = lm( Rings ~ Length + Diameter + Height + Whole_weight + Shucked_weight + Viscera_weight + Shell_weight, data = abalone )
# The biggest model.
summary( big.lm )
##
## Call:
## lm(formula = Rings ~ Length + Diameter + Height + Whole_weight +
## Shucked_weight + Viscera_weight + Shell_weight, data = abalone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1632 -1.3613 -0.3885 0.9054 13.7440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9852 0.2691 11.092 < 2e-16 ***
## Length -1.5719 1.8248 -0.861 0.389
## Diameter 13.3609 2.2371 5.972 2.53e-09 ***
## Height 11.8261 1.5481 7.639 2.70e-14 ***
## Whole_weight 9.2474 0.7326 12.622 < 2e-16 ***
## Shucked_weight -20.2139 0.8233 -24.552 < 2e-16 ***
## Viscera_weight -9.8297 1.3040 -7.538 5.82e-14 ***
## Shell_weight 8.5762 1.1367 7.545 5.54e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.218 on 4169 degrees of freedom
## Multiple R-squared: 0.5276, Adjusted R-squared: 0.5268
## F-statistic: 665.2 on 7 and 4169 DF, p-value: < 2.2e-16
# Forward selection
fm.fwd <- fm.null <- lm(Rings ~ 1, abalone) # the simplest model
#apply add1 for forward selection
summary( lm( fm.fwd ) )
##
## Call:
## lm(formula = fm.fwd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9337 -1.9337 -0.9337 1.0663 19.0663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.93368 0.04989 199.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.224 on 4176 degrees of freedom
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ 1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 43411 9780.8
## Length 1 13454.5 29956 8233.3 1875.17 < 2.2e-16 ***
## Diameter 1 14335.7 29075 8108.6 2058.52 < 2.2e-16 ***
## Height 1 13490.7 29920 8228.2 1882.48 < 2.2e-16 ***
## Whole_weight 1 12676.8 30734 8340.3 1722.07 < 2.2e-16 ***
## Shucked_weight 1 7689.9 35721 8968.4 898.79 < 2.2e-16 ***
## Viscera_weight 1 11019.1 32392 8559.8 1420.27 < 2.2e-16 ***
## Shell_weight 1 17097.2 26313 7691.7 2712.72 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Want: * Large Sum of sqaure * Small RSS * Small AIC * Large F
Since all p-values are very small, look at F value, add the largest one.
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight)
summary( lm( fm.fwd ) )
##
## Call:
## lm(formula = fm.fwd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9830 -1.6005 -0.5843 0.9390 15.6334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.46212 0.07715 83.76 <2e-16 ***
## Shell_weight 14.53568 0.27908 52.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.51 on 4175 degrees of freedom
## Multiple R-squared: 0.3938, Adjusted R-squared: 0.3937
## F-statistic: 2713 on 1 and 4175 DF, p-value: < 2.2e-16
glancerows = data.frame()
fm.fwd <- fm.null <- lm(Rings ~ 1, abalone)
summary( lm( fm.fwd ) )
##
## Call:
## lm(formula = fm.fwd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9337 -1.9337 -0.9337 1.0663 19.0663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.93368 0.04989 199.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.224 on 4176 degrees of freedom
row1 = data.frame(modelno = 0, sigma = glance( lm( fm.fwd ) )$sigma )
# note: glance()$sigma is the residual std error.
row1 # This output is the residual std error.
## modelno sigma
## 1 0 3.224169
glancerows = rbind( glancerows, row1 )
The res.std.error for model 0 os 3.2242
Use add1() to decide the term to add next.
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ 1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 43411 9780.8
## Length 1 13454.5 29956 8233.3 1875.17 < 2.2e-16 ***
## Diameter 1 14335.7 29075 8108.6 2058.52 < 2.2e-16 ***
## Height 1 13490.7 29920 8228.2 1882.48 < 2.2e-16 ***
## Whole_weight 1 12676.8 30734 8340.3 1722.07 < 2.2e-16 ***
## Shucked_weight 1 7689.9 35721 8968.4 898.79 < 2.2e-16 ***
## Viscera_weight 1 11019.1 32392 8559.8 1420.27 < 2.2e-16 ***
## Shell_weight 1 17097.2 26313 7691.7 2712.72 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight)
row1 = data.frame(modelno = 1, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 1 2.5105
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 26313 7691.7
## Length 1 9.9 26304 7692.1 1.5726 0.2099
## Diameter 1 10.2 26303 7692.1 1.6127 0.2042
## Height 1 259.3 26054 7652.3 41.5374 1.288e-10 ***
## Whole_weight 1 1740.8 24573 7407.8 295.7039 < 2.2e-16 ***
## Shucked_weight 1 3476.1 22837 7101.9 635.3248 < 2.2e-16 ***
## Viscera_weight 1 1067.0 25246 7520.8 176.4105 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After added shell_weight, the res.std.error (model1) is 2.5105 And add the shucked weight to next model.
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight + Shucked_weight)
row1 = data.frame(modelno = 2, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 2 2.339087
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight + Shucked_weight
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 22837 7101.9
## Length 1 978.17 21859 6921.0 186.74 < 2.2e-16 ***
## Diameter 1 1233.62 21604 6871.9 238.29 < 2.2e-16 ***
## Height 1 802.26 22035 6954.5 151.93 < 2.2e-16 ***
## Whole_weight 1 803.77 22034 6954.2 152.23 < 2.2e-16 ***
## Viscera_weight 1 73.91 22763 7090.4 13.55 0.0002353 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The residual std.error is now 2.3390 (model2) and diameter now has the largest F value
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight +
Diameter )
row1 = data.frame(modelno = 3, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 3 2.275306
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 21604 6871.9
## Length 1 2.58 21601 6873.4 0.499 0.4800
## Height 1 308.62 21295 6813.8 60.463 9.382e-15 ***
## Whole_weight 1 549.30 21054 6766.4 108.846 < 2.2e-16 ***
## Viscera_weight 1 3.93 21600 6873.2 0.760 0.3834
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model3 with 2.2753, Whole_ _weight next.
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight +
Diameter + Whole_weight )
row1 = data.frame(modelno = 4, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 4 2.246463
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 21054 6766.4
## Length 1 8.564 21046 6766.7 1.6973 0.1927
## Height 1 257.206 20797 6717.0 51.5841 8.084e-13 ***
## Viscera_weight 1 259.383 20795 6716.6 52.0262 6.472e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 4 with 2.2465, Viscera_weight next.
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight +
Diameter + Whole_weight + Viscera_weight )
row1 = data.frame(modelno = 5, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 5 2.23285
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight +
## Viscera_weight
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 20795 6716.6
## Length 1 2.107 20793 6718.2 0.4225 0.5157
## Height 1 285.478 20510 6660.8 58.0434 3.154e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model 5 2.2329, Height next
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight +
Diameter + Whole_weight + Viscera_weight
+ Height )
row1 = data.frame(modelno = 6, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 6 2.217736
glancerows = rbind( glancerows, row1 )
add1(fm.fwd, big.lm, test="F")
## Single term additions
##
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight +
## Viscera_weight + Height
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 20510 6660.8
## Length 1 3.6499 20506 6662.1 0.7421 0.3891
Model 6 2.2177, Length the last. But at this stage, the p-value of Length is larger than 0.05, Lets see the output of add the last variable.
fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight +
Diameter + Whole_weight + Viscera_weight
+ Height + Length )
row1 = data.frame(modelno = 7, sigma = glance( lm( fm.fwd ) )$sigma )
row1
## modelno sigma
## 1 7 2.217805
glancerows = rbind( glancerows, row1 )
# add1(fm.fwd, big.lm, test="F")
Model 7 2.2178, which is worse than model 6.
abalone_new <- abalone[,2:8]
y <- abalone$Rings
PCA_abalone_raw <- prcomp(abalone_new, center = TRUE, scale. = FALSE)
summary(PCA_abalone_raw)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 0.5815 0.06296 0.05392 0.03248 0.02213 0.02066 0.01217
## Proportion of Variance 0.9741 0.01142 0.00838 0.00304 0.00141 0.00123 0.00043
## Cumulative Proportion 0.9741 0.98552 0.99389 0.99693 0.99834 0.99957 1.00000
glancerows_pcr <- data.frame()
glancerows_pcr <- bind_rows(glancerows_pcr, glancerows[1,]) # Add model 0 into.
glancerows_pcr
## modelno sigma
## 1 0 3.224169
for (p in 1:7) {
pc_data <- as.data.frame(PCA_abalone_raw$x[,1:p])
colnames(pc_data) <- paste0("PC",1:p)
fm.pcr <- lm(y ~ ., data = pc_data)
row <- data.frame(modelno = p,
sigma = glance(fm.pcr)$sigma)
glancerows_pcr <- rbind(glancerows_pcr, row)
}
glancerows_pcr # The residual std error based on PC.
## modelno sigma
## 1 0 3.224169
## 2 1 2.723087
## 3 2 2.267147
## 4 3 2.248847
## 5 4 2.245327
## 6 5 2.243432
## 7 6 2.220831
## 8 7 2.217805
A <- transform(glancerows, Method = "Forward selection")
B <- transform(glancerows_pcr, Method = "PCR")
both <- rbind(A, B)
ggplot(both, aes(x = modelno, y = sigma, color = Method)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_x_continuous(breaks = 0:7) +
labs(title = "Residual standard error vs number of predictors / PCs",
x = "p (number of predictors / PCs)",
y = "Residual standard error (σ)") +
theme_minimal(base_size = 13)
Both approach reach their minimum residual standar error aroud p = 5-6. Forward selection may achieve marginally lower σ, but PCR provides a more stable model since its predictors (PC scores) are uncorrelated and unaffected by multicollinearity.
Bonus question (no calculations needed): why would a comparison of PCR and linear regression based on best subset selection not work so well? Hints: We describe some of the regression commands and how we may use glance to gather and simplify the information we obtain
The comparison doesn’t work well because PCR and best-subset regression are built on different selection bases (unsupervised vs supervised) and operate in different predictor spaces, so their “best” models aren’t directly comparable.