Set-up

rm(list = ls())
getwd()

## [1] "C:/Users/ASUS/Desktop/RMIT S2 2025/MATH2472, R, Python, Statistical Data Science/Week 4"

setwd("C:/Users/ASUS/Desktop/RMIT S2 2025/MATH2472, R, Python, Statistical Data Science/Week 4")

Package

# Week 1
library(ggplot2)
library(patchwork)   # serveral plots in the same frame
library(MASS)        # for parallel coordinate plot

# Week2

library(GGally)     # ggpairs()
library(dplyr)      # mutate
library(tidyverse)  # mutate part
library(plot3D)

# Week3
# Week4 
library(broom) # glance()
library(pls)
library(MVN)   # detect outlier

Introduction

We consider the HDLSS Breast Tumour Gene Expression data of van’t Veer et.al. which we met in Lecture 3. We observed that one of the 78 observations appears to be an outlier. By restricting the number of variablesto smaller subsets, we want to work out which groups of variables show up this observation as an outlier. We treat the genes as variables as in the lecture slides

(a) Read the logged data into R/Python. Which observation number corresponds to the outlier? What did you compute to find it? (There is no unique or correct way of doing it!)

As indroduced in week3 notes, tumour data set: n = 78, d = 4751

tumour <- read.table("tumour.csv", header = FALSE, sep = "")
tumour <- t(as.matrix(tumour))
dim(tumour)  # make sure is 78 4751

## [1]   78 4751

pca_raw <- prcomp(tumour, center = TRUE, scale. = FALSE)
PC_score_raw <- pca_raw$x

p1 <- ggplot(PC_score_raw, aes(x = PC1, y = PC2)) +
  geom_point() +
  labs(title = "PC1 vs PC2 scatter plot")
print(p1)

# The point at bottom left is outlier. which with very small PC1 and PC2
PC1_PC2 <- as.data.frame(PC_score_raw[,1:2])
the_outlier <- which.min(PC1_PC2$PC1 + PC1_PC2$PC2)
the_outlier

## [1] 54

The outlier is the 54th observation.

(b)

We consider the full data with all variables and two subset of the variables: all 4751 variables as in the analysis in the lecture, the first 2376 and the remaining 2375 variables . You may want to create separate datasets/frames for each subset of variables.

For each of the three groups of variables and all 78 observations, conduct a PCA. Using the outcome of the PCA, find the rank of the sample covariance matrix in each case, display the eigenvalue plots, the cumulative contribution to total variance plots and the PC score plots analogous to those shown on p22 and p23 respectively of the lecture slides.

dim(tumour)

## [1]   78 4751

tumour_1 <- as.data.frame(tumour[,1:2376 ])   # fist 2376
tumour_2 <-  as.data.frame(tumour[,2377:4751 ])   # last 2375

# Note: pca_raw is pca for whole data set.
pca_raw_1 <- prcomp(tumour_1, center = TRUE, scale. = FALSE)
pca_raw_2 <- prcomp(tumour_2, center = TRUE, scale. = FALSE)

# Find the eigenvalues vector "Lambda"
lambda_raw <- diag(pca_raw$sdev^2)
lambda_raw_1 <- diag(pca_raw_1$sdev^2)
lambda_raw_2 <- diag(pca_raw_2$sdev^2)

# Define eigenvector matrix
Gamma_raw <- pca_raw$rotation
Gamma_raw_1 <- pca_raw_2$rotation
Gamma_raw_2 <- pca_raw_2$rotation

# Sample Covariance matrix sigma
Sigma_raw <- Gamma_raw %*% lambda_raw %*% t(Gamma_raw)
Sigma_raw_1 <- Gamma_raw_1 %*% lambda_raw_1 %*% t(Gamma_raw_1)
Sigma_raw_2 <- Gamma_raw_2 %*% lambda_raw_2 %*% t(Gamma_raw_2)

#Calculate rank, the operation here took more than 5 mins, Not sure if any other method can be applied ? 

#rank_of_raw <- qr(Sigma_raw) $ rank
#rank_of_raw_1 <- qr(Sigma_raw_1) $ rank
#rank_of_raw_2 <- qr(Sigma_raw_2) $ rank

#rank_of_raw         #77
#rank_of_raw_1       #77
#rank_of_raw_2       #77


#Eigenvalue plot

Eigen_raw <- pca_raw$sdev^2
Eigen_raw_1 <- pca_raw_1$sdev^2
Eigen_raw_2 <- pca_raw_2$sdev^2
index <- seq_along(Eigen_raw)
plot(index, Eigen_raw, 
     main = "Eigenvalue plot of raw data",
     type = "b", pch = 16)

plot(index, Eigen_raw_1, 
     main = "Eigenvalue plot of 1st part raw data",
     type = "b", pch = 16)

plot(index, Eigen_raw_2, 
     main = "Eigenvalue plot of 2nd part raw data",
     type = "b", pch = 16)

# Cumulative variance

cum_var_raw <- cumsum(Eigen_raw/sum(Eigen_raw))
plot(cum_var_raw, type = "b", pch = 16,
     main = "Cumulative contribution to total variance (raw)")

cum_var_raw_1 <- cumsum(Eigen_raw_1/sum(Eigen_raw_1))
plot(cum_var_raw, type = "b", pch = 16,
     main = "Cumulative contribution to total variance (raw_1)")

cum_var_raw_2 <- cumsum(Eigen_raw_2/sum(Eigen_raw_2))
plot(cum_var_raw, type = "b", pch = 16,
     main = "Cumulative contribution to total variance (raw_2)")

# PC score plot

score_raw <- as.data.frame(PC_score_raw[,1:4])
score_raw$obs <- 1:nrow(score_raw) 
outlier_id <- 54
score_raw$IsOutlier <- ifelse(score_raw$obs == outlier_id, "Outlier", "Normal")


PC_score_plot_raw_1 <- ggplot(score_raw, aes(x = PC1, y = PC2, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)

PC_score_plot_raw_2 <- ggplot(score_raw, aes(x = PC1, y = PC3, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_raw_3 <- ggplot(score_raw, aes(x = PC1, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_raw_4 <- ggplot(score_raw, aes(x = PC2, y = PC3, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_raw_5 <- ggplot(score_raw, aes(x = PC2, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_raw_6 <- ggplot(score_raw, aes(x = PC3, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)

print((PC_score_plot_raw_1|PC_score_plot_raw_2|PC_score_plot_raw_3)/
        (PC_score_plot_raw_4|PC_score_plot_raw_5|PC_score_plot_raw_6))

(c)

Compare the results of the different groups with the results shown in the lecture slides and comment. What happened to the outlier? Did it move closer to the centre of the PC1/PC2 data?

Yes the outlier move closer tot the center of the PC1/PC2 data.

(d)

What do you expect to change if you leave out the outlying observations from the calculations of part(b)

The leading eigenvalues will decrease, the total variance will be more evenly distributed across several PCs.

Question 2

Work with the full data of Q1 (we do not use the subsets of variables here). Scale the data and repeat part (b) of Q1 on the scaled data. Compare the results with those of Q1, with particular reference to the outlier.You will need to use scale. = TRUE in prcomp

pca_scale <- prcomp(tumour, center = TRUE, scale. = TRUE)

lambda_scaled <- diag(pca_scale$sdev^2)
eigenvalue <- pca_scale$sdev^2

Gamma_scaled <- pca_scale$rotation
Sigma_scaled <- Gamma_scaled %*% lambda_scaled%*% t(Gamma_scaled)

index <- seq_along(eigenvalue)
plot(index, eigenvalue, 
     main = "Eigenvalue plot of scaled data",
     type = "b", pch = 16)

cum_var_scaled <- cumsum(eigenvalue/sum(eigenvalue))
plot(cum_var_scaled, type = "b", pch = 16,
     main = "Cumulative contribution to total variance (scaled)")

PC_score_scaled<- pca_scale$x
score_scaled <- as.data.frame(PC_score_scaled[,1:4])
score_scaled$obs <- 1:nrow(score_scaled) 
score_scaled$IsOutlier <- ifelse(score_scaled$obs == outlier_id, "Outlier", "Normal")
PC_score_plot_scaled_1 <- ggplot(score_scaled, aes(x = PC1, y = PC2, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)

PC_score_plot_scaled_2 <- ggplot(score_scaled, aes(x = PC1, y = PC3, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_scaled_3 <- ggplot(score_scaled, aes(x = PC1, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_scaled_4 <- ggplot(score_scaled, aes(x = PC2, y = PC3, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_scaled_5 <- ggplot(score_scaled, aes(x = PC2, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)


PC_score_plot_scaled_6 <- ggplot(score_scaled, aes(x = PC3, y = PC4, colour = IsOutlier)) +
   scale_color_manual(values = c("Normal" = "grey50", "Outlier" = "red")) +
  geom_point(size = 2, alpha = 0.8)
print(
  (PC_score_plot_scaled_1|PC_score_plot_scaled_2|PC_score_plot_scaled_3)/(PC_score_plot_scaled_4|PC_score_plot_scaled_5|PC_score_plot_scaled_6))

The outlier is more sloser to the main cluster in each plot

Question 3

For the abalone data we want to compare the performance of linear regression and PCR. We do this by calculating the residual standard deviation, and we study how this depends on the number of predictors for linear regression and the number of PC scores for PCR. Regard the variable Rings as the response, and all other variables apart from Sex as predictors. For \(p = 1\) to 7 calculate the residual standard deviation in the following cases:

Add column names (given)

coln = c(
    "Sex", #        nominal         M, F, and I (infant)
    "Length", #         continuous  mm  Longest shell measurement
    "Diameter", #   continuous  mm  perpendicular to length
    "Height", #     continuous  mm  with meat in shell
    "Whole_weight", #   continuous  grams   whole abalone
    "Shucked_weight", # continuous  grams   weight of meat
    "Viscera_weight", # continuous  grams   gut weight (after bleeding)
    "Shell_weight", #   continuous  grams   after being dried
    "Rings" #       integer         +1.5 gives the age in years
)
abalone = read_csv( file = "abalone.csv",  col_names = coln )

## Rows: 4177 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sex
## dbl (8): Length, Diameter, Height, Whole_weight, Shucked_weight, Viscera_wei...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

summary( abalone )

##      Sex                Length         Diameter          Height      
##  Length:4177        Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  Class :character   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  Mode  :character   Median :0.545   Median :0.4250   Median :0.1400  
##                     Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##                     3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##                     Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   Whole_weight    Shucked_weight   Viscera_weight    Shell_weight   
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.0005   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300  
##  Median :0.7995   Median :0.3360   Median :0.1710   Median :0.2340  
##  Mean   :0.8287   Mean   :0.3594   Mean   :0.1806   Mean   :0.2388  
##  3rd Qu.:1.1530   3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      Rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.934  
##  3rd Qu.:11.000  
##  Max.   :29.000

summary( factor( abalone$Sex ) )

##    F    I    M 
## 1307 1342 1528

d = 9-1 n = 4177

(a) Based on the best p predictors in linear regression, based on Forward Selection

Teaching part:

Note: add1() is for forward selection. where fm.fwd is the null model, big.lm is the biggest possible model.

# Linear Regression 
big.lm = lm( Rings ~  Length + Diameter + Height + Whole_weight + Shucked_weight + Viscera_weight + Shell_weight, data = abalone  )
# The biggest model.
summary( big.lm )

## 
## Call:
## lm(formula = Rings ~ Length + Diameter + Height + Whole_weight + 
##     Shucked_weight + Viscera_weight + Shell_weight, data = abalone)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1632  -1.3613  -0.3885   0.9054  13.7440 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.9852     0.2691  11.092  < 2e-16 ***
## Length          -1.5719     1.8248  -0.861    0.389    
## Diameter        13.3609     2.2371   5.972 2.53e-09 ***
## Height          11.8261     1.5481   7.639 2.70e-14 ***
## Whole_weight     9.2474     0.7326  12.622  < 2e-16 ***
## Shucked_weight -20.2139     0.8233 -24.552  < 2e-16 ***
## Viscera_weight  -9.8297     1.3040  -7.538 5.82e-14 ***
## Shell_weight     8.5762     1.1367   7.545 5.54e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.218 on 4169 degrees of freedom
## Multiple R-squared:  0.5276, Adjusted R-squared:  0.5268 
## F-statistic: 665.2 on 7 and 4169 DF,  p-value: < 2.2e-16

# Forward selection
fm.fwd <- fm.null <- lm(Rings ~ 1, abalone) # the simplest model
#apply add1 for forward selection
summary( lm( fm.fwd  ) )

## 
## Call:
## lm(formula = fm.fwd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9337 -1.9337 -0.9337  1.0663 19.0663 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.93368    0.04989   199.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.224 on 4176 degrees of freedom

add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ 1
##                Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>                      43411 9780.8                      
## Length          1   13454.5 29956 8233.3 1875.17 < 2.2e-16 ***
## Diameter        1   14335.7 29075 8108.6 2058.52 < 2.2e-16 ***
## Height          1   13490.7 29920 8228.2 1882.48 < 2.2e-16 ***
## Whole_weight    1   12676.8 30734 8340.3 1722.07 < 2.2e-16 ***
## Shucked_weight  1    7689.9 35721 8968.4  898.79 < 2.2e-16 ***
## Viscera_weight  1   11019.1 32392 8559.8 1420.27 < 2.2e-16 ***
## Shell_weight    1   17097.2 26313 7691.7 2712.72 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Want: * Large Sum of sqaure * Small RSS * Small AIC * Large F

Since all p-values are very small, look at F value, add the largest one.

fm.fwd <- update(fm.fwd, . ~ . + Shell_weight)
summary( lm( fm.fwd  ) )

## 
## Call:
## lm(formula = fm.fwd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9830 -1.6005 -0.5843  0.9390 15.6334 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.46212    0.07715   83.76   <2e-16 ***
## Shell_weight 14.53568    0.27908   52.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.51 on 4175 degrees of freedom
## Multiple R-squared:  0.3938, Adjusted R-squared:  0.3937 
## F-statistic:  2713 on 1 and 4175 DF,  p-value: < 2.2e-16

(a) Start from here

glancerows = data.frame()
fm.fwd <- fm.null <- lm(Rings ~ 1, abalone)
summary( lm( fm.fwd  ) )

## 
## Call:
## lm(formula = fm.fwd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9337 -1.9337 -0.9337  1.0663 19.0663 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.93368    0.04989   199.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.224 on 4176 degrees of freedom

row1 = data.frame(modelno = 0, sigma = glance( lm( fm.fwd  ) )$sigma )
# note: glance()$sigma is the residual std error.
row1  # This output is the residual std error.

##   modelno    sigma
## 1       0 3.224169

glancerows = rbind( glancerows, row1 )

The res.std.error for model 0 os 3.2242

Use add1() to decide the term to add next.

add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ 1
##                Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>                      43411 9780.8                      
## Length          1   13454.5 29956 8233.3 1875.17 < 2.2e-16 ***
## Diameter        1   14335.7 29075 8108.6 2058.52 < 2.2e-16 ***
## Height          1   13490.7 29920 8228.2 1882.48 < 2.2e-16 ***
## Whole_weight    1   12676.8 30734 8340.3 1722.07 < 2.2e-16 ***
## Shucked_weight  1    7689.9 35721 8968.4  898.79 < 2.2e-16 ***
## Viscera_weight  1   11019.1 32392 8559.8 1420.27 < 2.2e-16 ***
## Shell_weight    1   17097.2 26313 7691.7 2712.72 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

 fm.fwd <- update(fm.fwd, . ~ . + Shell_weight)
 row1 = data.frame(modelno = 1, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno  sigma
## 1       1 2.5105

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight
##                Df Sum of Sq   RSS    AIC  F value    Pr(>F)    
## <none>                      26313 7691.7                       
## Length          1       9.9 26304 7692.1   1.5726    0.2099    
## Diameter        1      10.2 26303 7692.1   1.6127    0.2042    
## Height          1     259.3 26054 7652.3  41.5374 1.288e-10 ***
## Whole_weight    1    1740.8 24573 7407.8 295.7039 < 2.2e-16 ***
## Shucked_weight  1    3476.1 22837 7101.9 635.3248 < 2.2e-16 ***
## Viscera_weight  1    1067.0 25246 7520.8 176.4105 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After added shell_weight, the res.std.error (model1) is 2.5105 And add the shucked weight to next model.

 fm.fwd <- update(fm.fwd, . ~ . + Shell_weight + Shucked_weight)
 row1 = data.frame(modelno = 2, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno    sigma
## 1       2 2.339087

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight + Shucked_weight
##                Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>                      22837 7101.9                      
## Length          1    978.17 21859 6921.0  186.74 < 2.2e-16 ***
## Diameter        1   1233.62 21604 6871.9  238.29 < 2.2e-16 ***
## Height          1    802.26 22035 6954.5  151.93 < 2.2e-16 ***
## Whole_weight    1    803.77 22034 6954.2  152.23 < 2.2e-16 ***
## Viscera_weight  1     73.91 22763 7090.4   13.55 0.0002353 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The residual std.error is now 2.3390 （model2) and diameter now has the largest F value

 fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight + 
                   Diameter )
 row1 = data.frame(modelno = 3, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno    sigma
## 1       3 2.275306

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter
##                Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>                      21604 6871.9                      
## Length          1      2.58 21601 6873.4   0.499    0.4800    
## Height          1    308.62 21295 6813.8  60.463 9.382e-15 ***
## Whole_weight    1    549.30 21054 6766.4 108.846 < 2.2e-16 ***
## Viscera_weight  1      3.93 21600 6873.2   0.760    0.3834    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model3 with 2.2753, Whole_ _weight next.

fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight + 
                   Diameter + Whole_weight  )
 row1 = data.frame(modelno = 4, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno    sigma
## 1       4 2.246463

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight
##                Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>                      21054 6766.4                      
## Length          1     8.564 21046 6766.7  1.6973    0.1927    
## Height          1   257.206 20797 6717.0 51.5841 8.084e-13 ***
## Viscera_weight  1   259.383 20795 6716.6 52.0262 6.472e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 4 with 2.2465, Viscera_weight next.

fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight + 
                   Diameter + Whole_weight + Viscera_weight )
 row1 = data.frame(modelno = 5, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno   sigma
## 1       5 2.23285

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight + 
##     Viscera_weight
##        Df Sum of Sq   RSS    AIC F value    Pr(>F)    
## <none>              20795 6716.6                      
## Length  1     2.107 20793 6718.2  0.4225    0.5157    
## Height  1   285.478 20510 6660.8 58.0434 3.154e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 5 2.2329, Height next

fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight + 
                   Diameter + Whole_weight + Viscera_weight 
                 + Height )
 row1 = data.frame(modelno = 6, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno    sigma
## 1       6 2.217736

 glancerows = rbind( glancerows, row1 )
 add1(fm.fwd, big.lm, test="F")

## Single term additions
## 
## Model:
## Rings ~ Shell_weight + Shucked_weight + Diameter + Whole_weight + 
##     Viscera_weight + Height
##        Df Sum of Sq   RSS    AIC F value Pr(>F)
## <none>              20510 6660.8               
## Length  1    3.6499 20506 6662.1  0.7421 0.3891

Model 6 2.2177, Length the last. But at this stage, the p-value of Length is larger than 0.05, Lets see the output of add the last variable.

fm.fwd <- update(fm.fwd, . ~ . + Shell_weight+ Shucked_weight + 
                   Diameter + Whole_weight + Viscera_weight 
                 + Height + Length )
 row1 = data.frame(modelno = 7, sigma = glance( lm( fm.fwd ) )$sigma )
 row1

##   modelno    sigma
## 1       7 2.217805

 glancerows = rbind( glancerows, row1 )
 # add1(fm.fwd, big.lm, test="F")

Model 7 2.2178, which is worse than model 6.

Hence Model 6 has the lowest residual std error. (p=6)

(b) Based on the first \(p\) PC scores, that is, the \(p\)-dimensional PC data, used as predictors in PCR.

abalone_new <- abalone[,2:8]
y <- abalone$Rings
PCA_abalone_raw <- prcomp(abalone_new, center = TRUE, scale. = FALSE)
summary(PCA_abalone_raw)

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     0.5815 0.06296 0.05392 0.03248 0.02213 0.02066 0.01217
## Proportion of Variance 0.9741 0.01142 0.00838 0.00304 0.00141 0.00123 0.00043
## Cumulative Proportion  0.9741 0.98552 0.99389 0.99693 0.99834 0.99957 1.00000

glancerows_pcr <- data.frame()
glancerows_pcr <- bind_rows(glancerows_pcr, glancerows[1,]) # Add model 0 into.
glancerows_pcr

##   modelno    sigma
## 1       0 3.224169

for (p in 1:7) {
  pc_data <- as.data.frame(PCA_abalone_raw$x[,1:p])
  colnames(pc_data) <- paste0("PC",1:p)
  
  fm.pcr <- lm(y ~ ., data = pc_data)
  
  row <- data.frame(modelno = p,
                    sigma = glance(fm.pcr)$sigma)
  glancerows_pcr <- rbind(glancerows_pcr, row)
}

glancerows_pcr # The residual std error based on PC.

##   modelno    sigma
## 1       0 3.224169
## 2       1 2.723087
## 3       2 2.267147
## 4       3 2.248847
## 5       4 2.245327
## 6       5 2.243432
## 7       6 2.220831
## 8       7 2.217805

(c) Plot residual standard deviations against the predictors for the results in parts (a) and (b). Compare the results and comment.

A <- transform(glancerows,        Method = "Forward selection")
B <- transform(glancerows_pcr,    Method = "PCR")
both <- rbind(A, B)

ggplot(both, aes(x = modelno, y = sigma, color = Method)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = 0:7) +
  labs(title = "Residual standard error vs number of predictors / PCs",
       x = "p  (number of predictors / PCs)",
       y = "Residual standard error (σ)") +
  theme_minimal(base_size = 13)

Both approach reach their minimum residual standar error aroud p = 5-6. Forward selection may achieve marginally lower σ, but PCR provides a more stable model since its predictors (PC scores) are uncorrelated and unaffected by multicollinearity.

(d)

Bonus question (no calculations needed): why would a comparison of PCR and linear regression based on best subset selection not work so well? Hints: We describe some of the regression commands and how we may use glance to gather and simplify the information we obtain

The comparison doesn’t work well because PCR and best-subset regression are built on different selection bases (unsupervised vs supervised) and operate in different predictor spaces, so their “best” models aren’t directly comparable.

Week4_practice

Wei Lin

2025-10-03