Activity 3.1 - SVD

SUBMISSION INSTRUCTIONS

  1. Render to HTML
  2. Publish your HTML to RPubs
  3. Submit a link to your published solutions

Problem 1

Reconsider the US air pollution data set:

library(HSAUR2)
Loading required package: tools
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
data(USairpollution)

A)

Perform singular value decomposition of this data matrix. Then create the matrix \(D\). Describe what this matrix looks like.

components <- svd(USairpollution)
D <- components$d        # vector of singular values
D_matrix <- diag(D)      # 7 x 7 diagonal matrix with the singular values on its diagonal
D_matrix
         [,1]     [,2]    [,3]     [,4]     [,5]    [,6]    [,7]
[1,] 7051.949   0.0000   0.000  0.00000  0.00000  0.0000  0.0000
[2,]    0.000 931.1211   0.000  0.00000  0.00000  0.0000  0.0000
[3,]    0.000   0.0000 540.463  0.00000  0.00000  0.0000  0.0000
[4,]    0.000   0.0000   0.000 92.70909  0.00000  0.0000  0.0000
[5,]    0.000   0.0000   0.000  0.00000 85.23724  0.0000  0.0000
[6,]    0.000   0.0000   0.000  0.00000  0.00000 52.9465  0.0000
[7,]    0.000   0.0000   0.000  0.00000  0.00000  0.0000 10.1409

\(D\) is a \(7\times 7\) diagonal matrix: the seven singular values of the data matrix sit along the diagonal in decreasing order, and every off-diagonal entry is 0.
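
As a quick sanity check (a sketch reusing the components and D_matrix objects above), we can confirm programmatically that every off-diagonal entry is zero and that the singular values are non-increasing:

all(D_matrix[row(D_matrix) != col(D_matrix)] == 0)   # every off-diagonal entry is zero
all(diff(components$d) <= 0)                         # singular values are in decreasing order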

B)

Verify that \(X=UDV^T\) by plotting all the entries of \(X\) versus all the entries of \(UDV^T\) with the 0/1 line.

k <- 7
D <- components$d
U <- components$u
V <- components$v
X <- as.matrix(USairpollution)   # the original data matrix, for comparison
(Xtilde <- U[, 1:k] %*% diag(D[1:k]) %*% t(V[, 1:k])) %>% round(1)
      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
 [1,]   46 47.6   44  116  8.8 33.4  135
 [2,]   11 56.8   46  244  8.9  7.8   58
 [3,]   24 61.5  368  497  9.1 48.3  115
 [4,]   47 55.0  625  905  9.6 41.3  111
 [5,]   11 47.1  391  463 12.4 36.1  166
 [6,]   31 55.2   35   71  6.5 40.7  148
 [7,]  110 50.6 3344 3369 10.4 34.4  122
 [8,]   23 54.0  462  453  7.1 39.0  132
 [9,]   65 49.7 1007  751 10.9 35.0  155
[10,]   26 51.5  266  540  8.6 37.0  134
[11,]    9 66.2  641  844 10.9 35.9   78
[12,]   17 51.9  454  515  9.0 12.9   86
[13,]   17 49.0  104  201 11.2 30.8  103
[14,]   35 49.9 1064 1513 10.1 31.0  129
[15,]   56 49.1  412  158  9.0 43.4  127
[16,]   10 68.9  721 1233 10.8 48.2  103
[17,]   28 52.3  361  746  9.7 38.7  121
[18,]   14 68.4  136  529  8.8 54.5  116
[19,]   14 54.5  381  507 10.0 37.0   99
[20,]   13 61.0   91  132  8.2 48.5  100
[21,]   30 55.6  291  593  8.3 43.1  123
[22,]   10 61.6  337  624  9.2 49.1  105
[23,]   10 75.5  207  335  9.0 59.8  128
[24,]   16 45.7  569  717 11.8 29.1  123
[25,]   29 43.5  699  744 10.6 25.9  137
[26,]   18 59.4  275  448  7.9 46.0  119
[27,]    9 68.3  204  361  8.4 56.8  113
[28,]   31 59.3   96  308 10.6 44.7  116
[29,]   14 51.5  181  347 10.9 30.2   98
[30,]   69 54.6 1692 1950  9.6 39.9  115
[31,]   10 70.3  213  582  6.0  7.1   36
[32,]   61 50.4  347  520  9.4 36.2  147
[33,]   94 50.0  343  179 10.6 42.7  125
[34,]   26 57.8  197  299  7.6 42.6  115
[35,]   28 51.0  137  176  8.7 15.2   89
[36,]   12 56.7  453  716  8.7 20.7   67
[37,]   29 51.1  379  531  9.4 38.8  164
[38,]   56 55.9  775  622  9.5 35.9  105
[39,]   29 57.3  434  757  9.3 38.9  111
[40,]    8 56.6  125  277 12.7 30.6   82
[41,]   36 54.0   80   80  9.0 40.2  114
plot(X, Xtilde); abline(0, 1)

cor(as.vector(X), as.vector(Xtilde))
[1] 1
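
Beyond the plot, a direct numerical check (a minimal sketch reusing U, D, V, and X from above) shows that the full-rank reconstruction reproduces \(X\) up to floating-point error:

max(abs(X - U %*% diag(D) %*% t(V)))   # should be on the order of machine precision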

C)

Consider low-dimensional approximations of the data matrix. What is the fewest number of dimensions required to yield a correlation between the entries of \(X\) and \(\tilde X\) of at least 0.9?

k2 <- 2
(Xtilde <- U[,1:k2] %*% diag(D[1:k2]) %*% t(V[,1:k2])) %>% round(1)
       [,1]  [,2]   [,3]   [,4] [,5] [,6]  [,7]
 [1,]   3.5  23.8   17.2  151.5  3.6 15.2  42.1
 [2,]   5.9  35.8   45.7  244.5  5.5 22.8  63.7
 [3,]  17.3  37.8  352.3  516.8  6.2 24.6  73.1
 [4,]  30.4  63.4  629.6  900.0 10.5 41.3 123.5
 [5,]  17.1  32.9  363.0  498.0  5.5 21.5  64.8
 [6,]   2.4  20.0    0.5  115.4  3.0 12.7  35.1
 [7,] 131.5  80.9 3371.0 3334.4 17.1 56.6 208.0
 [8,]  18.2  20.7  433.8  488.4  3.8 13.8  44.9
 [9,]  34.5  -2.1  962.8  807.5  1.1  0.1  15.2
[10,]  16.2  56.9  260.0  548.2  9.0 36.6 104.9
[11,]  29.3  50.8  643.7  839.4  8.6 33.3 101.6
[12,]  19.2  26.4  443.0  528.5  4.6 17.4  54.9
[13,]   6.3  26.2   86.1  223.8  4.1 16.8  47.7
[14,]  50.9  96.7 1088.6 1481.9 16.2 63.2 190.8
[15,]  11.3 -15.3  362.9  221.1 -1.8 -9.2 -19.7
[16,]  38.8  98.3  746.5 1199.8 16.0 63.6 186.9
[17,]  22.1  73.6  368.8  736.7 11.7 47.4 136.3
[18,]  13.5  71.1  138.3  526.0 11.0 45.4 127.3
[19,]  17.7  35.8  369.9  520.6  6.0 23.4  70.1
[20,]   4.6  18.8   65.6  163.8  2.9 12.1  34.2
[21,]  17.7  60.9  289.1  595.9  9.6 39.2 112.5
[22,]  19.2  58.8  336.5  624.1  9.4 37.9 109.7
[23,]  11.0  36.3  183.1  364.5  5.8 23.4  67.3
[24,]  25.5  43.8  559.9  728.1  7.4 28.7  87.7
[25,]  28.5  32.6  680.9  766.8  5.9 21.8  70.5
[26,]  14.5  42.3  261.2  465.3  6.8 27.3  79.3
[27,]  11.4  38.9  186.7  382.2  6.2 25.0  71.8
[28,]   8.3  44.1   83.6  324.4  6.8 28.2  78.9
[29,]  10.6  37.2  171.2  359.4  5.9 23.9  68.6
[30,]  71.6  81.4 1712.1 1924.8 14.8 54.4 176.3
[31,]  15.9  62.2  233.5  555.9  9.8 39.9 113.6
[32,]  17.4  45.1  331.6  541.4  7.3 29.2  85.6
[33,]  10.4  -4.0  301.8  233.9 -0.2 -2.1  -1.1
[34,]  10.0  30.3  177.0  324.6  4.8 19.5  56.6
[35,]   6.4  17.3  118.4  200.1  2.8 11.2  32.7
[36,]  23.2  53.9  462.5  703.5  8.8 35.0 103.6
[37,]  18.2  44.1  358.0  558.0  7.2 28.6  84.3
[38,]  27.5   4.6  745.0  660.4  1.8  4.0  22.6
[39,]  23.7  65.8  438.8  751.3 10.6 42.5 123.9
[40,]   8.1  33.2  115.3  288.9  5.2 21.3  60.5
[41,]   3.4  13.3   49.9  118.9  2.1  8.5  24.3
plot(X, Xtilde); abline(0, 1)

cor(as.vector(X), as.vector(Xtilde))
[1] 0.9965505

The fewest number of dimensions that yields a correlation of at least 0.9 is \(k = 2\): the rank-2 approximation already gives a correlation of about 0.997 between the entries of \(X\) and \(\tilde X\).
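
To confirm the cutoff rather than checking a single value of \(k\), a short loop (a sketch reusing U, D, V, and X from above) reports the correlation for every possible rank:

# correlation between the entries of X and the rank-k approximation, for k = 1, ..., 7
sapply(1:7, function(k) {
  Xk <- U[, 1:k, drop = FALSE] %*% diag(D[1:k], nrow = k) %*% t(V[, 1:k, drop = FALSE])
  cor(as.vector(X), as.vector(Xk))
})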

D)

Find \(\Sigma\), the covariance matrix of this data set. Then perform eigen-decomposition of this matrix. Verify that

  • The eigenvectors of \(\Sigma\) equal the columns of \(V\)
  • The eigenvalues of \(\Sigma\) equal the diagonals of \(D^2/(n-1)\)
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
US_df <- scale(USairpollution, center = TRUE, scale = FALSE)   # center each column
n <- nrow(US_df)
t(US_df) %*% US_df / (n - 1)   # covariance matrix computed by hand from the centered data
                SO2        temp        manu       popul        wind
SO2      550.947561  -73.560671   8527.7201   6711.9945   3.1753049
temp     -73.560671   52.239878   -773.9713   -262.3496  -3.6113537
manu    8527.720122 -773.971341 317502.8902 311718.8140 191.5481098
popul   6711.994512 -262.349634 311718.8140 335371.8939 175.9300610
wind       3.175305   -3.611354    191.5481    175.9301   2.0410244
precip    15.001799   32.862988   -215.0199   -178.0529  -0.2185311
predays  229.929878  -82.426159   1968.9598    645.9860   6.2143902
              precip    predays
SO2       15.0017988  229.92988
temp      32.8629884  -82.42616
manu    -215.0199024 1968.95976
popul   -178.0528902  645.98598
wind      -0.2185311    6.21439
precip   138.5693840  154.79290
predays  154.7929024  702.59024
cov(USairpollution)
                SO2        temp        manu       popul        wind
SO2      550.947561  -73.560671   8527.7201   6711.9945   3.1753049
temp     -73.560671   52.239878   -773.9713   -262.3496  -3.6113537
manu    8527.720122 -773.971341 317502.8902 311718.8140 191.5481098
popul   6711.994512 -262.349634 311718.8140 335371.8939 175.9300610
wind       3.175305   -3.611354    191.5481    175.9301   2.0410244
precip    15.001799   32.862988   -215.0199   -178.0529  -0.2185311
predays  229.929878  -82.426159   1968.9598    645.9860   6.2143902
              precip    predays
SO2       15.0017988  229.92988
temp      32.8629884  -82.42616
manu    -215.0199024 1968.95976
popul   -178.0528902  645.98598
wind      -0.2185311    6.21439
precip   138.5693840  154.79290
predays  154.7929024  702.59024
svd_us <- svd(US_df)
eigenSigma <- eigen(cov(US_df))
eigenSigma$vectors
              [,1]         [,2]         [,3]        [,4]         [,5]
[1,]  0.0168607518 -0.099835625  0.208775573  0.95883106 -0.152191203
[2,] -0.0011417794  0.025814390 -0.071600745 -0.11014784 -0.477854201
[3,]  0.6968327936 -0.710249079 -0.067182201 -0.07319788 -0.009643654
[4,]  0.7170284512  0.692912523  0.056666935  0.04906669  0.010735457
[5,]  0.0004067530 -0.001011680  0.005386606 -0.01506609  0.025401917
[6,] -0.0004336922  0.001225937  0.265807619 -0.16261712 -0.832729325
[7,]  0.0028836950 -0.069155051  0.934279828 -0.18459052  0.232812295
             [,6]          [,7]
[1,] -0.053952911 -2.704138e-02
[2,] -0.852534945 -1.640507e-01
[3,] -0.002153023  1.136208e-03
[4,]  0.002915751 -5.682006e-05
[5,]  0.176541256 -9.838347e-01
[6,]  0.453206155  6.376788e-02
[7,] -0.183568735 -1.891454e-02
svd_us$v
              [,1]         [,2]         [,3]        [,4]         [,5]
[1,] -0.0168607518  0.099835625  0.208775573 -0.95883106  0.152191203
[2,]  0.0011417794 -0.025814390 -0.071600745  0.11014784  0.477854201
[3,] -0.6968327936  0.710249079 -0.067182201  0.07319788  0.009643654
[4,] -0.7170284512 -0.692912523  0.056666935 -0.04906669 -0.010735457
[5,] -0.0004067530  0.001011680  0.005386606  0.01506609 -0.025401917
[6,]  0.0004336922 -0.001225937  0.265807619  0.16261712  0.832729325
[7,] -0.0028836950  0.069155051  0.934279828  0.18459052 -0.232812295
             [,6]          [,7]
[1,] -0.053952911 -2.704138e-02
[2,] -0.852534945 -1.640507e-01
[3,] -0.002153023  1.136208e-03
[4,]  0.002915751 -5.682006e-05
[5,]  0.176541256 -9.838347e-01
[6,]  0.453206155  6.376788e-02
[7,] -0.183568735 -1.891454e-02
eigenSigma$values
[1] 6.384720e+05 1.481204e+04 7.019599e+02 2.050019e+02 1.167047e+02
[6] 1.205705e+01 1.448704e+00
svd_us$d^2 / (n - 1)
[1] 6.384720e+05 1.481204e+04 7.019599e+02 2.050019e+02 1.167047e+02
[6] 1.205705e+01 1.448704e+00
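
Both checks hold: the eigenvectors of \(\Sigma\) match the columns of \(V\) up to sign (the sign of any eigenvector or singular vector is arbitrary), and the eigenvalues equal the diagonals of \(D^2/(n-1)\). A programmatic check (a sketch that compares absolute values to sidestep the sign ambiguity):

all.equal(abs(eigenSigma$vectors), abs(svd_us$v), check.attributes = FALSE)   # vectors agree up to sign
all.equal(eigenSigma$values, svd_us$d^2 / (n - 1))                            # eigenvalues agree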

Problem 2

In this problem we explore how “high” a low-dimensional SVD approximation of an image has to be before you can recognize it.

.Rdata files are essentially snapshots of R workspace memory: when loaded, they restore the saved R objects, of any type, into your global environment. The command below loads three objects into memory: mysteryU4, mysteryD4, and mysteryV4. These are the first \(k = 4\) left and right singular vectors and singular values from an SVD I performed on a 700-pixels-tall \(\times\) 600-pixels-wide image of a well-known villain.

load('Data/mystery_person_k4.Rdata')

A)

Write a function that takes SVD ingredients u, d and v and renders the \(700 \times 600\) image produced by this approximation using functions from the magick package. Use your function to determine whether a 4-dimensional approximation to this image is enough for you to tell who the mystery villain is. Recall that you will likely need to rescale your recomposed approximation so that all pixels are in [0,1].

library(magick)
Linking to ImageMagick 6.9.13.29
Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
Disabled features: fftw, ghostscript, x11
# Function that takes SVD ingredients u, d, v and renders the rank-k
# approximation of the 700 x 600 image
render_svd_image <- function(u, d, v, k = length(d)) {
  img <- u[, 1:k, drop = FALSE] %*% diag(d[1:k], nrow = k) %*% t(v[, 1:k, drop = FALSE])
  # rescale so every pixel value lies in [0, 1] before converting to a raster
  img <- (img - min(img)) / (max(img) - min(img))
  img %>% as.raster() %>% image_read()
}

render_svd_image(mysteryU4, mysteryD4, mysteryV4, k = 4)
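
Because the k argument defaults to length(d), the same helper can be reused unchanged for the higher-dimensional approximations in part B.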

B)

I’m giving you slightly higher-dimensional approximations (\(k=10\) and \(k=50\), respectively) in the objects below:

load('Data/mystery_person_k10.Rdata')
load('Data/mystery_person_k50.Rdata')

Create both of the images produced by these approximations. At what point can you tell who the mystery villain is?

render_svd_image(mysteryU10, mysteryD10, mysteryV10, k = 10)

render_svd_image(mysteryU50, mysteryD50, mysteryV50, k = 50)

At \(k = 10\) I could already make a very good guess about who the mystery villain is; the \(k = 50\) image is much sharper and confirms that guess.
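
To compare the approximations directly, one option (a sketch that reuses the render_svd_image() helper from part A and magick::image_append()) is to place the three reconstructions side by side:

k4_img  <- render_svd_image(mysteryU4,  mysteryD4,  mysteryV4,  k = 4)
k10_img <- render_svd_image(mysteryU10, mysteryD10, mysteryV10, k = 10)
k50_img <- render_svd_image(mysteryU50, mysteryD50, mysteryV50, k = 50)
image_append(c(k4_img, k10_img, k50_img))   # k = 4, 10, 50 from left to right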

C)

How many numbers need to be stored in memory for each of the following:

  • A full \(700\times 600\) image?

One number per pixel: \(700\times 600 = 420000\).

  • A 4-dimensional approximation?

A rank-\(k\) approximation stores \(U\) (\(700\times k\)), the \(k\) singular values, and \(V\) (\(600\times k\)), for a total of \(700k + k + 600k = 1301k\) numbers. With \(k = 4\): \(1301\times 4 = 5204\).

  • A 10-dimensional approximation?

With \(k = 10\): \(1301\times 10 = 13010\).

  • A 50-dimensional approximation?

With \(k = 50\): \(1301\times 50 = 65050\).
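
These counts are easy to tabulate in R (a minimal sketch, assuming the 700 \(\times\) 600 dimensions stated above):

k <- c(4, 10, 50)
storage <- 700 * k + k + 600 * k   # numbers needed for U, d, and V at rank k
data.frame(k = k,
           numbers_stored = storage,
           fraction_of_full_image = round(storage / (700 * 600), 3))

Even the \(k = 50\) approximation needs only about 15% of the storage of the full image.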