Linear Algebra for Data Science in R

DataCamp: Statistics with R

Bonnie Cooper

library( dplyr )   #data manipulation
library( MASS )    #provides ginv(); note that MASS masks dplyr::select()
library( ggplot2 ) #plotting

Introduction to Linear Algebra

Motivations

Linear algebra concerns linear operations on mathematical objects such as vectors, matrices, and tensors.

Vectors: storing univariate data.
Generating vectors in R:

#replicating values
rep( c( 1,2,3 ), 4 )
##  [1] 1 2 3 1 2 3 1 2 3 1 2 3
rep( c( 1,2,3 ), each = 4 )
##  [1] 1 1 1 1 2 2 2 2 3 3 3 3
#generating sequences of values
seq( 2,8, by = 2 )
## [1] 2 4 6 8
seq( 0, 55, by = 5 )
##  [1]  0  5 10 15 20 25 30 35 40 45 50 55
#or just specify values
z <- c( 1,5,-2,4 )
z
## [1]  1  5 -2  4
#assigning values of a vector
z[2] <- 7
z
## [1]  1  7 -2  4

Matrices: storing tables of data. A matrix can be thought of as a stack of vectors.
Generating matrices in R:

#a 3-by-4 matrix filled with 5s
matrix( 5, nrow = 3, ncol = 4 )
##      [,1] [,2] [,3] [,4]
## [1,]    5    5    5    5
## [2,]    5    5    5    5
## [3,]    5    5    5    5
#can also specify values:
matrix( c( 1,-1,2,3,2,-2 ), nrow =2, ncol = 3, byrow = TRUE )
##      [,1] [,2] [,3]
## [1,]    1   -1    2
## [2,]    3    2   -2
Z <- matrix( c( 1,-1,2,3,2,-2 ), nrow =2, ncol = 3, byrow = FALSE )
Z
##      [,1] [,2] [,3]
## [1,]    1    2    2
## [2,]   -1    3   -2

Values can also be updated by assignment:

Z[ 2,1 ] <- 100
Z
##      [,1] [,2] [,3]
## [1,]    1    2    2
## [2,]  100    3   -2
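
dim() reports a matrix’s dimensions as (rows, columns), which will matter for the multiplication rules below:

dim( Z )
## [1] 2 3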

Algebra of Vectors:

x <- seq( 1, 7, by = 1)
y <- seq( 2, 14, by = 2 )
z <- c( 1, 1, 2 )
# Add x to y and print
print(x + y)
## [1]  3  6  9 12 15 18 21
# Multiply z by 2 and print
print(2 * z)
## [1] 2 2 4
# Multiply x and y by each other and print
print(x * y)
## [1]  2  8 18 32 50 72 98
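
Note that * multiplies element-wise. The dot (inner) product, shown here as a small supplementary example, can be computed with sum() or with the matrix product operator %*%:

# Dot product of x and y: the sum of the element-wise products
sum( x * y )
## [1] 280
# %*% applied to two vectors returns the same value as a 1x1 matrix
x %*% y
##      [,1]
## [1,]  280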

Creating Matrices

# Create a matrix of all 1's and all 2's that are 2 by 3 and 3 by 2, respectively
matrix(1, nrow = 2, ncol = 3)
##      [,1] [,2] [,3]
## [1,]    1    1    1
## [2,]    1    1    1
print(matrix(2, nrow = 3, ncol = 2))
##      [,1] [,2]
## [1,]    2    2
## [2,]    2    2
## [3,]    2    2
A <- matrix( 1, nrow = 2, ncol = 2 )
# Create a matrix B; the second assignment below overwrites the first, changing the byrow designation
B <- matrix(c(1, 2, 3, 2), nrow = 2, ncol = 2, byrow = FALSE)
B <- matrix(c(1, 2, 3, 2), nrow = 2, ncol = 2, byrow = TRUE)

# Add A to the previously-created matrix B
A + B
##      [,1] [,2]
## [1,]    2    3
## [2,]    4    3

Matrix-Vector Operations

Multiplying an \((n \times m)\) matrix by an \((m \times 1)\) vector produces an \((n \times 1)\) vector.

Consider the following matrix:

A <- matrix(c(1, 2, 3, -1, 0, 3), nrow = 2, ncol = 3, byrow = TRUE)
A
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]   -1    0    3

This is a 2x3 matrix; therefore, it can be multiplied by a 3x1 vector.
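
For instance (a small added example), multiplying \(A\) by the vector \((1,2,3)^T\):

A %*% c( 1, 2, 3 )
##      [,1]
## [1,]   14
## [2,]    8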

Matrix-Matrix Calculations

Multiplying an \((n \times m)\) matrix by an \((m \times o)\) matrix produces an \((n \times o)\) matrix.
Matrix multiplication is not commutative (order matters). Also, the matrix product %*% is not the same as element-wise multiplication with *.

A <- matrix( c( 1,2,2,1 ), nrow = 2, ncol = 2, byrow = TRUE )
A
##      [,1] [,2]
## [1,]    1    2
## [2,]    2    1
I <- diag( 2 )
I
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1
I%*%A
##      [,1] [,2]
## [1,]    1    2
## [2,]    2    1
A%*%I
##      [,1] [,2]
## [1,]    1    2
## [2,]    2    1

Multiplying by the identity matrix returns the input unchanged: \(IA = AI = A\).

Some important matrix concepts:

  1. Square matrices
  2. The matrix inverse
  3. Singular matrices
  4. Diagonal and triangular matrices

A <- matrix( c( 1,2,-1,2 ), nrow = 2, ncol = 2, byrow = TRUE )

# Take the inverse of the 2 by 2 identity matrix
solve(diag( 2 ))
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1
# Take the inverse of the matrix A
Ainv <- solve(A)

# Multiply A inverse by A
Ainv%*%A
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1
# Multiply A by its inverse
A%*%Ainv
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1
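
A singular matrix, by contrast, has no inverse. A minimal illustration (not from the course) using a matrix whose second row is twice its first:

# The rows are linearly dependent, so the determinant is 0
S <- matrix( c( 1, 2, 2, 4 ), nrow = 2, ncol = 2, byrow = TRUE )
det( S )
## [1] 0
# solve( S ) would throw an error: the system is exactly singular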

Matrix-Vector Equations

Motivation for Solving Matrix-Vector Equations

Can one vector be described as a linear combination of other vectors? \[A\vec{x} = \vec{b}\] Can the vector \(\vec{b}\) be described as a linear combination of the columns of \(A\)?

The Massey Matrix:

\[\left( \begin{array}{ccccc} 4 & -1 & -1 & -1 & -1 \\ -1 & 4 & -1 & -1 & -1 \\ -1 & -1 & 4 & -1 & -1 \\ -1 & -1 & -1 & 4 & -1 \\ -1 & -1 & -1 & -1 & 4 \end{array} \right) \left( \begin{array}{c} r_{JH} \\ r_{FM} \\ r_G \\ r_D \\ r_{McD} \end{array} \right) = \left( \begin{array}{c} 103 \\ 28 \\ 15 \\ -40 \\ -106 \end{array} \right)\]

The ratings of the teams are the coefficients that describe the linear combination of the rows of the Massey matrix that produces the net point differential for each team.

Let’s have a look using some real data:

url <- 'https://assets.datacamp.com/production/repositories/2654/datasets/6bfadc8a2147bddbbaedafc8e21b8576cb4364ce/WNBA_Data_2017_M.csv'
WNBA_df <- read.csv( url )
WNBA_mmat <- data.matrix( WNBA_df )
url <- 'https://assets.datacamp.com/production/repositories/2654/datasets/4e20e9adfd6514bd5b1bfb1464cd6da9fbbadfe9/WNBA_Data_2017_f.csv'
WNBA_pd <- read.csv( url )
# Print the Massey Matrix M
print( WNBA_mmat )
##       Atlanta Chicago Connecticut Dallas Indiana Los.Angeles Minnesota New.York
##  [1,]      33      -4          -2     -3      -3          -3        -3       -3
##  [2,]      -4      33          -3     -3      -3          -3        -2       -3
##  [3,]      -2      -3          34     -3      -3          -3        -3       -4
##  [4,]      -3      -3          -3     34      -3          -4        -3       -3
##  [5,]      -3      -3          -3     -3      33          -3        -3       -3
##  [6,]      -3      -3          -3     -4      -3          41        -8       -3
##  [7,]      -3      -2          -3     -3      -3          -8        41       -3
##  [8,]      -3      -3          -4     -3      -3          -3        -3       34
##  [9,]      -3      -3          -4     -2      -3          -6        -4       -3
## [10,]      -3      -3          -3     -3      -3          -3        -3       -2
## [11,]      -3      -3          -3     -3      -2          -2        -3       -3
## [12,]      -3      -3          -3     -4      -4          -3        -6       -4
## [13,]       1       1           1      1       1           1         1        1
##       Phoenix San.Antonio Seattle Washington WNBA
##  [1,]      -3          -3      -3         -3   -1
##  [2,]      -3          -3      -3         -3   -1
##  [3,]      -4          -3      -3         -3   -1
##  [4,]      -2          -3      -3         -4   -1
##  [5,]      -3          -3      -2         -4   -1
##  [6,]      -6          -3      -2         -3   -1
##  [7,]      -4          -3      -3         -6   -1
##  [8,]      -3          -2      -3         -4   -1
##  [9,]      38          -3      -4         -3   -1
## [10,]      -3          32      -4         -2   -1
## [11,]      -4          -4      33         -3   -1
## [12,]      -3          -2      -3         38   -1
## [13,]       1           1       1          1    1
# Print the vector of point differentials f 
print( WNBA_pd )
##    Differential
## 1          -135
## 2          -171
## 3           152
## 4          -104
## 5          -308
## 6           292
## 7           420
## 8            83
## 9            -4
## 10         -213
## 11           -5
## 12           -7
## 13            0
# Find the sum of the first column of M
sum(WNBA_mmat[,1])
## [1] 1
# Find the sum of the vector f
sum(WNBA_pd)
## [1] 0

Matrix-Vector Equations - Some Theory

What conditions are needed for Matrix-Vector equations to have a unique solution?
If \(A\) is an \(n \times n\) square matrix, then the following conditions are equivalent and imply a unique solution:

  • The matrix \(A\) must have an inverse
  • The determinant of \(A\) is nonzero
  • The rows and columns of \(A\) form a basis for the set of all vectors with \(n\) elements
  • the homogeneous equation \(A\vec{x}=\vec{0}\) has only the trivial solution \(\vec{x}=\vec{0}\)

Does the matrix have an inverse?

solve( A )
##      [,1]  [,2]
## [1,] 0.50 -0.50
## [2,] 0.25  0.25

Is the determinant nonzero?

det( A )
## [1] 4

Try it with the WNBA Massey Matrix:

solve( WNBA_mmat )
##                     [,1]         [,2]         [,3]         [,4]         [,5]
## Atlanta      0.032449804  0.005402927  0.003876665  0.004630004  0.004629590
## Chicago      0.005402927  0.032446789  0.004608094  0.004626913  0.004628272
## Connecticut  0.003876665  0.004608094  0.031714805  0.004613451  0.004629714
## Dallas       0.004630004  0.004626913  0.004613451  0.031707219  0.004649172
## Indiana      0.004629590  0.004628272  0.004629714  0.004649172  0.032447936
## Los.Angeles  0.004626242  0.004554829  0.004676789  0.005214940  0.004652111
## Minnesota    0.004611109  0.003985203  0.004651940  0.004727810  0.004678479
## New.York     0.004609212  0.004627729  0.005362761  0.004647832  0.004649262
## Phoenix      0.004610546  0.004608018  0.005295038  0.004013187  0.004613089
## San.Antonio  0.004630254  0.004631081  0.004608596  0.004609009  0.004587382
## Seattle      0.004629212  0.004631185  0.004646217  0.004595132  0.003854641
## Washington   0.004627769  0.004582295  0.004649264  0.005298666  0.005313685
## WNBA        -0.083333333 -0.083333333 -0.083333333 -0.083333333 -0.083333333
##                     [,6]         [,7]         [,8]         [,9]        [,10]
## Atlanta      0.004626242  0.004611109  0.004609212  0.004610546  0.004630254
## Chicago      0.004554829  0.003985203  0.004627729  0.004608018  0.004631081
## Connecticut  0.004676789  0.004651940  0.005362761  0.005295038  0.004608596
## Dallas       0.005214940  0.004727810  0.004647832  0.004013187  0.004609009
## Indiana      0.004652111  0.004678479  0.004649262  0.004613089  0.004587382
## Los.Angeles  0.027807608  0.007319076  0.004637275  0.006363490  0.004606288
## Minnesota    0.007319076  0.027810474  0.004677632  0.005388578  0.004578013
## New.York     0.004637275  0.004677632  0.031716432  0.004648253  0.003835528
## Phoenix      0.006363490  0.005388578  0.004648253  0.029212019  0.004646110
## San.Antonio  0.004606288  0.004578013  0.003835528  0.004646110  0.033267202
## Seattle      0.004032687  0.004573214  0.004607331  0.005265228  0.005427397
## Washington   0.004841998  0.006331805  0.005314087  0.004669776  0.003906474
## WNBA        -0.083333333 -0.083333333 -0.083333333 -0.083333333 -0.083333333
##                    [,11]        [,12]        [,13]
## Atlanta      0.004629212  0.004627769 8.333333e-02
## Chicago      0.004631185  0.004582295 8.333333e-02
## Connecticut  0.004646217  0.004649264 8.333333e-02
## Dallas       0.004595132  0.005298666 8.333333e-02
## Indiana      0.003854641  0.005313685 8.333333e-02
## Los.Angeles  0.004032687  0.004841998 8.333333e-02
## Minnesota    0.004573214  0.006331805 8.333333e-02
## New.York     0.004607331  0.005314087 8.333333e-02
## Phoenix      0.005265228  0.004669776 8.333333e-02
## San.Antonio  0.005427397  0.003906474 8.333333e-02
## Seattle      0.032485332  0.004585756 8.333333e-02
## Washington   0.004585756  0.029211757 8.333333e-02
## WNBA        -0.083333333 -0.083333333 2.467162e-17

What is the determinant?

det( WNBA_mmat )
## [1] 3.118587e+18

Solving Matrix-Vector Equations

It’s a whole lot simpler with R:

A <- matrix( c( 1,-2,0,4 ), nrow = 2, ncol = 2, byrow = TRUE )
b <- matrix( c( 1,-2 ), nrow = 2, ncol = 1, byrow = TRUE )
x <- solve( A )%*%b
x
##      [,1]
## [1,]  0.0
## [2,] -0.5

Check the solution: does \(A\vec{x}\) give back \(\vec{b}\)? (Exact equality happens to hold here; in general, compare floating-point results with all.equal().)

res <- A%*%x
b == res
##      [,1]
## [1,] TRUE
## [2,] TRUE
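
As an aside, solve(A, b) solves the system directly without explicitly forming the inverse; this is generally faster and more numerically stable than solve(A) %*% b:

solve( A, b )
##      [,1]
## [1,]  0.0
## [2,] -0.5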

Back to the 2017 WNBA ratings:

#the 13th row of the Massey matrix is all ones; setting its differential to 0 constrains the ratings to sum to zero
WNBA_pd$Differential[13] <- 0

#now to solve for the ratings vector
# Solve for r and rename column
ratings <- solve(WNBA_mmat)%*%WNBA_pd$Differential
colnames(ratings) <- "Rating"
ratings_df <- data.frame( ratings )

# Print the first 5 rows of the ratings
head( ratings_df,5 )
##                Rating
## Atlanta     -4.012938
## Chicago     -5.156260
## Connecticut  4.309525
## Dallas      -2.608129
## Indiana     -8.532958

Which team is the best?

head( arrange(ratings_df, -Rating), 5 )
##                Rating
## Minnesota   10.612415
## Los.Angeles  7.850327
## Connecticut  4.309525
## New.York     2.541565
## Phoenix      0.897911

Other Considerations for Matrix-Vector Equations

Options for non-square matrices:

  • Row Reduction
  • Least Squares
  • Singular Value Decomposition
  • Generalized / Pseudo-Inverse (when not invertible)
#ginv() to find the generalized inverse
A <- matrix( c( 2, 3, -1, 4, 1, 7 ), nrow = 3, ncol = 2, byrow = TRUE )
ginv( A )
##           [,1]        [,2]       [,3]
## [1,] 0.3333333 -0.30303030 0.03030303
## [2,] 0.0000000  0.09090909 0.09090909

You can think of ginv() as serving the same purpose as solve() for a matrix with no inverse.

ginv( A )%*%A
##      [,1]          [,2]
## [1,]    1 -4.996004e-16
## [2,]    0  1.000000e+00

That’s the identity matrix, up to floating-point error.

A%*%ginv( A )
##            [,1]       [,2]      [,3]
## [1,]  0.6666667 -0.3333333 0.3333333
## [2,] -0.3333333  0.6666667 0.3333333
## [3,]  0.3333333  0.3333333 0.6666667

For a non-square matrix, \(A\) times its generalized inverse is not the identity in general; it is a projection onto the column space of \(A\). Now to solve the WNBA ratings:

# Find the rating vector the conventional way
ratings <- solve(WNBA_mmat)%*%WNBA_pd$Differential
colnames(ratings) <- "Rating"
ratings_df <- data.frame( ratings )

# Find the rating vector using ginv
r <- ginv(WNBA_mmat) %*% WNBA_pd$Differential
colnames(r) <- "Rating"
print(r)
##              Rating
##  [1,] -4.012938e+00
##  [2,] -5.156260e+00
##  [3,]  4.309525e+00
##  [4,] -2.608129e+00
##  [5,] -8.532958e+00
##  [6,]  7.850327e+00
##  [7,]  1.061241e+01
##  [8,]  2.541565e+00
##  [9,]  8.979110e-01
## [10,] -6.181574e+00
## [11,] -2.666953e-01
## [12,]  5.468121e-01
## [13,] -5.218048e-14
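
The two approaches should agree up to floating-point error; a quick added check:

all.equal( as.vector( ratings ), as.vector( r ) )
## [1] TRUE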

Eigenvalues and Eigenvectors

Intro to Eigenvalues and Eigenvectors

Eigenvector analysis takes a large collection of objects and extracts a small subset of directions that approximates the data.

Matrix multiplication can represent rotations, reflections, dilations, contractions, projections, and combinations of these. Eigenvalue/eigenvector operations can represent complex transformations by decomposing them into a sum of simpler operations.

Definition of Eigenvalues and Eigenvectors

For a matrix A, the scalar \(\lambda\) is an eigenvalue of A with associated eigenvector \(\hat{v} \neq \hat{0}\) if the following is true: \[A\hat{v}= \lambda \hat{v}\] In other words: the matrix multiplication \(A\hat{v}\), a matrix-vector operation, produces the same vector as \(\lambda \hat{v}\) a scalar-vector operation.

An example:

A <- matrix( c( 2, 3, 0, 1 ), nrow = 2, ncol = 2, byrow = TRUE )
A
##      [,1] [,2]
## [1,]    2    3
## [2,]    0    1
A %*% c(1,0) == 2*c(1,0)
##      [,1]
## [1,] TRUE
## [2,] TRUE

We can say that 2 is an eigenvalue of \(A\) with the associated eigenvector \(\hat{v} = (1,0)^T\).
These two together are called an eigenpair of the matrix \(A\).

Eigenvectors are all about direction and not magnitude. An eigenvector of \(A\) is a vector that points in its own (or complete opposite) direction upon multiplication by \(A\).

A <- matrix( c( -1, 2, 4, 0, 7, 12, 0, 0, -4 ), nrow = 3, ncol = 3, byrow = TRUE )
# Show that 7 is an eigenvalue for A (the tiny nonzero residuals below reflect the rounded eigenvector entries)
A%*%c(0.2425356, 0.9701425, 0) - 7*c(0.2425356, 0.9701425, 0)
##       [,1]
## [1,] 2e-07
## [2,] 0e+00
## [3,] 0e+00
# Show that -4 is an eigenvalue for A
A%*%c(-0.3789810, -0.6821657, 0.6253186) - -4*c(-0.3789810, -0.6821657, 0.6253186)
##               [,1]
## [1,] -2.220446e-16
## [2,]  5.000000e-07
## [3,]  0.000000e+00
# Show that -1 is an eigenvalue for A
A%*%c(1,0,0) - -1*c(1,0,0)
##      [,1]
## [1,]    0
## [2,]    0
## [3,]    0
# Show that double an eigenvector is still an eigenvector
A%*%((2)*c(0.2425356, 0.9701425, 0)) - 7*(2)*c(0.2425356, 0.9701425, 0)
##       [,1]
## [1,] 4e-07
## [2,] 0e+00
## [3,] 0e+00
# Show half of an eigenvector is still an eigenvector
A%*%((0.5)*c(0.2425356, 0.9701425, 0)) - 7*(0.5)*c(0.2425356, 0.9701425, 0)
##       [,1]
## [1,] 1e-07
## [2,] 0e+00
## [3,] 0e+00

Computing Eigenvalues and Eigenvectors in R

Solving Eigenvalue/Eigenvector Problems

Properties:

  • an n by n matrix has up to n eigenvalues
  • even if \(A\) is a matrix of all real numbers, some or even all of its eigenvalues could be complex numbers
  • all complex eigenvalues must come in conjugate pairs (see the rotation example below)

Eigenvalue/eigenvector problems are very simple to solve in R:

E <- eigen( A )
E
## eigen() decomposition
## $values
## [1]  7 -4 -1
## 
## $vectors
##           [,1]       [,2] [,3]
## [1,] 0.2425356 -0.3789810    1
## [2,] 0.9701425 -0.6821657    0
## [3,] 0.0000000  0.6253186    0
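As an added illustration of the conjugate-pair property, a 90-degree rotation matrix points no real vector in its own direction, so its eigenvalues are the complex conjugate pair \(\pm i\):

# A 90-degree rotation matrix has no real eigenvectors
R <- matrix( c( 0, 1, -1, 0 ), nrow = 2, ncol = 2, byrow = TRUE )
eigen( R )$values
## [1] 0+1i 0-1i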
A <- matrix( c( 1, 2, 1, 1 ), nrow = 2, ncol = 2, byrow = TRUE )

# Compute the eigendecomposition of A and store it in Lambda
Lambda <- eigen(A)

# Print eigenvalues
print(Lambda$values[1])
## [1] 2.414214
print(Lambda$values[2])
## [1] -0.4142136
# Verify that these numbers satisfy the conditions of being an eigenvalue
det(Lambda$values[1]*diag(2) - A)
## [1] -3.140185e-16
det(Lambda$values[2]*diag(2) - A)
## [1] -3.140185e-16
# Print eigenvectors
print(Lambda$vectors[, 1])
## [1] 0.8164966 0.5773503
print(Lambda$vectors[, 2])
## [1] -0.8164966  0.5773503
# Verify that these eigenvectors & their associated eigenvalues satisfy Av - lambda v = 0
Lambda$values[1]*Lambda$vectors[, 1] - A%*%Lambda$vectors[, 1]
##      [,1]
## [1,]    0
## [2,]    0
Lambda$values[2]*Lambda$vectors[, 2] - A%*%Lambda$vectors[, 2]
##               [,1]
## [1,] -1.110223e-16
## [2,]  8.326673e-17

Some More on Eigenvalues and Eigenvectors

If all the eigenvalues are distinct, then the eigenvectors form a basis for the set of n-dimensional vectors.

Eigenpairs turn matrix multiplication into a linear combination of scalar multiplications. Write \(\vec{x}\) in the eigenvector basis, \(\vec{x} = c_1\vec{v}_1 + c_2\vec{v}_2 + \cdots + c_n\vec{v}_n\); applying the matrix \(A\) and using the fact that \(A\vec{v}_j = \lambda_j \vec{v}_j\) gives: \[A\vec{x} = c_1\lambda_1 \vec{v}_1 + c_2\lambda_2 \vec{v}_2 + \cdots + c_n\lambda_n \vec{v}_n\]
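
A quick numerical check of this identity (an added example, using the 2-by-2 matrix \(A\) from the previous exercise and an arbitrary \(\vec{x}\)):

A <- matrix( c( 1, 2, 1, 1 ), nrow = 2, ncol = 2, byrow = TRUE )
Lambda <- eigen( A )
x <- c( 1, 3 )
# Coordinates of x in the eigenvector basis
cvec <- solve( Lambda$vectors, x )
# Compare A %*% x with the weighted sum of eigenvectors
rhs <- cvec[1]*Lambda$values[1]*Lambda$vectors[, 1] +
  cvec[2]*Lambda$values[2]*Lambda$vectors[, 2]
all.equal( as.vector( A %*% x ), rhs )
## [1] TRUE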

Markov Model for Allele Frequency

A <- matrix( c( 0.98, 0.005, 0.005, 0.01,
                0.005, 0.98, 0.01, 0.005,
                0.005, 0.01, 0.98, 0.005,
                0.01, 0.005, 0.005, 0.98 ), nrow = 4, ncol = 4, byrow = TRUE )
A
##       [,1]  [,2]  [,3]  [,4]
## [1,] 0.980 0.005 0.005 0.010
## [2,] 0.005 0.980 0.010 0.005
## [3,] 0.005 0.010 0.980 0.005
## [4,] 0.010 0.005 0.005 0.980
# This code iterates the mutation process 1000 times
x <- c(1, 0, 0, 0)
for (j in 1:1000) {x <- A%*%x}

# Print x
print(x)
##      [,1]
## [1,] 0.25
## [2,] 0.25
## [3,] 0.25
## [4,] 0.25
# Compute the eigendecomposition and scale the first eigenvector of A so its entries sum to 1
Lambda <- eigen(A)
v1 <- Lambda$vectors[, 1]/sum(Lambda$vectors[, 1])

# Print v1
print(v1)
## [1] 0.25 0.25 0.25 0.25
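
The iteration converges to the eigenvector whose eigenvalue is 1, the leading eigenvalue of a stochastic matrix; a quick added check:

# The leading eigenvalue of the transition matrix is 1
Lambda$values[1]
## [1] 1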

Principal Component Analysis

Intro to the Idea of PCA

url <- 'https://assets.datacamp.com/production/repositories/2654/datasets/760dae913f682ba6b2758207280138662ddedc0d/DataCampCombine.csv'
combine <- read.csv( url )
glimpse( combine )
## Rows: 2,885
## Columns: 13
## $ player     <chr> "Jaire Alexander", "Brian Allen", "Mark Andrews", "Troy Ap…
## $ position   <chr> "CB", "C", "TE", "S", "EDGE", "DE", "WR", "CB", "ILB", "RB…
## $ school     <chr> "Louisville", "Michigan St.", "Oklahoma", "Penn St.", "Kan…
## $ year       <int> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018…
## $ height     <int> 71, 73, 77, 74, 76, 78, 76, 72, 73, 73, 75, 76, 80, 72, 76…
## $ weight     <int> 192, 298, 256, 198, 257, 262, 216, 185, 248, 228, 218, 296…
## $ forty      <dbl> 4.38, 5.34, 4.67, 4.34, 4.87, 4.60, 4.62, 4.36, 4.59, 4.46…
## $ vertical   <dbl> 35.0, 26.5, 31.0, 41.0, 30.0, 38.5, 34.0, 31.5, 36.0, 33.5…
## $ bench      <int> 14, 27, 17, 16, 20, 18, 13, 13, 26, 15, 16, 31, 14, 14, 30…
## $ broad_jump <int> 127, 99, 113, 131, 118, 128, 121, 119, 124, 122, 112, 101,…
## $ three_cone <dbl> 6.71, 7.81, 7.34, 6.56, 7.12, 7.53, 7.07, 6.93, 6.90, 6.91…
## $ shuttle    <dbl> 3.98, 4.71, 4.38, 4.03, 4.23, 4.48, 4.25, 4.40, 4.36, 4.35…
## $ drafted    <chr> "Green Bay Packers / 1st / 18th pick / 2018", "Los Angeles…
ggplot( combine, aes( x = shuttle, y = forty ) ) +
  geom_point( )

Removing redundant features from data: highly correlated features add redundancy, which detracts from the explanatory value of the model.

Principal Component Analysis

  • One of the more useful methods from applied linear algebra
  • Non-parametric way of extracting meaningful information from confusing data sets
  • Uncovers hidden, low-dimensional structures that underlie your data
  • These structures are more easily visualized and are often interpretable to content experts
ggplot( combine, aes( x = three_cone, y = forty ) ) +
  geom_point( )

# Find the correlation between variables forty and three_cone
cor(combine$forty, combine$three_cone)
## [1] 0.8315171
ggplot( combine, aes( x = vertical, y = broad_jump ) ) +
  geom_point( )


The Linear Algebra Behind PCA

  • if the columns of the matrix \(A\) are centered to mean zero, the ith element of the diagonal of \(\frac{A^TA}{n-1}\) is the variance of the ith column of the matrix
  • the eigenvalues of \(\frac{A^TA}{n-1}\) are real, and the corresponding eigenvectors are orthogonal (they point in mutually perpendicular directions)
  • the total variance of the data set is the sum of the eigenvalues of \(\frac{A^TA}{n-1}\)
  • these eigenvectors are called the principal components of the data set in the matrix \(A\)
  • each eigenvalue measures how much of the total variance is explained by the direction of its eigenvector; if an eigenvalue or a subset of eigenvalues explains a significant amount of the total variance, there is an opportunity for dimension reduction
# Extract columns 5-12 of combine
A <- combine[, 5:12]

# Make A into a matrix
A <- as.matrix(A)

#find the means of each column
meanA <- colMeans( A )

# Subtract the mean of each column
A <- sweep( A, 2, meanA )
head(A)
##            height     weight       forty  vertical     bench  broad_jump
## [1,] -2.992720971 -60.087348 -0.43158406  2.401733 -7.091854  13.8097054
## [2,] -0.992720971  45.912652  0.52841594 -6.098267  5.908146 -14.1902946
## [3,]  3.007279029   3.912652 -0.14158406 -1.598267 -4.091854  -0.1902946
## [4,]  0.007279029 -54.087348 -0.47158406  8.401733 -5.091854  17.8097054
## [5,]  2.007279029   4.912652  0.05841594 -2.598267 -1.091854   4.8097054
## [6,]  4.007279029   9.912652 -0.21158406  5.901733 -3.091854  14.8097054
##       three_cone     shuttle
## [1,] -0.59571924 -0.42854766
## [2,]  0.50428076  0.30145234
## [3,]  0.03428076 -0.02854766
## [4,] -0.74571924 -0.37854766
## [5,] -0.18571924 -0.17854766
## [6,]  0.22428076  0.07145234
# Create the sample covariance matrix B = t(A) %*% A / (n - 1)
B <- t(A)%*%A/(nrow(A) - 1)

# Compare 1st element of the 1st column of B to the variance of the first column of A
B[1,1]
## [1] 7.159794
var(A[, 1])
## [1] 7.159794
# Compare B[1,2] and B[2,1] to the covariance between the first two columns of A
B[1, 2]
## [1] 90.78808
B[2, 1]
## [1] 90.78808
cov(A[, 1], A[, 2])
## [1] 90.78808
# Find the eigenvalues and eigenvectors of B
V <- eigen( B )

# Print eigenvalues
V$values
## [1] 2.187628e+03 4.403246e+01 2.219205e+01 5.267129e+00 2.699702e+00
## [6] 6.317016e-02 1.480866e-02 1.307283e-02
# Print eigenvectors
V$vectors
##              [,1]         [,2]          [,3]          [,4]         [,5]
## [1,] -0.042047079  0.061885367  0.1454490039 -0.1040556410  0.980792060
## [2,] -0.980711529  0.130912788  0.1270100265  0.0193388930 -0.066908382
## [3,] -0.006112061 -0.012525260  0.0025260713 -0.0021291637 -0.004096693
## [4,]  0.062926466  0.333556369  0.0398922845  0.9366594549  0.074901137
## [5,] -0.088291423  0.313533433 -0.9363461471 -0.0745692157  0.107188391
## [6,]  0.156742686  0.876925849  0.2904565302 -0.3252903706 -0.126494599
## [7,] -0.007468520 -0.014691994  0.0009057581  0.0003320888 -0.020902644
## [8,] -0.004518826 -0.009863931  0.0023111814 -0.0094052914 -0.004010629
##              [,6]          [,7]          [,8]
## [1,] -0.020679696 -6.155636e-03  0.0008055445
## [2,]  0.008423587  6.988341e-04  0.0036087841
## [3,] -0.152469227 -2.539868e-01 -0.9549983725
## [4,] -0.012214516  7.045063e-03 -0.0070051256
## [5,] -0.009167322 -8.604309e-05 -0.0048308793
## [6,] -0.013753112 -2.187651e-03 -0.0076907609
## [7,] -0.894560357 -3.743559e-01  0.2427137770
## [8,] -0.419039274  8.917710e-01 -0.1700673446
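As stated in the list above, the total variance of the data set equals the sum of the eigenvalues; a quick added check comparing it with the trace of B:

# Total variance two ways: trace of B and sum of the eigenvalues of B
sum( diag( B ) )
## [1] 2261.91
sum( V$values )
## [1] 2261.91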
# Roughly how much of the variability in the dataset can be explained by the first principal component?

V$values[1] / sum( V$values )
## [1] 0.9671594

A great deal of the variability in the athletic profiles of future NFL players can be attributed to one linear combination of the data!

Performing PCA in R

combine_pca <- combine %>% 
  dplyr::select( height:shuttle ) %>% 
  prcomp()

summary( combine_pca )
## Importance of components:
##                            PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     46.7721 6.63570 4.71084 2.29502 1.64308 0.25134 0.12169
## Proportion of Variance  0.9672 0.01947 0.00981 0.00233 0.00119 0.00003 0.00001
## Cumulative Proportion   0.9672 0.98663 0.99644 0.99877 0.99996 0.99999 0.99999
##                            PC8
## Standard deviation     0.11434
## Proportion of Variance 0.00001
## Cumulative Proportion  1.00000

Extract the PCs of interest. Here we take the first two, which account for about 98.7% of the variance. (Note that prcomp() centers but, by default, does not scale the data, which is why the high-variance weight column dominates PC1.)

combine_pca_res <- cbind( combine[, 1:4 ], combine_pca$x[,1:2])
head( combine_pca_res )
##              player position       school year        PC1        PC2
## 1   Jaire Alexander       CB   Louisville 2018 -62.005067  -2.654645
## 2       Brian Allen        C Michigan St. 2018  48.123290   6.693433
## 3      Mark Andrews       TE     Oklahoma 2018   3.732016   1.283046
## 4         Troy Apke        S     Penn St. 2018 -56.823742  -9.764098
## 5 Dorance Armstrong     EDGE       Kansas 2018   4.213670  -3.779862
## 6         Ade Aruna       DE       Tulane 2018   6.924978 -15.530509
ggplot( combine_pca_res, aes( x = PC1, y = PC2, color = position ) ) +
  geom_point()

Subset the data for a certain position

# Subset combine only to "WR"
combine_WR <- subset(combine, position == "WR")

# Scale columns 5-12 of combine_WR
B <- scale(combine_WR[, 5:12])

# Print the first 6 rows of the data
head( B )
##        height      weight       forty   vertical      bench  broad_jump
## 7   1.4022982  0.88324903  1.20674474 -0.3430843 -0.3223377  0.07414249
## 17  0.5575402 -0.09700717 -0.80129388 -0.4969965 -0.7938424 -0.95388361
## 18  0.9799192  1.58343202  0.88968601  1.0421255  0.8564239  1.61618163
## 25  0.9799192  1.16332222  1.41811723 -1.5743819 -0.7938424 -1.29655897
## 29 -1.1319757 -1.56739147 -0.80129388 -0.1891721 -0.0865854 -1.29655897
## 46  0.1351613  0.11304773  0.04419607  0.2725645 -1.0295947  0.24548017
##      three_cone     shuttle
## 7   0.712845019  0.02833449
## 17 -1.098542478  0.84141123
## 18 -1.853287268 -1.46230619
## 25 -1.148858797  0.50262926
## 29  0.008416548 -0.64922946
## 46  0.109049187  0.84141123
# Summarize the principal component analysis
summary(prcomp( B ))
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.5425 1.4255 1.0509 0.9603 0.77542 0.63867 0.59792
## Proportion of Variance 0.2974 0.2540 0.1380 0.1153 0.07516 0.05099 0.04469
## Cumulative Proportion  0.2974 0.5514 0.6894 0.8047 0.87987 0.93085 0.97554
##                            PC8
## Standard deviation     0.44235
## Proportion of Variance 0.02446
## Cumulative Proportion  1.00000
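
Equivalently (a supplementary note), prcomp() can center and scale the data itself through its scale. argument, reproducing the summary above without the manual scale() step:

# Equivalent: let prcomp() do the centering and scaling
summary( prcomp( combine_WR[, 5:12], scale. = TRUE ) )
# (output matches the summary above)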