library(igraph)
library(tidyverse)
library(stringr)
Part 1 - PageRank
Step 1
Form the A matrix. Then, introduce decay and form the B matrix as we did in the course notes.
A <- matrix(c(0,1/2,1/2,0,0,0,
              1/6,1/6,1/6,1/6,1/6,1/6, # replaced row of 0s with uniform probability
              1/3,1/3,0,0,1/3,0,
              0,0,0,0,1/2,1/2,
              0,0,0,1/2,0,1/2,
              0,0,0,1,0,0), nrow=6, byrow=TRUE)
Creating the A matrix exactly as it was done in the lecture would leave a "dangling" row of zeros in row 2. This would prevent our power iteration method from converging. To remedy this, we use 1/6 for each element in that row, meaning every position is equally likely to be selected from that node.
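As a minimal sketch (the row above was already replaced by hand, so this finds nothing here, but it generalizes to larger graphs), dangling rows can also be detected and patched programmatically:
dangling <- which(rowSums(A) == 0)  # rows with no outgoing links
A[dangling, ] <- 1 / ncol(A)        # replace each dangling row with a uniform distribution
rowSums(A)                          # every row should now sum to 1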
n <- 6
decay <- 0.85
B <- decay * A + ((1-decay)/n)
print(B)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0250000 0.4500000 0.4500000 0.0250000 0.0250000 0.0250000
## [2,] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [3,] 0.3083333 0.3083333 0.0250000 0.0250000 0.3083333 0.0250000
## [4,] 0.0250000 0.0250000 0.0250000 0.0250000 0.4500000 0.4500000
## [5,] 0.0250000 0.0250000 0.0250000 0.4500000 0.0250000 0.4500000
## [6,] 0.0250000 0.0250000 0.0250000 0.8750000 0.0250000 0.0250000
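As a quick sanity check (a sketch, not part of the original write-up), B should remain row-stochastic after adding the teleportation term, since 0.85 + 6 * 0.025 = 1:
rowSums(B)  # each row of B should sum to 1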
Step 2
Start with a uniform rank vector r and perform power iterations on B until convergence. That is, compute the solution r = B^n × r. Attempt this for a sufficiently large n so that r actually converges. (5 Points)
r <- matrix(c(1/6,1/6,1/6,1/6,1/6,1/6), nrow=1, ncol=6)
power_iteration <- function(decayMat, r, iter) {
index <- c(0)
r1 <- c(r[1]) #one element vector with first element of r
r2 <- c(r[1]) #one element vector with first element of r
r3 <- c(r[1]) #one element vector with first element of r
r4 <- c(r[1]) #one element vector with first element of r
r5 <- c(r[1]) #one element vector with first element of r
power_iteration <- r #set new r equal to starting r
newDecayMat <- decayMat #set new decayMat equal to original decayMat
for (i in 1:iter) { # for each iteration from 1 to iter (30 in the call below)
newDecayMat <- newDecayMat%*%decayMat
power_iteration <- r %*% newDecayMat
r1 <- append(r1, power_iteration[1])
r2 <- append(r2, power_iteration[2])
r3 <- append(r3, power_iteration[3])
r4 <- append(r4, power_iteration[4])
r5 <- append(r5, power_iteration[5])
index <- append(index, i)
}
plot(r1, type = "o",col = "red", ylim=c(-.2,.4))
lines(r2, type = "o", col = "blue")
lines(r3, type = "o", col = "green")
lines(r4, type = "o", col = "yellow")
lines(r5, type = "o", col = "black")
ret = list(power_iteration, newDecayMat)
return(ret)
}
response = power_iteration(B, r, 30)
Based on the above visualization, we can see that convergence occurs at around 15 iterations.
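To quantify convergence rather than eyeball it from the plot, a short sketch (using the same B and starting vector; the 30-iteration count matches the call above) can track the L1 change between successive rank vectors:
r_old <- r
for (i in 1:30) {
  r_new <- r_old %*% B  # one power-iteration step
  cat(sprintf("iteration %2d: L1 change = %.2e\n", i, sum(abs(r_new - r_old))))
  r_old <- r_new
}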
power_iteration_r <- response[[1]] #The new value of r
power_iteration_r
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.05170475 0.07367927 0.05741242 0.3487037 0.1999038 0.2685961
newDecayMat <- response[2] #Final iteration of decay matrix
newDecayMat
## [[1]]
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.05170476 0.07367928 0.05741243 0.3487037 0.1999038 0.2685961
## [2,] 0.05170475 0.07367927 0.05741242 0.3487037 0.1999038 0.2685961
## [3,] 0.05170475 0.07367928 0.05741242 0.3487037 0.1999038 0.2685961
## [4,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
## [5,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
## [6,] 0.05170474 0.07367926 0.05741241 0.3487037 0.1999038 0.2685961
Step 3
Compute the eigen-decomposition of B and verify that you indeed get an eigenvalue of 1 as the largest eigenvalue and that its corresponding eigenvector is the same vector that you obtained in the previous power iteration method. Further, this eigenvector has all positive entries and it sums to 1. (10 points)
ev <- eigen(t(B))
as.numeric(ev$values)
## Warning: imaginary parts discarded in coercion
## [1] 1.00000000 0.57619235 -0.42500000 -0.42500000 -0.34991524 -0.08461044
We can see from the above that the largest eigenvalue is 1.
sum(ev$vectors[,1])
## [1] 2.019902+0i
However, the associated eigenvector sums to roughly 2.02 rather than 1, since eigenvectors are only defined up to a scalar. This is corrected by rescaling the vector so its entries sum to 1 (divide each element by the sum of the vector).
eigen_r <- as.numeric(1/(sum(ev$vectors[,1]))*ev$vectors[,1])
sum(eigen_r)
## [1] 1
eigen_r
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
power_iteration_r - eigen_r
## [,1] [,2] [,3] [,4] [,5]
## [1,] 3.29466e-09 5.724804e-09 3.837685e-09 -6.620217e-09 -1.589199e-09
## [,6]
## [1,] -4.647732e-09
The power_iteration_r vector we obtained from our power_iteration function is approximately the same as the eigen_r vector we obtained from the eigendecomposition of our transition matrix.
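A hedged numerical check of that statement (a sketch; the tolerance is an arbitrary choice):
max(abs(as.vector(power_iteration_r) - eigen_r))                   # largest componentwise difference
all.equal(as.vector(power_iteration_r), eigen_r, tolerance = 1e-6) # TRUE if they agree to 1e-6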
Step 4
Use the igraph package in R and its page_rank method to compute the PageRank of the graph as given in A. Note that you don't need to apply decay. The package starts with a connected graph and applies decay internally. Verify that you do get the same PageRank vector as the two approaches above. (10 points)
#Converting to graph from adjacency matrix
webGraph = igraph::graph_from_adjacency_matrix(A,weighted = T)
#Plot the graph
plot(webGraph)
#Resultant vector
pr <- page_rank(webGraph)$vector
pr
## [1] 0.05170475 0.07367926 0.05741241 0.34870369 0.19990381 0.26859608
round(eigen_r, 5) == round(pr, 5)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
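For completeness, a sketch making the damping factor explicit: igraph's page_rank uses damping = 0.85 by default, which matches the decay used to build B above.
pr_085 <- page_rank(webGraph, damping = 0.85)$vector
round(pr_085, 5) == round(eigen_r, 5)   # expected to agree with the default result above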
Part 2 - Kaggle MNIST
Step 1-2
Go to Kaggle.com and create an account if you do not already have one. It is free.
Go to https://www.kaggle.com/c/digit-recognizer/overview, accept the rules of the competition, and download the data. You will not be required to submit work to Kaggle, but you do need the data.
Step 3
Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.) (5 points)
train_data <- read_csv("digit-recognizer/train.csv")
## Rows: 42000 Columns: 785
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (785): label, pixel0, pixel1, pixel2, pixel3, pixel4, pixel5, pixel6, pi...
labels = train_data[,1]
data <- train_data[,-1]/255
plot_array <- function(array){
arrayMat <- matrix(array, nrow=28, ncol=28) # reshape the 784 pixel values into a 28 x 28 grid
mode(arrayMat) = "numeric"                  # coerce the tibble row's list elements to numeric
image(arrayMat)                             # render the digit as an image
}
for (i in 1:10) {
plot_array(data[i,])
}
Step 4
What is the frequency distribution of the numbers in the dataset? (5 points)
table(labels)/42000
## labels
## 0 1 2 3 4 5 6
## 0.09838095 0.11152381 0.09945238 0.10359524 0.09695238 0.09035714 0.09850000
## 7 8 9
## 0.10478571 0.09673810 0.09971429
While there is some variation, each digit accounts for roughly 10% of the training dataset.
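A quick visual companion to the table (a sketch; this barplot is not part of the original output):
label_freq <- table(labels) / nrow(train_data)   # relative frequency of each digit
barplot(label_freq, xlab = "digit", ylab = "relative frequency")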
Step 5
For each number, provide the mean pixel intensity. What does this tell you? (5 points)
get_number_intensity <- function(target, labels, data){
x = data[labels==target,]
means = rowMeans(x)
return(mean(means))
}
for (i in 1:9) {
mean_intensity <- get_number_intensity(i , labels, data)
ret_string = str_interp("Pixel intensity for number ${i} is ${mean_intensity}")
print(ret_string)
}
## [1] "Pixel intensity for number 1 is 0.075972720428906"
## [1] "Pixel intensity for number 2 is 0.149415262873165"
## [1] "Pixel intensity for number 3 is 0.141657603055012"
## [1] "Pixel intensity for number 4 is 0.121212097314368"
## [1] "Pixel intensity for number 5 is 0.129231294625887"
## [1] "Pixel intensity for number 6 is 0.138730078688473"
## [1] "Pixel intensity for number 7 is 0.1147021021542"
## [1] "Pixel intensity for number 8 is 0.150981134516322"
## [1] "Pixel intensity for number 9 is 0.12281787715086"
My intuition for the mean pixel intensity is that more complex numbers, which take up a larger portion of the 28x28 matrix, will have a greater mean pixel intensity. This intuition is supported by the fact that 1, a straight line, has the smallest mean pixel intensity, and 7 has the second smallest; both are composed of simple straight strokes. Conversely, numbers like 8, 3, and 6, which are typically much curvier, have the highest mean pixel intensities.
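As a sketch, the same helper can rank all ten digits (including 0, which the loop above starts after) by mean intensity in a single pass, making the ordering described above easy to read off:
intensities <- sapply(0:9, function(d) get_number_intensity(d, labels, data))
names(intensities) <- 0:9
sort(intensities)   # digits ordered from least to most ink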
Step 6
Reduce the data by using principal components that account for 95% of the variance. How many components did you generate? Use PCA to generate all possible components (100% of the variance). How many components are possible? Why? (5 points)
train_pca <- prcomp(data)
train_pca_std <- train_pca$sdev
train_pca_cum_var <- cumsum(train_pca_std^2)/sum(train_pca_std^2)
plot(train_pca_cum_var)
which.max(train_pca_cum_var > 0.95)
## [1] 154
With 154 components we are able to account for 95% of the variance. There are 784 possible components in total, one for each pixel column in the original dataset, since PCA can produce at most as many components as there are input features.
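A minimal sketch of actually carrying out the reduction (the reduced matrix is not reused elsewhere in this report):
n_comp <- which.max(train_pca_cum_var > 0.95)   # 154, from the cumulative variance above
data_reduced <- train_pca$x[, 1:n_comp]         # PCA scores for the first 154 components
dim(data_reduced)                               # 42000 x 154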
Step 7
Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)
train_pca_rot <- train_pca$rotation
for (i in 1:10) {
plot_array(train_pca_rot[,i])
}
Above are the first 10 images generated by PCA. They appear noisy because each principal component is not an image of any single digit: it is a weighted combination of all 784 pixels chosen to capture a direction of maximum variance across every digit in the dataset, so structure from many different digits is superimposed in each image.
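One way to see that the components are building blocks rather than digit averages is to reconstruct an image from its leading component scores (a sketch; k = 50 is an arbitrary illustrative choice):
k <- 50
recon <- train_pca$x[1, 1:k] %*% t(train_pca$rotation[, 1:k]) + train_pca$center  # add back the centering prcomp removed
plot_array(recon)   # approximate reconstruction of the first training image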
Step 8
Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see? (5 points)
x = data[labels==8,]
pca_8 <- prcomp(x)
pca_8_std <- pca_8$sdev
pca_8_cum_var <- cumsum(pca_8_std^2)/sum(pca_8_std^2)
plot(pca_8_cum_var)
pca_8_rot <- pca_8$rotation
for (i in 1:10) {
plot_array(pca_8_rot[,i])
}
Step 9
An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)
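A hedged sketch of one way to complete Step 9 (not run as part of this report): fit a multinomial logistic regression with nnet::multinom on the PCA scores from Step 6, then report overall accuracy and the 10 x 10 confusion matrix. Using the 154 principal components instead of all 784 raw pixels is an assumption made to keep the fit tractable.
library(nnet)
pca_features <- as.data.frame(train_pca$x[, 1:154])   # reduced predictors from Step 6
pca_features$label <- factor(train_data$label)        # digit labels as a factor

# (154 predictors + intercept) * 10 classes = 1550 weights, so raise MaxNWts accordingly
multi_fit <- multinom(label ~ ., data = pca_features, MaxNWts = 2000, maxit = 200)

preds <- predict(multi_fit, newdata = pca_features, type = "class")
accuracy <- mean(preds == pca_features$label)                          # percent correctly identified
confusion <- table(observed = pca_features$label, predicted = preds)  # 10 x 10 confusion matrix
accuracy
confusion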