library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.3
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
dim(mtcars)
## [1] 32 11
Perform a principal component analysis, in R it is implemented as the prcomp() function.Here we set center and scale arguments to TRUE, recall from the lecture why this is important. We can try to perform PCA without scaling and centering and compare.
cars_pca <- prcomp(mtcars, center = T, scale. = T)
pca_summary <- summary(cars_pca)
print(pca_summary)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.5707 1.6280 0.79196 0.51923 0.47271 0.46000 0.3678
## Proportion of Variance 0.6008 0.2409 0.05702 0.02451 0.02031 0.01924 0.0123
## Cumulative Proportion 0.6008 0.8417 0.89873 0.92324 0.94356 0.96279 0.9751
## PC8 PC9 PC10 PC11
## Standard deviation 0.35057 0.2776 0.22811 0.1485
## Proportion of Variance 0.01117 0.0070 0.00473 0.0020
## Cumulative Proportion 0.98626 0.9933 0.99800 1.0000
Proportion of variance always sums to 1.
str(cars_pca)
## List of 5
## $ sdev : num [1:11] 2.571 1.628 0.792 0.519 0.473 ...
## $ rotation: num [1:11, 1:11] -0.363 0.374 0.368 0.33 -0.294 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:11] "mpg" "cyl" "disp" "hp" ...
## .. ..$ : chr [1:11] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:11] 20.09 6.19 230.72 146.69 3.6 ...
## ..- attr(*, "names")= chr [1:11] "mpg" "cyl" "disp" "hp" ...
## $ scale : Named num [1:11] 6.027 1.786 123.939 68.563 0.535 ...
## ..- attr(*, "names")= chr [1:11] "mpg" "cyl" "disp" "hp" ...
## $ x : num [1:32, 1:11] -0.647 -0.619 -2.736 -0.307 1.943 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. ..$ : chr [1:11] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
For plotting, (and for me personally!) a dataframe is better than a list.
pca_df <- data.frame(cars_pca$x, make = stringr::word(rownames(mtcars), 1))
head(pca_df)
## PC1 PC2 PC3 PC4 PC5
## Mazda RX4 -0.64686274 1.7081142 -0.5917309 0.11370221 -0.9455234
## Mazda RX4 Wag -0.61948315 1.5256219 -0.3763013 0.19912121 -1.0166807
## Datsun 710 -2.73562427 -0.1441501 -0.2374391 -0.24521545 0.3987623
## Hornet 4 Drive -0.30686063 -2.3258038 -0.1336213 -0.50380035 0.5492089
## Hornet Sportabout 1.94339268 -0.7425211 -1.1165366 0.07446196 0.2075157
## Valiant -0.05525342 -2.7421229 0.1612456 -0.97516743 0.2116654
## PC6 PC7 PC8 PC9 PC10
## Mazda RX4 -0.01698737 -0.42648652 -0.009631217 0.14642303 -0.06670350
## Mazda RX4 Wag -0.24172464 -0.41620046 -0.084520213 0.07452829 -0.12692766
## Datsun 710 -0.34876781 -0.60884146 0.585255765 -0.13122859 0.04573787
## Hornet 4 Drive 0.01929700 -0.04036075 -0.049583029 0.22021812 -0.06039981
## Hornet Sportabout 0.14919276 0.38350816 -0.160297757 -0.02117623 -0.05983003
## Valiant -0.24383585 -0.29464160 0.256612420 -0.03222907 -0.20165466
## PC11 make
## Mazda RX4 0.17969357 Mazda
## Mazda RX4 Wag 0.08864426 Mazda
## Datsun 710 -0.09463291 Datsun
## Hornet 4 Drive 0.14761127 Hornet
## Hornet Sportabout 0.14640690 Hornet
## Valiant 0.01954506 Valiant
ggplot(pca_df, aes(x = PC1, y = PC2, col = make)) +
geom_point(size = 3) +
labs(x = "PC1 60.08%",
y = "PC2 24.09 %",
title = "Principal components for mtcars") +
theme(legend.position = "bottom")
Or using base plotting, but it’s not nearly a pretty!
plot(pca_df$PC1, pca_df$PC2)
To see the representation of the data as a “biplot”. This is a combination of a PCA plot and a score plot. In a biplot, we add the original axis as arrows.
biplot(cars_pca)
That is a pretty ugly plot tbh, I’m thinking it would be good to change the text size of the labels.
?USArrests
## starting httpd help server ... done
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
Perform a PCA on the USArrests. Perform the analysis on the subset USArrests[, -3] data.
arrests <- subset(USArrests[,-3])
head(arrests)
## Murder Assault Rape
## Alabama 13.2 236 21.2
## Alaska 10.0 263 44.5
## Arizona 8.1 294 31.0
## Arkansas 8.8 190 19.5
## California 9.0 276 40.6
## Colorado 7.9 204 38.7
arrests_pca <- prcomp((arrests), center = T, scale. = T)
arrests_summary <- summary(arrests_pca)
print(arrests_summary)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.5358 0.6768 0.42822
## Proportion of Variance 0.7862 0.1527 0.06112
## Cumulative Proportion 0.7862 0.9389 1.00000
str(arrests_pca)
## List of 5
## $ sdev : num [1:3] 1.536 0.677 0.428
## $ rotation: num [1:3, 1:3] -0.583 -0.608 -0.539 0.534 0.214 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "Murder" "Assault" "Rape"
## .. ..$ : chr [1:3] "PC1" "PC2" "PC3"
## $ center : Named num [1:3] 7.79 170.76 21.23
## ..- attr(*, "names")= chr [1:3] "Murder" "Assault" "Rape"
## $ scale : Named num [1:3] 4.36 83.34 9.37
## ..- attr(*, "names")= chr [1:3] "Murder" "Assault" "Rape"
## $ x : num [1:50, 1:3] -1.198 -2.309 -1.503 -0.176 -2.045 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## .. ..$ : chr [1:3] "PC1" "PC2" "PC3"
## - attr(*, "class")= chr "prcomp"
arrests_pca_df <- data.frame(arrests_pca$x, make = stringr::word(rownames(arrests), 1))
head(arrests_pca_df)
## PC1 PC2 PC3 make
## Alabama -1.1980278 0.8338118 -0.16217848 Alabama
## Alaska -2.3087473 -1.5239622 0.03833574 Alaska
## Arizona -1.5033307 -0.4983038 0.87822311 Arizona
## Arkansas -0.1759894 0.3247326 0.07111174 Arkansas
## California -2.0452358 -1.2725770 0.38153933 California
## Colorado -1.2634133 -1.4264063 -0.08369314 Colorado
ggplot(arrests_pca_df, aes(x = PC1, y = PC2, col = make)) +
geom_point(size = 3) +
labs(x = "PC1 60.08%",
y = "PC2 24.09 %",
title = "Principal components for arrests") +
theme(legend.position = "bottom")
Not sure what this is meant to show tbh
Using RNA-seq (measuring expression) from 300 single cells, measuring 8686 genes.
Assemble the data:
pollen_df <-read.table("Pollen2014.txt", sep=',', header = T,row.names=1)
label_df <-read.table("SupplementaryLabels.txt", sep=',', header = T)
pollen_df[1:10, 1:6]
## Cell_2338_1 Cell_2338_10 Cell_2338_11 Cell_2338_12 Cell_2338_13
## MTND2P28 78 559 811 705 384
## MTATP6P1 2053 1958 4922 4409 2610
## NOC2L 1 125 126 0 487
## ISG15 2953 4938 580 523 2609
## CPSF3L 2 42 19 0 37
## MXRA8 0 0 0 0 0
## AURKAIP1 302 132 64 492 11
## CCNL2 0 235 0 84 13
## MRPL20 330 477 288 222 44
## SSU72 604 869 2046 158 530
## Cell_2338_14
## MTND2P28 447
## MTATP6P1 3709
## NOC2L 66
## ISG15 1
## CPSF3L 12
## MXRA8 0
## AURKAIP1 182
## CCNL2 11
## MRPL20 282
## SSU72 272
dim(pollen_df)
## [1] 8686 300
The integer properties of the expression counts are a bit wild, so we need to transform them into more manageable numbers. log2 is the most common method of transformation for RNA-seq data.
ALso need to transpose the data.
pollen_mat <- log2(as.matrix(pollen_df) + 1)
pollen_mat <- t(pollen_mat)
Now need to rename the cells using the label information
colnames(label_df)
## [1] "Cell_Identifier" "Population" "Cell_names"
## [4] "TrueLabel_CellLevel" "Tissue_name" "TrueLabel_TissuelLevel"
rownames(pollen_mat) <- label_df$Cell_names
Next we perform PCA on the data and extract the proportion of variance explained by each component.
sc_pca <- prcomp(pollen_mat)
# variance is the square of the standard deviation
pr_var <- sc_pca$sdev^2
# compute the variance explained by each principal component
prop_var_exp <- pr_var / sum(pr_var)
To visualuse the variance calculated above:
var_exp <- data.frame(variance = prop_var_exp, pc = 1:length(prop_var_exp))
ggplot(var_exp[1:30, ], aes(x = pc, y = variance)) +
geom_bar(stat = "identity") +
labs(x = "Principal Component",
y = "Variance explained")
The first few principal components explain significant variance, but after about the PC10, there is very limited contribution. Next we will plot the data using the first two Principal components.
sc_pca_df <- data.frame(sc_pca$x, cell = rownames(sc_pca$x),
var_exp = prop_var_exp)
ggplot(sc_pca_df, aes(x = PC1, y = PC2, col = cell)) +
geom_point(size = 2) +
theme(legend.position = "bottom")
Why is it not useful to create biplot for this example?
NO IDEA - YET!!