url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw2/p1.txt"
track <- read.table(url(url_data))
colnames(track) <- c("Country","100m","200m","400m","800m","1500m","3000m","Marathon")
summary(track)
## Country 100m 200m 400m
## ARG : 1 Min. :10.49 Min. :21.34 Min. :47.60
## AUS : 1 1st Qu.:11.12 1st Qu.:22.57 1st Qu.:49.97
## AUT : 1 Median :11.32 Median :22.98 Median :51.65
## BEL : 1 Mean :11.36 Mean :23.12 Mean :51.99
## BER : 1 3rd Qu.:11.57 3rd Qu.:23.61 3rd Qu.:53.12
## BRA : 1 Max. :12.52 Max. :25.91 Max. :61.65
## (Other):48
## 800m 1500m 3000m Marathon
## Min. :1.890 Min. :3.840 Min. : 8.100 Min. :135.2
## 1st Qu.:1.970 1st Qu.:4.003 1st Qu.: 8.543 1st Qu.:143.5
## Median :2.005 Median :4.100 Median : 8.845 Median :148.4
## Mean :2.022 Mean :4.189 Mean : 9.081 Mean :153.6
## 3rd Qu.:2.070 3rd Qu.:4.338 3rd Qu.: 9.325 3rd Qu.:157.7
## Max. :2.290 Max. :5.420 Max. :13.120 Max. :221.1
##
apply(track[, -1], 2, mean)
## 100m 200m 400m 800m 1500m 3000m
## 11.357778 23.118519 51.986667 2.022407 4.189444 9.080741
## Marathon
## 153.590370
round(cor(track[, -1]), 3)
## 100m 200m 400m 800m 1500m 3000m Marathon
## 100m 1.000 0.941 0.872 0.809 0.782 0.728 0.672
## 200m 0.941 1.000 0.910 0.820 0.801 0.732 0.682
## 400m 0.872 0.910 1.000 0.806 0.720 0.674 0.678
## 800m 0.809 0.820 0.806 1.000 0.905 0.867 0.854
## 1500m 0.782 0.801 0.720 0.905 1.000 0.973 0.791
## 3000m 0.728 0.732 0.674 0.867 0.973 1.000 0.799
## Marathon 0.672 0.682 0.678 0.854 0.791 0.799 1.000
round(cov(track[, -1]), 3)
## 100m 200m 400m 800m 1500m 3000m Marathon
## 100m 0.155 0.345 0.893 0.028 0.084 0.234 4.360
## 200m 0.345 0.863 2.197 0.066 0.203 0.554 10.437
## 400m 0.893 2.197 6.761 0.182 0.510 1.428 29.031
## 800m 0.028 0.066 0.182 0.008 0.021 0.061 1.222
## 1500m 0.084 0.203 0.510 0.021 0.074 0.216 3.547
## 3000m 0.234 0.554 1.428 0.061 0.216 0.665 10.725
## Marathon 4.360 10.437 29.031 1.222 3.547 10.725 271.049
round(cor(log(track[, -1])), 3)
## 100m 200m 400m 800m 1500m 3000m Marathon
## 100m 1.000 0.940 0.870 0.803 0.780 0.730 0.670
## 200m 0.940 1.000 0.908 0.818 0.805 0.739 0.682
## 400m 0.870 0.908 1.000 0.805 0.727 0.687 0.677
## 800m 0.803 0.818 0.805 1.000 0.907 0.873 0.855
## 1500m 0.780 0.805 0.727 0.907 1.000 0.973 0.811
## 3000m 0.730 0.739 0.687 0.873 0.973 1.000 0.831
## Marathon 0.670 0.682 0.677 0.855 0.811 0.831 1.000
sigma <- matrix(c(2,-1,1,-1,5,0,1,0,3),3,3)
eigen(sigma)
## $values
## [1] 5.342923 3.470683 1.186393
##
## $vectors
## [,1] [,2] [,3]
## [1,] 0.3213152 0.4102578 0.8534900
## [2,] -0.9369890 0.2682622 0.2238013
## [3,] 0.1371429 0.8716215 -0.4706037
lapply(eigen(cor(track[, -1])), function(x) {round(x, 3)})
## $values
## [1] 5.810 0.628 0.278 0.124 0.091 0.055 0.014
##
## $vectors
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] -0.378 -0.406 -0.139 0.586 -0.178 -0.539 0.087
## [2,] -0.383 -0.413 -0.102 0.194 0.090 0.745 -0.266
## [3,] -0.368 -0.461 0.229 -0.641 0.338 -0.241 0.128
## [4,] -0.395 0.162 0.148 -0.311 -0.813 0.016 -0.195
## [5,] -0.389 0.311 -0.421 -0.062 0.026 0.189 0.731
## [6,] -0.376 0.425 -0.404 -0.072 0.354 -0.241 -0.571
## [7,] -0.356 0.384 0.745 0.320 0.243 0.049 0.083
The sum of eigenvalues is
sum(eigen(cor(track[, -1]))$value)
## [1] 7
which is equal to the dimension of the data.
track.pca <- prcomp(track[, -1], scale = TRUE, rotation = TRUE)
track.pca
## Standard deviations:
## [1] 2.4104375 0.7921510 0.5275706 0.3526322 0.3013875 0.2334944 0.1193877
##
## Rotation:
## PC1 PC2 PC3 PC4 PC5
## 100m -0.3778492 0.4063107 0.1389110 -0.58605935 0.17813228
## 200m -0.3832383 0.4129943 0.1021483 -0.19416089 -0.08952885
## 400m -0.3680626 0.4611695 -0.2289716 0.64087751 -0.33775354
## 800m -0.3946293 -0.1619028 -0.1480410 0.31056868 0.81316182
## 1500m -0.3891039 -0.3112989 0.4213477 0.06237477 -0.02597052
## 3000m -0.3759325 -0.4254217 0.4043029 0.07190423 -0.35357081
## Marathon -0.3555685 -0.3841442 -0.7449366 -0.32030965 -0.24342781
## PC6 PC7
## 100m -0.53852791 -0.08738638
## 200m 0.74530395 0.26634188
## 400m -0.24123753 -0.12817011
## 800m 0.01582337 0.19545758
## 1500m 0.18892019 -0.73068552
## 3000m -0.24078231 0.57102680
## Marathon 0.04895750 -0.08259200
The variance retained by each principal component cab be obtained as follow:
eig <- (track.pca$sdev)^2
variance <- eig*100/sum(eig)
cumvar <- cumsum(variance)
data.frame(eig = eig, variance = variance, cumvariance = cumvar)
## eig variance cumvariance
## 1 5.81020907 83.0029867 83.00299
## 2 0.62750320 8.9643315 91.96732
## 3 0.27833077 3.9761538 95.94347
## 4 0.12434950 1.7764214 97.71989
## 5 0.09083440 1.2976343 99.01753
## 6 0.05451965 0.7788521 99.79638
## 7 0.01425341 0.2036202 100.00000
The percentages of total (standardized) sample variation explained by the first and second principal components is \[91.97\%.\]
library(ggplot2)
scores <- data.frame(Country = track$Country, track.pca$x)
qplot(x= PC1, y = PC2, data = scores)
Rank the nations based on their scores on the first principal component. List the top six and the last three countries.
scores <- scores[with(scores, order(-PC1)), ]
head(scores$Country, 6)
## [1] USA GER RUS CHN FRA GBR
## 54 Levels: ARG AUS AUT BEL BER BRA CAN CHI CHN COK COL CRC CZE DEN ... USA
tail(scores$Country, 3)
## [1] PNG COK SAM
## 54 Levels: ARG AUS AUT BEL BER BRA CAN CHI CHN COK COL CRC CZE DEN ... USA
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw2/p5.txt"
air <- read.table(url(url_data))
colnames(air) <-c("Wind", "Radiation", "CO", "NO", "NO2", "O3", "HC")
summary(air)
## Wind Radiation CO NO
## Min. : 5.00 Min. : 30.00 Min. :2.000 Min. :1.00
## 1st Qu.: 6.00 1st Qu.: 68.25 1st Qu.:4.000 1st Qu.:1.00
## Median : 8.00 Median : 76.50 Median :4.000 Median :2.00
## Mean : 7.50 Mean : 73.86 Mean :4.548 Mean :2.19
## 3rd Qu.: 8.75 3rd Qu.: 84.75 3rd Qu.:5.000 3rd Qu.:3.00
## Max. :10.00 Max. :107.00 Max. :7.000 Max. :5.00
## NO2 O3 HC
## Min. : 5.00 Min. : 2.000 Min. :2.000
## 1st Qu.: 8.00 1st Qu.: 6.000 1st Qu.:3.000
## Median : 9.50 Median : 8.500 Median :3.000
## Mean :10.05 Mean : 9.405 Mean :3.095
## 3rd Qu.:12.00 3rd Qu.:11.000 3rd Qu.:3.000
## Max. :21.00 Max. :25.000 Max. :5.000
air.scaledpca <- princomp(air, cor = TRUE, loading = TRUE)
air.pca <- princomp(air, cor = FALSE, loading = TRUE)
summary(air.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 17.234083 5.25384279 3.34537279 1.569785488
## Proportion of Variance 0.872948 0.08112714 0.03289281 0.007242569
## Cumulative Proportion 0.872948 0.95407514 0.98696795 0.994210520
## Comp.5 Comp.6 Comp.7
## Standard deviation 1.117613433 0.718428856 0.4523547944
## Proportion of Variance 0.003671092 0.001516979 0.0006014096
## Cumulative Proportion 0.997881611 0.999398590 1.0000000000
We need 2 components to explain at least \(90\%\) of the original data.
summary(air.scaledpca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.5286539 1.1772853 1.0972994 0.8526937 0.80837896
## Proportion of Variance 0.3338261 0.1980001 0.1720094 0.1038695 0.09335379
## Cumulative Proportion 0.3338261 0.5318262 0.7038356 0.8077051 0.90105889
## Comp.6 Comp.7
## Standard deviation 0.73259047 0.39484041
## Proportion of Variance 0.07666983 0.02227128
## Cumulative Proportion 0.97772872 1.00000000
We need 5 components to explain at least \(90\%\) of the standardized data.
screeplot(air.pca)
screeplot(air.scaledpca)
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw2/p6-data.txt"
data <- read.table(url(url_data), header = FALSE, sep = ",", skip = 1)
colnames(data) <-c("Country", "InfantDeath", "Inhab", "Population", "Population1000", "Literate", "Student", "GNP")
rownames(data) <- data[,1]
data[,1] <- NULL
summary(data)
## InfantDeath Inhab Population Population1000
## Min. : 16.50 Min. : 578 Min. : 1.0 Min. : 21
## 1st Qu.: 27.40 1st Qu.: 830 1st Qu.: 18.0 1st Qu.: 1042
## Median : 38.30 Median :1014 Median : 84.0 Median : 1497
## Mean : 50.48 Mean :1872 Mean : 248.2 Mean : 6462
## 3rd Qu.: 68.90 3rd Qu.:2400 3rd Qu.: 168.0 3rd Qu.: 3420
## Max. :225.00 Max. :6400 Max. :3082.0 Max. :108210
## Literate Student GNP
## Min. :19.30 Min. : 14 Min. : 73.0
## 1st Qu.:60.50 1st Qu.: 220 1st Qu.: 293.0
## Median :89.00 Median : 362 Median : 467.0
## Mean :79.56 Mean : 411 Mean : 647.9
## 3rd Qu.:98.50 3rd Qu.: 529 3rd Qu.: 927.0
## Max. :98.50 Max. :1983 Max. :2577.0
scaled.data <- scale(data)
colMeans(scaled.data)
## InfantDeath Inhab Population Population1000 Literate
## 1.045082e-16 4.814743e-17 -2.078128e-17 -7.222114e-18 -1.752425e-16
## Student GNP
## -4.106692e-18 -3.540252e-17
s <- svd(scaled.data)
PC <- data.frame(s$u %*% diag(s$d))
plot(X1~X2, xlim = c(-7, 7), xlab = 'First PCA component', ylab = 'Second PCA component', main = 'Sample PCA', data = PC)
with(PC, text(X1~X2, labels = rownames(data), pos = 4))
The two obvious outliers are Hong Kong and Singapore.
s <- svd(scaled.data)
PC <- data.frame(diag(s$d) %*% t(s$v))
plot(X1~X2, xlim = c(-7, 7), xlab = 'First PCA component', ylab = 'Second PCA component', main = 'Variable PCA', data = PC)
with(PC, text(X1~X2, labels = colnames(data), pos = 4))
biplot(prcomp(data, scale = TRUE))
The two demographic variables near the two outlier countries are Population/sq.km and Population/1000HectarAgri.
data.removed <- data[!rownames(data) %in% c("Hong Kong", "Singapore"), ]
scaled.data <- scale(data.removed)
s <- svd(scaled.data)
PC <- data.frame(s$u %*% diag(s$d))
plot(X1~X2, xlim = c(-7, 7), xlab = 'First PCA component', ylab = 'Second PCA component', main = 'Sample PCA', data = PC)
with(PC, text(X1~X2, labels = rownames(data.removed), pos = 4))
The two European countries most alike Japan are Belgium and Netherlands.
s <- svd(data)
PC <- data.frame(s$u %*% diag(s$d))
plot(X1~X2, xlim = c(-7, 7), xlab = 'First PCA component', ylab = 'Second PCA component', main = 'Sample PCA', data = PC)
with(PC, text(X1~X2, labels = rownames(data), pos = 4))
PC <- data.frame(diag(s$d) %*% t(s$v))
plot(X1~X2, xlim = c(-7, 7), xlab = 'First PCA component', ylab = 'Second PCA component', main = 'Variable PCA', data = PC)
with(PC, text(X1~X2, labels = colnames(data), pos = 4))
biplot(prcomp(data, scale = FALSE))