w_t <- read.csv("women_trackrec.txt", header=FALSE, sep = "")
colnames(w_t) = c("Country","100m","200m","400m","800m","1500m","3000m","Marathon")
m_t <- read.csv("men_trackrec.txt", header=FALSE, sep = "")
colnames(m_t) = c("Country","100m","200m","400m","800m","1500m","5000m","10000m", "Marathon")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
w_t <- w_t %>%
select("Country","100m","200m","400m","800m","1500m","Marathon")
m_t <- m_t %>%
select("Country","100m","200m","400m","800m","1500m","Marathon")
Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)
Decision Rule: If Tsq < critical value, fail to reject \(H_0\)
Conclusion: As Tsq > critical value, we reject \(H_0\). That is, the difference in means between the groups is significant.
n_1<-dim(w_t[,-1])[1]
n_2<-dim(m_t[,-1])[1]
p<-dim(w_t[,-1])[2]
# Testing H0:mu1-mu2=delta0
delta0=rep(0,p)
xbar_1 = colMeans(w_t[,-1]);xbar_1
## 100m 200m 400m 800m 1500m Marathon
## 11.357778 23.118519 51.989074 2.022407 4.189444 153.619259
xbar_2 = colMeans(m_t[,-1]);xbar_2
## 100m 200m 400m 800m 1500m Marathon
## 10.216852 20.541481 45.829074 1.768148 3.653333 133.478519
S_1 = var(w_t[,-1]);S_1
## 100m 200m 400m 800m 1500m
## 100m 0.15531572 0.3445608 0.8912960 0.027703564 0.08389119
## 200m 0.34456080 0.8630883 2.1928363 0.066165898 0.20276331
## 400m 0.89129602 2.1928363 6.7454576 0.181807932 0.50917683
## 800m 0.02770356 0.0661659 0.1818079 0.007546925 0.02141457
## 1500m 0.08389119 0.2027633 0.5091768 0.021414570 0.07418270
## Marathon 4.33417757 10.3849876 28.9037314 1.219654647 3.53983732
## Marathon
## 100m 4.334178
## 200m 10.384988
## 400m 28.903731
## 800m 1.219655
## 1500m 3.539837
## Marathon 270.270150
S_2 = var(m_t[,-1]);S_2
## 100m 200m 400m 800m 1500m
## 100m 0.048972921 0.11104437 0.25602156 0.008263871 0.025720126
## 200m 0.111044375 0.30090342 0.66681838 0.022929210 0.066193082
## 400m 0.256021558 0.66681838 2.06995573 0.057937876 0.168472956
## 800m 0.008263871 0.02292921 0.05793788 0.002751223 0.007130818
## 1500m 0.025720126 0.06619308 0.16847296 0.007130818 0.023033962
## Marathon 1.340138644 3.54103809 9.17885709 0.378904752 1.192563522
## Marathon
## 100m 1.3401386
## 200m 3.5410381
## 400m 9.1788571
## 800m 0.3789048
## 1500m 1.1925635
## Marathon 80.1353563
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
## 100m 200m 400m 800m 1500m
## 100m 0.10214432 0.22780259 0.5736588 0.017983718 0.05480566
## 200m 0.22780259 0.58199588 1.4298274 0.044547554 0.13447820
## 400m 0.57365879 1.42982736 4.4077067 0.119872904 0.33882490
## 800m 0.01798372 0.04454755 0.1198729 0.005149074 0.01427269
## 1500m 0.05480566 0.13447820 0.3388249 0.014272694 0.04860833
## Marathon 2.83715811 6.96301282 19.0412942 0.799279700 2.36620042
## Marathon
## 100m 2.8371581
## 200m 6.9630128
## 400m 19.0412942
## 800m 0.7992797
## 1500m 2.3662004
## Marathon 175.2027533
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
## [,1]
## [1,] 680.4875
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))
## [1] 13.78843
The first two principal components: \(Y_1 = -0.413(100m) - 0.420(200m) - 0.406(400m) - 0.422(800m) - 0.407(1500M) - 0.3784(Marathon)\)
\(Y_2 = -0.385(100m) - 0.382(200m) - 0.393(400m) + 0.286(800m) + 0.316(1500M) +0.608(Marathon)\)
#Women's Correlation
cor(w_t[,-1])
## 100m 200m 400m 800m 1500m Marathon
## 100m 1.0000000 0.9410886 0.8707802 0.8091758 0.7815510 0.6689597
## 200m 0.9410886 1.0000000 0.9088096 0.8198258 0.8013282 0.6799537
## 400m 0.8707802 0.9088096 1.0000000 0.8057904 0.7197996 0.6769384
## 800m 0.8091758 0.8198258 0.8057904 1.0000000 0.9050509 0.8539900
## 1500m 0.7815510 0.8013282 0.7197996 0.9050509 1.0000000 0.7905565
## Marathon 0.6689597 0.6799537 0.6769384 0.8539900 0.7905565 1.0000000
#Women's Track Team Eigen Values and Eigen Vector
e <- eigen(cor(w_t[,-1]))$vec
l <- eigen(cor(w_t[,-1]))$val
e
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.4137534 -0.3846299 -0.12054812 0.5735489 -0.3939281 -0.42684706
## [2,] -0.4202377 -0.3824032 -0.06535754 0.1897090 0.3176902 0.73210645
## [3,] -0.4061645 -0.3929376 0.44254638 -0.5768634 0.1985368 -0.33555163
## [4,] -0.4226394 0.2858530 -0.09940074 -0.3891141 -0.6981406 0.30161790
## [5,] -0.4067668 0.3159300 -0.67477321 -0.1371183 0.4276702 -0.27875957
## [6,] -0.3783590 0.6081973 0.56581787 0.3634138 0.1848636 -0.02337895
l
## [1] 5.01728744 0.52410711 0.21557993 0.12351705 0.07381482 0.04569366
cor_new <- matrix(rep(0,36), ncol=6)
for (j in 1:6){
for(k in 1:6) {
cor_new[j,k] = l[j]*e[j,k] }
}
cor_new
## [,1] [,2] [,3] [,4] [,5]
## [1,] -2.07591983 -1.92979883 -0.60482458 2.87765948 -1.976450716
## [2,] -0.22024958 -0.20042024 -0.03425435 0.09942783 0.166503704
## [3,] -0.08756091 -0.08470946 0.09540412 -0.12436018 0.042800549
## [4,] -0.05220317 0.03530772 -0.01227769 -0.04806223 -0.086232268
## [5,] -0.03002542 0.02332032 -0.04980826 -0.01012136 0.031568398
## [6,] -0.01728861 0.02779076 0.02585429 0.01660570 0.008447093
## [,6]
## [1,] -2.14161441
## [2,] 0.38370220
## [3,] -0.07233820
## [4,] 0.03725495
## [5,] -0.02057659
## [6,] -0.00106827
cum_pct <- c(0)
for(i in 1:6){
cum_pct[i] = sum(l[1:i])/6
}
cum_pct
## [1] 0.8362146 0.9235658 0.9594957 0.9800819 0.9923844 1.0000000
#Easier Way
prcomp(w_t[,-1], scale=TRUE, center=TRUE)
## Standard deviations (1, .., p=6):
## [1] 2.2399302 0.7239524 0.4643059 0.3514499 0.2716888 0.2137607
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## 100m -0.4137534 0.3846299 -0.12054812 0.5735489 -0.3939281
## 200m -0.4202377 0.3824032 -0.06535754 0.1897090 0.3176902
## 400m -0.4061645 0.3929376 0.44254638 -0.5768634 0.1985368
## 800m -0.4226394 -0.2858530 -0.09940074 -0.3891141 -0.6981406
## 1500m -0.4067668 -0.3159300 -0.67477321 -0.1371183 0.4276702
## Marathon -0.3783590 -0.6081973 0.56581787 0.3634138 0.1848636
## PC6
## 100m -0.42684706
## 200m 0.73210645
## 400m -0.33555163
## 800m 0.30161790
## 1500m -0.27875957
## Marathon -0.02337895
prcomp((w_t[,-1]), scale=TRUE, center=TRUE)$sdev^2
## [1] 5.01728744 0.52410711 0.21557993 0.12351705 0.07381482 0.04569366
summary(prcomp((w_t[,-1]), scale=TRUE, center=TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.2399 0.72395 0.46431 0.35145 0.2717 0.21376
## Proportion of Variance 0.8362 0.08735 0.03593 0.02059 0.0123 0.00762
## Cumulative Proportion 0.8362 0.92357 0.95950 0.98008 0.9924 1.00000
From the cumulative percentage of the principal components, we see that the first two principal components explain up to 92% of the total variance.
From what I know about track, I was surprised not to see Kenya on the top 5. Also, Usain Bolt is from Jamaica!
women_PCA <-prcomp((w_t[,-1]), scale=TRUE, center=TRUE)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
a1 <- women_PCA$rotation[,1]
center <- women_PCA$center
scale <- women_PCA$scale
hm <- as.matrix(w_t[,-1])
women_PCA_score <- drop(scale(hm,center = center,scale = scale) %*% a1)
women_PCA_score<- data.frame(women_PCA_score)
women_PCA_score<- cbind(w_t$Country,women_PCA_score)
colnames(women_PCA_score) <- c("Country","Score")
arrange(women_PCA_score,desc(Score))
## Country Score
## 1 USA 3.2548441
## 2 GER 3.0195659
## 3 RUS 2.9393276
## 4 CHN 2.7372241
## 5 FRA 2.5164458
## 6 CZE 2.5023375
## 7 GBR 2.2829618
## 8 POL 2.1872859
## 9 ROM 1.9245159
## 10 AUS 1.8688929
## 11 ESP 1.7445837
## 12 CAN 1.6061855
## 13 ITA 1.4193755
## 14 NED 1.4085145
## 15 BEL 1.2634324
## 16 GRE 1.2430415
## 17 AUT 1.2139069
## 18 FIN 1.1811139
## 19 BRA 1.0726996
## 20 MEX 0.9914729
## 21 POR 0.9660574
## 22 SUI 0.9518668
## 23 SWE 0.7699433
## 24 IRL 0.7477501
## 25 NZL 0.6486268
## 26 TUR 0.6295431
## 27 KEN 0.6243322
## 28 HUN 0.5766679
## 29 IND 0.3980313
## 30 JPN 0.3426013
## 31 NOR 0.3065790
## 32 COL 0.1607321
## 33 DEN -0.1062300
## 34 ARG -0.3706994
## 35 TPE -0.4209546
## 36 ISR -0.4922723
## 37 CHI -0.8022347
## 38 THA -0.8178009
## 39 MYA -0.8908269
## 40 KORS -0.9417002
## 41 BER -1.1362494
## 42 MAS -1.4870939
## 43 PHI -1.5319740
## 44 KORN -1.6680405
## 45 INA -1.6787304
## 46 LUX -1.8114198
## 47 MRI -1.8163846
## 48 CRC -1.9513435
## 49 DOM -1.9529459
## 50 SIN -2.9237631
## 51 GUA -3.2483941
## 52 PNG -5.0931713
## 53 SAM -6.8184047
## 54 COK -7.5398251
The first two principal components: \(Y_1 = -0.413(100m) - 0.418(200m) - 0.403(400m) - 0.410(800m) - 0.421(1500M) - 0.3945(Marathon)\)
\(Y_2 = -0.514(100m) - 0.3917(200m) - 0.288(400m) + 0.3267(800m) + 0.349(1500M) +0.5201(Marathon)\)
From the cumulative percentage of the principal components, we see that the first two principal components explain up to 90% of the total variance.
From what I know about track, I was surprised not to see Kenya on the top 5. Also, Usain Bolt is from Jamaica! I was a bit surprised about China too. These rankings are more intuitive with what is expected though.
#Men's Correlation
cor(m_t[,-1])
## 100m 200m 400m 800m 1500m Marathon
## 100m 1.0000000 0.9147554 0.8041147 0.7119388 0.7657919 0.6764873
## 200m 0.9147554 1.0000000 0.8449159 0.7969162 0.7950871 0.7211157
## 400m 0.8041147 0.8449159 1.0000000 0.7677488 0.7715522 0.7126823
## 800m 0.7119388 0.7969162 0.7677488 1.0000000 0.8957609 0.8069657
## 1500m 0.7657919 0.7950871 0.7715522 0.8957609 1.0000000 0.8777788
## Marathon 0.6764873 0.7211157 0.7126823 0.8069657 0.8777788 1.0000000
#Men's Track Team Eigen Values and Eigen Vector
eigen(cor(m_t[,-1]))$vec
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.4013958 0.5138807 0.4789843 -0.01687632 -0.2969661 -0.50686134
## [2,] -0.4180798 0.3917657 0.1620888 -0.17179519 0.5364074 0.57289582
## [3,] -0.4035829 0.2881765 -0.7458936 0.42347137 -0.1323242 -0.02966769
## [4,] -0.4104405 -0.3266872 -0.2985046 -0.65176976 0.2267076 -0.39938562
## [5,] -0.4208110 -0.3494365 0.1138092 -0.13633585 -0.6628797 0.47944005
## [6,] -0.3945481 -0.5201637 0.2930638 0.58947631 0.3402392 -0.15693998
eigen(cor(m_t[,-1]))$val
## [1] 4.95703332 0.49022363 0.20953295 0.18719579 0.09680142 0.05921289
#Easier Way
prcomp((m_t[,-1]), scale=TRUE, center=TRUE)
## Standard deviations (1, .., p=6):
## [1] 2.2264396 0.7001597 0.4577477 0.4326613 0.3111293 0.2433370
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## 100m -0.4013958 -0.5138807 0.4789843 -0.01687632 -0.2969661
## 200m -0.4180798 -0.3917657 0.1620888 -0.17179519 0.5364074
## 400m -0.4035829 -0.2881765 -0.7458936 0.42347137 -0.1323242
## 800m -0.4104405 0.3266872 -0.2985046 -0.65176976 0.2267076
## 1500m -0.4208110 0.3494365 0.1138092 -0.13633585 -0.6628797
## Marathon -0.3945481 0.5201637 0.2930638 0.58947631 0.3402392
## PC6
## 100m 0.50686134
## 200m -0.57289582
## 400m 0.02966769
## 800m 0.39938562
## 1500m -0.47944005
## Marathon 0.15693998
prcomp((m_t[,-1]), scale=TRUE, center=TRUE)$sdev^2
## [1] 4.95703332 0.49022363 0.20953295 0.18719579 0.09680142 0.05921289
summary(prcomp((m_t[,-1]), scale=TRUE, center=TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.2264 0.7002 0.45775 0.4327 0.31113 0.24334
## Proportion of Variance 0.8262 0.0817 0.03492 0.0312 0.01613 0.00987
## Cumulative Proportion 0.8262 0.9079 0.94280 0.9740 0.99013 1.00000
#Ranks
men_PCA <-prcomp((m_t[,-1]), scale=TRUE, center=TRUE)
library(plyr)
a1 <- men_PCA$rotation[,1]
center <- men_PCA$center
scale <- men_PCA$scale
hm <- as.matrix(m_t[,-1])
men_PCA_score <- drop(scale(hm,center = center,scale = scale) %*% a1)
men_PCA_score<- data.frame(men_PCA_score)
men_PCA_score<- cbind(m_t$Country,men_PCA_score)
colnames(men_PCA_score) <- c("Country","Score")
arrange(men_PCA_score,desc(Score))
## Country Score
## 1 U.S.A. 3.81443948
## 2 GreatBritain 2.76559122
## 3 Brazil 2.41332264
## 4 France 2.15246264
## 5 Australia 2.11903404
## 6 Italy 2.01870424
## 7 Kenya 1.95130554
## 8 Canada 1.91110348
## 9 Poland 1.90004707
## 10 Germany 1.80363466
## 11 Portugal 1.77060166
## 12 Russia 1.73556781
## 13 Japan 1.47793228
## 14 Belgium 1.44052828
## 15 Switzerland 1.40838536
## 16 Spain 1.34630146
## 17 Netherlands 1.17123014
## 18 Norway 1.14580691
## 19 Greece 1.11906527
## 20 Denmark 1.02624744
## 21 Hungary 0.98914934
## 22 NewZealand 0.96316485
## 23 Mexico 0.79549722
## 24 China 0.63512139
## 25 Chile 0.61460672
## 26 Finland 0.60506231
## 27 Sweden 0.55337404
## 28 Korea,South 0.53956987
## 29 Austria 0.44344557
## 30 Ireland 0.42106340
## 31 CzechRepublic 0.35182100
## 32 Romania 0.21683546
## 33 Argentina 0.09224532
## 34 India -0.05737107
## 35 Mauritius -0.31780244
## 36 Bermuda -0.62129242
## 37 Turkey -0.67538352
## 38 Columbia -0.70322500
## 39 DominicanRepub -0.81091756
## 40 Israel -0.85938431
## 41 Thailand -1.10789164
## 42 Luxembourg -1.11218279
## 43 Taiwan -1.24709871
## 44 Indonesia -1.47604901
## 45 Malaysia -1.76501320
## 46 Guatemala -1.94178678
## 47 CostaRica -1.97544573
## 48 Korea,North -2.07745301
## 49 Philippines -2.17510111
## 50 Singapore -2.83941185
## 51 PapuaNewGuinea -2.93899475
## 52 Myanmar(Burma) -3.22338149
## 53 Samoa -6.75947191
## 54 CookIslands -9.02760982
Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)
Where \(\mu\) is the population principal components.
Decision Rule: If Tsq < critical value, fail to reject \(H_0\)
Conclusion: As Tsq < critical value, we fail to reject \(H_0\). That is, the difference in population principal components between the groups is not significant.
# Testing H0:mu1-mu2=delta0
delta0=rep(0,1)
xbar_1 = mean(men_PCA_score$Score);xbar_1
## [1] 2.978424e-16
xbar_2 = mean(women_PCA_score$Score);xbar_2
## [1] -2.44347e-16
S_1 = var(men_PCA_score$Score);S_1
## [1] 4.957033
S_2 = var(women_PCA_score$Score);S_2
## [1] 5.017287
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
## [1] 4.98716
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
## [,1]
## [1,] 1.591522e-30
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))
## [1] 13.78843
From the factor analysis, we see that the unrotated factor analysis explains about 80% of the data. The 900m has the highest factor loading. Factor 2 does not explain much and increases the cumulative variance explained to 88.2%. Factor 2 contrasts with the shorter events.
After rotating, the first factor explains about 46% of the variance and the 200m explains the highest factor loading. Factor 2 explains about 42% which is a lot more than Factor 2 from the first factor analysis.
The two methods seem inconsistent. Yes, it does make a difference if R or S is factored. They would produce different high factor loads.
mlik.fa=factanal(w_t[,-1],factors=2,rotation="none")
mlik.fa
##
## Call:
## factanal(x = w_t[, -1], factors = 2, rotation = "none")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.926 -0.192
## 200m 0.962 -0.263
## 400m 0.905 -0.145
## 800m 0.942 0.327
## 1500m 0.889 0.209
## Marathon 0.795 0.323
##
## Factor1 Factor2
## SS loadings 4.910 0.382
## Proportion Var 0.818 0.064
## Cumulative Var 0.818 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
#specific variance
1-apply(mlik.fa$loadings^2,1,sum)
## 100m 200m 400m 800m 1500m Marathon
## 0.106129659 0.004996951 0.160313361 0.006039590 0.166955171 0.263530952
##Rotation (varimax)##
mlik.farot=factanal(w_t[,-1],factors=2,rotation="varimax") #rotmat
mlik.farot
##
## Call:
## factanal(x = w_t[, -1], factors = 2, rotation = "varimax")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.824 0.464
## 200m 0.898 0.433
## 400m 0.778 0.485
## 800m 0.495 0.865
## 1500m 0.533 0.741
## Marathon 0.387 0.766
##
## Factor1 Factor2
## SS loadings 2.770 2.522
## Proportion Var 0.462 0.420
## Cumulative Var 0.462 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
#specific variance
1-apply(mlik.farot$loadings^2,1,sum) #same as no rotation.
## 100m 200m 400m 800m 1500m Marathon
## 0.106129659 0.004996951 0.160313361 0.006039590 0.166955171 0.263530952
##Or use the correlation mattrix R
cor.w_t=cor(w_t[,-1])
factanal(covmat=cor.w_t,factors = 2,rotation="none",n.obs=54,method="mle")
##
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "none", method = "mle")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.926 -0.192
## 200m 0.962 -0.263
## 400m 0.905 -0.145
## 800m 0.942 0.327
## 1500m 0.889 0.209
## Marathon 0.795 0.323
##
## Factor1 Factor2
## SS loadings 4.910 0.382
## Proportion Var 0.818 0.064
## Cumulative Var 0.818 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
factanal(covmat=cor.w_t,factors = 2,rotation="varimax",n.obs=54,method="mle")
##
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "varimax", method = "mle")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.824 0.464
## 200m 0.898 0.433
## 400m 0.778 0.485
## 800m 0.495 0.865
## 1500m 0.533 0.741
## Marathon 0.387 0.766
##
## Factor1 Factor2
## SS loadings 2.770 2.522
## Proportion Var 0.462 0.420
## Cumulative Var 0.462 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
From the factor analysis, we see that the unrotated factor analysis explains about 78.3% of the data. The 200m has the highest factor loading. Factor 2 does not explain much and increases the cumulative variance explained to 86.4%.
After rotating, the first factor explains about 46% of the variance and the 200m explains the highest factor loading. Factor 2 explains about 42% which is a lot more than Factor 2 from the first factor analysis.
It seems that both methods for the men’s utilize the 200m as the highest factor loading.
mlik.fa=factanal(m_t[,-1],factors=2,rotation="none")
mlik.fa
##
## Call:
## factanal(x = m_t[, -1], factors = 2, rotation = "none")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.154 0.009 0.250 0.164 0.027 0.208
##
## Loadings:
## Factor1 Factor2
## 100m 0.914 -0.102
## 200m 0.983 -0.158
## 400m 0.865
## 800m 0.859 0.312
## 1500m 0.881 0.445
## Marathon 0.797 0.396
##
## Factor1 Factor2
## SS loadings 4.700 0.488
## Proportion Var 0.783 0.081
## Cumulative Var 0.783 0.865
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 5.5 on 4 degrees of freedom.
## The p-value is 0.239
#specific variance
1-apply(mlik.fa$loadings^2,1,sum)
## 100m 200m 400m 800m 1500m Marathon
## 0.153832828 0.008933034 0.250451379 0.164335242 0.026552134 0.208375301
##Rotation (varimax)##
mlik.farot=factanal(m_t[,-1],factors=2,rotation="varimax") #rotmat
mlik.farot
##
## Call:
## factanal(x = m_t[, -1], factors = 2, rotation = "varimax")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.154 0.009 0.250 0.164 0.027 0.208
##
## Loadings:
## Factor1 Factor2
## 100m 0.806 0.443
## 200m 0.895 0.436
## 400m 0.691 0.522
## 800m 0.523 0.750
## 1500m 0.464 0.870
## Marathon 0.424 0.782
##
## Factor1 Factor2
## SS loadings 2.598 2.589
## Proportion Var 0.433 0.432
## Cumulative Var 0.433 0.865
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 5.5 on 4 degrees of freedom.
## The p-value is 0.239
#specific variance
1-apply(mlik.farot$loadings^2,1,sum) #same as no rotation.
## 100m 200m 400m 800m 1500m Marathon
## 0.153832828 0.008933034 0.250451379 0.164335242 0.026552134 0.208375301
##Or use the correlation mattrix R
cor.w_t=cor(w_t[,-1])
factanal(covmat=cor.w_t,factors = 2,rotation="none",n.obs=54,method="mle")
##
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "none", method = "mle")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.926 -0.192
## 200m 0.962 -0.263
## 400m 0.905 -0.145
## 800m 0.942 0.327
## 1500m 0.889 0.209
## Marathon 0.795 0.323
##
## Factor1 Factor2
## SS loadings 4.910 0.382
## Proportion Var 0.818 0.064
## Cumulative Var 0.818 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
factanal(covmat=cor.w_t,factors = 2,rotation="varimax",n.obs=54,method="mle")
##
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "varimax", method = "mle")
##
## Uniquenesses:
## 100m 200m 400m 800m 1500m Marathon
## 0.106 0.005 0.160 0.006 0.167 0.264
##
## Loadings:
## Factor1 Factor2
## 100m 0.824 0.464
## 200m 0.898 0.433
## 400m 0.778 0.485
## 800m 0.495 0.865
## 1500m 0.533 0.741
## Marathon 0.387 0.766
##
## Factor1 Factor2
## SS loadings 2.770 2.522
## Proportion Var 0.462 0.420
## Cumulative Var 0.462 0.882
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171
The first principal component is made up of equal parts of every variable. It makes up about 81% of the explained variance. The second prinicipal component is made up of two groups - short and long distances. This prinicpal component makes up 90.3% of the explained variance. It is easier to use the speed variable because it is standardized.
speedw <- w_t
speedw[,2]<-100/speedw[,2]
speedw[,3]<-200/speedw[,3]
speedw[,4]<-400/speedw[,4]
speedw[,5]<-800/speedw[,5]/60
speedw[,6]<-1500/speedw[,6]/60
speedw[,7]<-42195/speedw[,7]/60
head(speedw)
## Country 100m 200m 400m 800m 1500m Marathon
## 1 ARG 8.643042 8.718396 7.619048 6.504065 5.882353 4.678353
## 2 AUS 8.992806 8.996851 8.225375 6.734007 6.218905 4.900355
## 3 AUT 8.968610 8.810573 7.902015 6.872852 6.172840 4.556203
## 4 BEL 8.976661 8.896797 7.774538 6.768190 6.127451 4.916113
## 5 BER 8.726003 8.676790 7.504690 6.441224 5.827506 4.037490
## 6 BRA 8.952551 8.849558 7.902015 6.768190 5.995204 4.770708
cov(speedw[,-1])
## 100m 200m 400m 800m 1500m Marathon
## 100m 0.09053826 0.09560635 0.09667244 0.06506402 0.08221980 0.08109987
## 200m 0.09560635 0.11467144 0.11386990 0.07492487 0.09601895 0.09331033
## 400m 0.09667244 0.11386990 0.13778886 0.08094090 0.09544299 0.10188073
## 800m 0.06506402 0.07492487 0.08094090 0.07352284 0.08645423 0.09430563
## 1500m 0.08221980 0.09601895 0.09544299 0.08645423 0.12384050 0.11845777
## Marathon 0.08109987 0.09331033 0.10188073 0.09430563 0.11845777 0.16671409
prcomp((speedw[,-1]), scale=TRUE, center=TRUE)
## Standard deviations (1, .., p=6):
## [1] 2.2394797 0.7372135 0.4298389 0.3581696 0.2840273 0.2180105
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## 100m 0.4110369 -0.3985261 0.19870602 -0.5452687 0.3871234
## 200m 0.4194769 -0.3817117 0.11579002 -0.1503919 -0.2699084
## 400m 0.4059147 -0.3822980 -0.55004814 0.4729653 -0.2383517
## 800m 0.4213789 0.2724519 0.07729831 0.4566153 0.6932049
## 1500m 0.4105739 0.3252716 0.61695997 0.1892135 -0.4742597
## Marathon 0.3797236 0.6076921 -0.50787892 -0.4605094 -0.1225483
## PC6
## 100m 0.43076577
## 200m -0.75462556
## 400m 0.32560833
## 800m -0.23066019
## 1500m 0.29028244
## Marathon -0.03863049
summary(prcomp((speedw[,-1]), scale=TRUE, center=TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.2395 0.73721 0.42984 0.35817 0.28403 0.21801
## Proportion of Variance 0.8359 0.09058 0.03079 0.02138 0.01345 0.00792
## Cumulative Proportion 0.8359 0.92646 0.95725 0.97863 0.99208 1.00000
e <- eigen(cor(speedw[,-1]))$vec
l <- eigen(cor(speedw[,-1]))$val
e
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.4110369 0.3985261 -0.19870602 0.5452687 -0.3871234 -0.43076577
## [2,] -0.4194769 0.3817117 -0.11579002 0.1503919 0.2699084 0.75462556
## [3,] -0.4059147 0.3822980 0.55004814 -0.4729653 0.2383517 -0.32560833
## [4,] -0.4213789 -0.2724519 -0.07729831 -0.4566153 -0.6932049 0.23066019
## [5,] -0.4105739 -0.3252716 -0.61695997 -0.1892135 0.4742597 -0.29028244
## [6,] -0.3797236 -0.6076921 0.50787892 0.4605094 0.1225483 0.03863049
l
## [1] 5.01526917 0.54348379 0.18476151 0.12828548 0.08067149 0.04752856
It seems that the first two principal components for women and men are about the same. The first principal component is made up of equal parts of every variable. It makes up about 81% of the explained variance. The second prinicipal component is made up of two groups - short and long distances. This prinicpal component makes up 90.3% of the explained variance. It is easier to use the speed variable because it is standardized.
speedm <- m_t
speedm[,2]<-100/speedm[,2]
speedm[,3]<-200/speedm[,3]
speedm[,4]<-400/speedm[,4]
speedm[,5]<-800/speedm[,5]/60
speedm[,6]<-1500/speedm[,6]/60
speedm[,7]<-42195/speedm[,7]/60
head(speedm)
## Country 100m 200m 400m 800m 1500m Marathon
## 1 Argentina 9.775171 9.818360 8.661758 7.532957 6.793478 5.427568
## 2 Australia 10.070493 9.970090 9.013069 7.662835 7.082153 5.515254
## 3 Austria 9.852217 9.779951 8.733624 7.532957 6.983240 5.318787
## 4 Belgium 9.861933 9.905894 8.884940 7.707129 7.002801 5.528695
## 5 Bermuda 9.737098 9.852217 8.837826 7.448790 6.756757 4.804605
## 6 Brazil 10.000000 10.055304 9.031384 7.843137 7.002801 5.579135
cov(speedm[,-1])
## 100m 200m 400m 800m 1500m Marathon
## 100m 0.04349790 0.04827718 0.04346323 0.03149513 0.04250343 0.04312562
## 200m 0.04827718 0.06484523 0.05586780 0.04323338 0.05352645 0.05629446
## 400m 0.04346323 0.05586780 0.06882169 0.04282214 0.05372066 0.05673423
## 800m 0.03149513 0.04323338 0.04282214 0.04688400 0.05230584 0.05419108
## 1500m 0.04250343 0.05352645 0.05372066 0.05230584 0.07291400 0.07365179
## Marathon 0.04312562 0.05629446 0.05673423 0.05419108 0.07365179 0.09792763
prcomp((speedm[,-1]), scale=TRUE, center=TRUE)
## Standard deviations (1, .., p=6):
## [1] 2.2135294 0.7218360 0.4710989 0.4387464 0.3252378 0.2429571
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## 100m 0.4013532 0.5044835 -0.48337352 0.03846601 0.3388272
## 200m 0.4180942 0.3929087 -0.16824214 0.17014061 -0.5665582
## 400m 0.4026901 0.3010357 0.73492220 -0.43653013 0.1240380
## 800m 0.4111819 -0.3304516 0.30737166 0.63871947 -0.2199149
## 1500m 0.4217625 -0.3523522 -0.08502157 0.14181362 0.6319581
## Marathon 0.3936996 -0.5168620 -0.31020634 -0.59240216 -0.3179448
## PC6
## 100m -0.48422984
## 200m 0.54090531
## 400m -0.03411095
## 800m -0.41343524
## 1500m 0.52082007
## Marathon -0.17203808
summary(prcomp((speedm[,-1]), scale=TRUE, center=TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.2135 0.72184 0.47110 0.43875 0.32524 0.24296
## Proportion of Variance 0.8166 0.08684 0.03699 0.03208 0.01763 0.00984
## Cumulative Proportion 0.8166 0.90346 0.94045 0.97253 0.99016 1.00000
e <- eigen(cor(speedm[,-1]))$vec
l <- eigen(cor(speedm[,-1]))$val
e
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.4013532 0.5044835 -0.48337352 -0.03846601 -0.3388272 -0.48422984
## [2,] -0.4180942 0.3929087 -0.16824214 -0.17014061 0.5665582 0.54090531
## [3,] -0.4026901 0.3010357 0.73492220 0.43653013 -0.1240380 -0.03411095
## [4,] -0.4111819 -0.3304516 0.30737166 -0.63871947 0.2199149 -0.41343524
## [5,] -0.4217625 -0.3523522 -0.08502157 -0.14181362 -0.6319581 0.52082007
## [6,] -0.3936996 -0.5168620 -0.31020634 0.59240216 0.3179448 -0.17203808
l
## [1] 4.89971248 0.52104716 0.22193414 0.19249842 0.10577963 0.05902817
Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)
Where \(\mu\) are the two principal components.
Decision Rule: If Tsq < critical value, fail to reject \(H_0\)
Conclusion: As Tsq < critical value, we fail to reject \(H_0\). That is, using the first two principal components, the difference between the two populations is not significant.
n_1<-dim(speedw[,-1])[1]
n_2<-dim(speedm[,-1])[1]
p<-dim(speedw[,-1])[2]
# Testing H0:mu1-mu2=delta0
delta0=rep(0,p)
xbar_1 = colMeans(speedw[,-1]);xbar_1
## 100m 200m 400m 800m 1500m Marathon
## 8.814772 8.664408 7.712067 6.604214 5.989687 4.620264
xbar_2 = colMeans(speedm[,-1]);xbar_2
## 100m 200m 400m 800m 1500m Marathon
## 9.792182 9.743064 8.736156 7.547144 6.854053 5.289144
S_1 = var(speedw[,-1]);S_1
## 100m 200m 400m 800m 1500m Marathon
## 100m 0.09053826 0.09560635 0.09667244 0.06506402 0.08221980 0.08109987
## 200m 0.09560635 0.11467144 0.11386990 0.07492487 0.09601895 0.09331033
## 400m 0.09667244 0.11386990 0.13778886 0.08094090 0.09544299 0.10188073
## 800m 0.06506402 0.07492487 0.08094090 0.07352284 0.08645423 0.09430563
## 1500m 0.08221980 0.09601895 0.09544299 0.08645423 0.12384050 0.11845777
## Marathon 0.08109987 0.09331033 0.10188073 0.09430563 0.11845777 0.16671409
S_2 = var(speedm[,-1]);S_2
## 100m 200m 400m 800m 1500m Marathon
## 100m 0.04349790 0.04827718 0.04346323 0.03149513 0.04250343 0.04312562
## 200m 0.04827718 0.06484523 0.05586780 0.04323338 0.05352645 0.05629446
## 400m 0.04346323 0.05586780 0.06882169 0.04282214 0.05372066 0.05673423
## 800m 0.03149513 0.04323338 0.04282214 0.04688400 0.05230584 0.05419108
## 1500m 0.04250343 0.05352645 0.05372066 0.05230584 0.07291400 0.07365179
## Marathon 0.04312562 0.05629446 0.05673423 0.05419108 0.07365179 0.09792763
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p
## 100m 200m 400m 800m 1500m Marathon
## 100m 0.06701808 0.07194176 0.07006783 0.04827957 0.06236161 0.06211274
## 200m 0.07194176 0.08975834 0.08486885 0.05907912 0.07477270 0.07480239
## 400m 0.07006783 0.08486885 0.10330527 0.06188152 0.07458183 0.07930748
## 800m 0.04827957 0.05907912 0.06188152 0.06020342 0.06938003 0.07424836
## 1500m 0.06236161 0.07477270 0.07458183 0.06938003 0.09837725 0.09605478
## Marathon 0.06211274 0.07480239 0.07930748 0.07424836 0.09605478 0.13232086
(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))
## [,1]
## [1,] 676.7041
(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))
## [1] 13.78843