Women’s and Men’s Track Data

w_t <- read.csv("women_trackrec.txt", header=FALSE, sep = "")
colnames(w_t) = c("Country","100m","200m","400m","800m","1500m","3000m","Marathon")
m_t <- read.csv("men_trackrec.txt", header=FALSE, sep = "")
colnames(m_t) = c("Country","100m","200m","400m","800m","1500m","5000m","10000m", "Marathon")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

w_t <- w_t %>%
  select("Country","100m","200m","400m","800m","1500m","Marathon")

m_t <- m_t %>%
  select("Country","100m","200m","400m","800m","1500m","Marathon")

Problem A.

Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)

Decision Rule: If Tsq < critical value, fail to reject \(H_0\)

Conclusion: As Tsq > critical value, we reject \(H_0\). That is, the difference in means between the groups is significant.

n_1<-dim(w_t[,-1])[1]
n_2<-dim(m_t[,-1])[1]
p<-dim(w_t[,-1])[2]

# Testing H0:mu1-mu2=delta0
delta0=rep(0,p)
xbar_1 = colMeans(w_t[,-1]);xbar_1

##       100m       200m       400m       800m      1500m   Marathon 
##  11.357778  23.118519  51.989074   2.022407   4.189444 153.619259

xbar_2 = colMeans(m_t[,-1]);xbar_2

##       100m       200m       400m       800m      1500m   Marathon 
##  10.216852  20.541481  45.829074   1.768148   3.653333 133.478519

S_1 = var(w_t[,-1]);S_1

##                100m       200m       400m        800m      1500m
## 100m     0.15531572  0.3445608  0.8912960 0.027703564 0.08389119
## 200m     0.34456080  0.8630883  2.1928363 0.066165898 0.20276331
## 400m     0.89129602  2.1928363  6.7454576 0.181807932 0.50917683
## 800m     0.02770356  0.0661659  0.1818079 0.007546925 0.02141457
## 1500m    0.08389119  0.2027633  0.5091768 0.021414570 0.07418270
## Marathon 4.33417757 10.3849876 28.9037314 1.219654647 3.53983732
##            Marathon
## 100m       4.334178
## 200m      10.384988
## 400m      28.903731
## 800m       1.219655
## 1500m      3.539837
## Marathon 270.270150

S_2 = var(m_t[,-1]);S_2

##                 100m       200m       400m        800m       1500m
## 100m     0.048972921 0.11104437 0.25602156 0.008263871 0.025720126
## 200m     0.111044375 0.30090342 0.66681838 0.022929210 0.066193082
## 400m     0.256021558 0.66681838 2.06995573 0.057937876 0.168472956
## 800m     0.008263871 0.02292921 0.05793788 0.002751223 0.007130818
## 1500m    0.025720126 0.06619308 0.16847296 0.007130818 0.023033962
## Marathon 1.340138644 3.54103809 9.17885709 0.378904752 1.192563522
##            Marathon
## 100m      1.3401386
## 200m      3.5410381
## 400m      9.1788571
## 800m      0.3789048
## 1500m     1.1925635
## Marathon 80.1353563

S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p

##                100m       200m       400m        800m      1500m
## 100m     0.10214432 0.22780259  0.5736588 0.017983718 0.05480566
## 200m     0.22780259 0.58199588  1.4298274 0.044547554 0.13447820
## 400m     0.57365879 1.42982736  4.4077067 0.119872904 0.33882490
## 800m     0.01798372 0.04454755  0.1198729 0.005149074 0.01427269
## 1500m    0.05480566 0.13447820  0.3388249 0.014272694 0.04860833
## Marathon 2.83715811 6.96301282 19.0412942 0.799279700 2.36620042
##             Marathon
## 100m       2.8371581
## 200m       6.9630128
## 400m      19.0412942
## 800m       0.7992797
## 1500m      2.3662004
## Marathon 175.2027533

(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))

##          [,1]
## [1,] 680.4875

(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))

## [1] 13.78843

Problem B.

The first two principal components: \(Y_1 = -0.413(100m) - 0.420(200m) - 0.406(400m) - 0.422(800m) - 0.407(1500M) - 0.3784(Marathon)\)

\(Y_2 = -0.385(100m) - 0.382(200m) - 0.393(400m) + 0.286(800m) + 0.316(1500M) +0.608(Marathon)\)

#Women's Correlation
cor(w_t[,-1])

##               100m      200m      400m      800m     1500m  Marathon
## 100m     1.0000000 0.9410886 0.8707802 0.8091758 0.7815510 0.6689597
## 200m     0.9410886 1.0000000 0.9088096 0.8198258 0.8013282 0.6799537
## 400m     0.8707802 0.9088096 1.0000000 0.8057904 0.7197996 0.6769384
## 800m     0.8091758 0.8198258 0.8057904 1.0000000 0.9050509 0.8539900
## 1500m    0.7815510 0.8013282 0.7197996 0.9050509 1.0000000 0.7905565
## Marathon 0.6689597 0.6799537 0.6769384 0.8539900 0.7905565 1.0000000

#Women's Track Team Eigen Values and Eigen Vector
e <- eigen(cor(w_t[,-1]))$vec
l <- eigen(cor(w_t[,-1]))$val

e

##            [,1]       [,2]        [,3]       [,4]       [,5]        [,6]
## [1,] -0.4137534 -0.3846299 -0.12054812  0.5735489 -0.3939281 -0.42684706
## [2,] -0.4202377 -0.3824032 -0.06535754  0.1897090  0.3176902  0.73210645
## [3,] -0.4061645 -0.3929376  0.44254638 -0.5768634  0.1985368 -0.33555163
## [4,] -0.4226394  0.2858530 -0.09940074 -0.3891141 -0.6981406  0.30161790
## [5,] -0.4067668  0.3159300 -0.67477321 -0.1371183  0.4276702 -0.27875957
## [6,] -0.3783590  0.6081973  0.56581787  0.3634138  0.1848636 -0.02337895

## [1] 5.01728744 0.52410711 0.21557993 0.12351705 0.07381482 0.04569366

cor_new <- matrix(rep(0,36), ncol=6)
for (j in 1:6){
  for(k in 1:6) { 
    cor_new[j,k] = l[j]*e[j,k] }
}
cor_new

##             [,1]        [,2]        [,3]        [,4]         [,5]
## [1,] -2.07591983 -1.92979883 -0.60482458  2.87765948 -1.976450716
## [2,] -0.22024958 -0.20042024 -0.03425435  0.09942783  0.166503704
## [3,] -0.08756091 -0.08470946  0.09540412 -0.12436018  0.042800549
## [4,] -0.05220317  0.03530772 -0.01227769 -0.04806223 -0.086232268
## [5,] -0.03002542  0.02332032 -0.04980826 -0.01012136  0.031568398
## [6,] -0.01728861  0.02779076  0.02585429  0.01660570  0.008447093
##             [,6]
## [1,] -2.14161441
## [2,]  0.38370220
## [3,] -0.07233820
## [4,]  0.03725495
## [5,] -0.02057659
## [6,] -0.00106827

cum_pct <- c(0)
for(i in 1:6){
  cum_pct[i] = sum(l[1:i])/6
}
cum_pct

## [1] 0.8362146 0.9235658 0.9594957 0.9800819 0.9923844 1.0000000

#Easier Way
prcomp(w_t[,-1], scale=TRUE, center=TRUE)

## Standard deviations (1, .., p=6):
## [1] 2.2399302 0.7239524 0.4643059 0.3514499 0.2716888 0.2137607
## 
## Rotation (n x k) = (6 x 6):
##                 PC1        PC2         PC3        PC4        PC5
## 100m     -0.4137534  0.3846299 -0.12054812  0.5735489 -0.3939281
## 200m     -0.4202377  0.3824032 -0.06535754  0.1897090  0.3176902
## 400m     -0.4061645  0.3929376  0.44254638 -0.5768634  0.1985368
## 800m     -0.4226394 -0.2858530 -0.09940074 -0.3891141 -0.6981406
## 1500m    -0.4067668 -0.3159300 -0.67477321 -0.1371183  0.4276702
## Marathon -0.3783590 -0.6081973  0.56581787  0.3634138  0.1848636
##                  PC6
## 100m     -0.42684706
## 200m      0.73210645
## 400m     -0.33555163
## 800m      0.30161790
## 1500m    -0.27875957
## Marathon -0.02337895

prcomp((w_t[,-1]), scale=TRUE, center=TRUE)$sdev^2

## [1] 5.01728744 0.52410711 0.21557993 0.12351705 0.07381482 0.04569366

summary(prcomp((w_t[,-1]), scale=TRUE, center=TRUE))

## Importance of components:
##                           PC1     PC2     PC3     PC4    PC5     PC6
## Standard deviation     2.2399 0.72395 0.46431 0.35145 0.2717 0.21376
## Proportion of Variance 0.8362 0.08735 0.03593 0.02059 0.0123 0.00762
## Cumulative Proportion  0.8362 0.92357 0.95950 0.98008 0.9924 1.00000

Problem C.

From the cumulative percentage of the principal components, we see that the first two principal components explain up to 92% of the total variance.

Problem D.

From what I know about track, I was surprised not to see Kenya on the top 5. Also, Usain Bolt is from Jamaica!

women_PCA <-prcomp((w_t[,-1]), scale=TRUE, center=TRUE)
library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

a1 <- women_PCA$rotation[,1]
center <- women_PCA$center
scale <- women_PCA$scale
hm <- as.matrix(w_t[,-1])
women_PCA_score <- drop(scale(hm,center = center,scale = scale) %*% a1)
women_PCA_score<- data.frame(women_PCA_score)
women_PCA_score<- cbind(w_t$Country,women_PCA_score)
colnames(women_PCA_score) <- c("Country","Score")
arrange(women_PCA_score,desc(Score))

##    Country      Score
## 1      USA  3.2548441
## 2      GER  3.0195659
## 3      RUS  2.9393276
## 4      CHN  2.7372241
## 5      FRA  2.5164458
## 6      CZE  2.5023375
## 7      GBR  2.2829618
## 8      POL  2.1872859
## 9      ROM  1.9245159
## 10     AUS  1.8688929
## 11     ESP  1.7445837
## 12     CAN  1.6061855
## 13     ITA  1.4193755
## 14     NED  1.4085145
## 15     BEL  1.2634324
## 16     GRE  1.2430415
## 17     AUT  1.2139069
## 18     FIN  1.1811139
## 19     BRA  1.0726996
## 20     MEX  0.9914729
## 21     POR  0.9660574
## 22     SUI  0.9518668
## 23     SWE  0.7699433
## 24     IRL  0.7477501
## 25     NZL  0.6486268
## 26     TUR  0.6295431
## 27     KEN  0.6243322
## 28     HUN  0.5766679
## 29     IND  0.3980313
## 30     JPN  0.3426013
## 31     NOR  0.3065790
## 32     COL  0.1607321
## 33     DEN -0.1062300
## 34     ARG -0.3706994
## 35     TPE -0.4209546
## 36     ISR -0.4922723
## 37     CHI -0.8022347
## 38     THA -0.8178009
## 39     MYA -0.8908269
## 40    KORS -0.9417002
## 41     BER -1.1362494
## 42     MAS -1.4870939
## 43     PHI -1.5319740
## 44    KORN -1.6680405
## 45     INA -1.6787304
## 46     LUX -1.8114198
## 47     MRI -1.8163846
## 48     CRC -1.9513435
## 49     DOM -1.9529459
## 50     SIN -2.9237631
## 51     GUA -3.2483941
## 52     PNG -5.0931713
## 53     SAM -6.8184047
## 54     COK -7.5398251

Problem E.

The first two principal components: \(Y_1 = -0.413(100m) - 0.418(200m) - 0.403(400m) - 0.410(800m) - 0.421(1500M) - 0.3945(Marathon)\)

\(Y_2 = -0.514(100m) - 0.3917(200m) - 0.288(400m) + 0.3267(800m) + 0.349(1500M) +0.5201(Marathon)\)

From the cumulative percentage of the principal components, we see that the first two principal components explain up to 90% of the total variance.

From what I know about track, I was surprised not to see Kenya on the top 5. Also, Usain Bolt is from Jamaica! I was a bit surprised about China too. These rankings are more intuitive with what is expected though.

#Men's Correlation
cor(m_t[,-1])

##               100m      200m      400m      800m     1500m  Marathon
## 100m     1.0000000 0.9147554 0.8041147 0.7119388 0.7657919 0.6764873
## 200m     0.9147554 1.0000000 0.8449159 0.7969162 0.7950871 0.7211157
## 400m     0.8041147 0.8449159 1.0000000 0.7677488 0.7715522 0.7126823
## 800m     0.7119388 0.7969162 0.7677488 1.0000000 0.8957609 0.8069657
## 1500m    0.7657919 0.7950871 0.7715522 0.8957609 1.0000000 0.8777788
## Marathon 0.6764873 0.7211157 0.7126823 0.8069657 0.8777788 1.0000000

#Men's Track Team Eigen Values and Eigen Vector
eigen(cor(m_t[,-1]))$vec

##            [,1]       [,2]       [,3]        [,4]       [,5]        [,6]
## [1,] -0.4013958  0.5138807  0.4789843 -0.01687632 -0.2969661 -0.50686134
## [2,] -0.4180798  0.3917657  0.1620888 -0.17179519  0.5364074  0.57289582
## [3,] -0.4035829  0.2881765 -0.7458936  0.42347137 -0.1323242 -0.02966769
## [4,] -0.4104405 -0.3266872 -0.2985046 -0.65176976  0.2267076 -0.39938562
## [5,] -0.4208110 -0.3494365  0.1138092 -0.13633585 -0.6628797  0.47944005
## [6,] -0.3945481 -0.5201637  0.2930638  0.58947631  0.3402392 -0.15693998

eigen(cor(m_t[,-1]))$val

## [1] 4.95703332 0.49022363 0.20953295 0.18719579 0.09680142 0.05921289

#Easier Way
prcomp((m_t[,-1]), scale=TRUE, center=TRUE)

## Standard deviations (1, .., p=6):
## [1] 2.2264396 0.7001597 0.4577477 0.4326613 0.3111293 0.2433370
## 
## Rotation (n x k) = (6 x 6):
##                 PC1        PC2        PC3         PC4        PC5
## 100m     -0.4013958 -0.5138807  0.4789843 -0.01687632 -0.2969661
## 200m     -0.4180798 -0.3917657  0.1620888 -0.17179519  0.5364074
## 400m     -0.4035829 -0.2881765 -0.7458936  0.42347137 -0.1323242
## 800m     -0.4104405  0.3266872 -0.2985046 -0.65176976  0.2267076
## 1500m    -0.4208110  0.3494365  0.1138092 -0.13633585 -0.6628797
## Marathon -0.3945481  0.5201637  0.2930638  0.58947631  0.3402392
##                  PC6
## 100m      0.50686134
## 200m     -0.57289582
## 400m      0.02966769
## 800m      0.39938562
## 1500m    -0.47944005
## Marathon  0.15693998

prcomp((m_t[,-1]), scale=TRUE, center=TRUE)$sdev^2

## [1] 4.95703332 0.49022363 0.20953295 0.18719579 0.09680142 0.05921289

summary(prcomp((m_t[,-1]), scale=TRUE, center=TRUE))

## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6
## Standard deviation     2.2264 0.7002 0.45775 0.4327 0.31113 0.24334
## Proportion of Variance 0.8262 0.0817 0.03492 0.0312 0.01613 0.00987
## Cumulative Proportion  0.8262 0.9079 0.94280 0.9740 0.99013 1.00000

#Ranks
men_PCA <-prcomp((m_t[,-1]), scale=TRUE, center=TRUE)
library(plyr)
a1 <- men_PCA$rotation[,1]
center <- men_PCA$center
scale <- men_PCA$scale
hm <- as.matrix(m_t[,-1])
men_PCA_score <- drop(scale(hm,center = center,scale = scale) %*% a1)
men_PCA_score<- data.frame(men_PCA_score)
men_PCA_score<- cbind(m_t$Country,men_PCA_score)
colnames(men_PCA_score) <- c("Country","Score")
arrange(men_PCA_score,desc(Score))

##           Country       Score
## 1          U.S.A.  3.81443948
## 2    GreatBritain  2.76559122
## 3          Brazil  2.41332264
## 4          France  2.15246264
## 5       Australia  2.11903404
## 6           Italy  2.01870424
## 7           Kenya  1.95130554
## 8          Canada  1.91110348
## 9          Poland  1.90004707
## 10        Germany  1.80363466
## 11       Portugal  1.77060166
## 12         Russia  1.73556781
## 13          Japan  1.47793228
## 14        Belgium  1.44052828
## 15    Switzerland  1.40838536
## 16          Spain  1.34630146
## 17    Netherlands  1.17123014
## 18         Norway  1.14580691
## 19         Greece  1.11906527
## 20        Denmark  1.02624744
## 21        Hungary  0.98914934
## 22     NewZealand  0.96316485
## 23         Mexico  0.79549722
## 24          China  0.63512139
## 25          Chile  0.61460672
## 26        Finland  0.60506231
## 27         Sweden  0.55337404
## 28    Korea,South  0.53956987
## 29        Austria  0.44344557
## 30        Ireland  0.42106340
## 31  CzechRepublic  0.35182100
## 32        Romania  0.21683546
## 33      Argentina  0.09224532
## 34          India -0.05737107
## 35      Mauritius -0.31780244
## 36        Bermuda -0.62129242
## 37         Turkey -0.67538352
## 38       Columbia -0.70322500
## 39 DominicanRepub -0.81091756
## 40         Israel -0.85938431
## 41       Thailand -1.10789164
## 42     Luxembourg -1.11218279
## 43         Taiwan -1.24709871
## 44      Indonesia -1.47604901
## 45       Malaysia -1.76501320
## 46      Guatemala -1.94178678
## 47      CostaRica -1.97544573
## 48    Korea,North -2.07745301
## 49    Philippines -2.17510111
## 50      Singapore -2.83941185
## 51 PapuaNewGuinea -2.93899475
## 52 Myanmar(Burma) -3.22338149
## 53          Samoa -6.75947191
## 54    CookIslands -9.02760982

Problem F.

Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)

Where \(\mu\) is the population principal components.

Decision Rule: If Tsq < critical value, fail to reject \(H_0\)

Conclusion: As Tsq < critical value, we fail to reject \(H_0\). That is, the difference in population principal components between the groups is not significant.

# Testing H0:mu1-mu2=delta0
delta0=rep(0,1)
xbar_1 = mean(men_PCA_score$Score);xbar_1

## [1] 2.978424e-16

xbar_2 = mean(women_PCA_score$Score);xbar_2

## [1] -2.44347e-16

S_1 = var(men_PCA_score$Score);S_1

## [1] 4.957033

S_2 = var(women_PCA_score$Score);S_2

## [1] 5.017287

S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p

## [1] 4.98716

(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))

##              [,1]
## [1,] 1.591522e-30

(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))

## [1] 13.78843

Problem G.

From the factor analysis, we see that the unrotated factor analysis explains about 80% of the data. The 900m has the highest factor loading. Factor 2 does not explain much and increases the cumulative variance explained to 88.2%. Factor 2 contrasts with the shorter events.

After rotating, the first factor explains about 46% of the variance and the 200m explains the highest factor loading. Factor 2 explains about 42% which is a lot more than Factor 2 from the first factor analysis.

The two methods seem inconsistent. Yes, it does make a difference if R or S is factored. They would produce different high factor loads.

mlik.fa=factanal(w_t[,-1],factors=2,rotation="none")
mlik.fa

## 
## Call:
## factanal(x = w_t[, -1], factors = 2, rotation = "none")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m      0.926  -0.192 
## 200m      0.962  -0.263 
## 400m      0.905  -0.145 
## 800m      0.942   0.327 
## 1500m     0.889   0.209 
## Marathon  0.795   0.323 
## 
##                Factor1 Factor2
## SS loadings      4.910   0.382
## Proportion Var   0.818   0.064
## Cumulative Var   0.818   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

#specific variance
1-apply(mlik.fa$loadings^2,1,sum)

##        100m        200m        400m        800m       1500m    Marathon 
## 0.106129659 0.004996951 0.160313361 0.006039590 0.166955171 0.263530952

##Rotation (varimax)##
mlik.farot=factanal(w_t[,-1],factors=2,rotation="varimax") #rotmat
mlik.farot

## 
## Call:
## factanal(x = w_t[, -1], factors = 2, rotation = "varimax")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m     0.824   0.464  
## 200m     0.898   0.433  
## 400m     0.778   0.485  
## 800m     0.495   0.865  
## 1500m    0.533   0.741  
## Marathon 0.387   0.766  
## 
##                Factor1 Factor2
## SS loadings      2.770   2.522
## Proportion Var   0.462   0.420
## Cumulative Var   0.462   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

#specific variance
1-apply(mlik.farot$loadings^2,1,sum) #same as no rotation.

##        100m        200m        400m        800m       1500m    Marathon 
## 0.106129659 0.004996951 0.160313361 0.006039590 0.166955171 0.263530952

##Or use the correlation mattrix R
cor.w_t=cor(w_t[,-1])
factanal(covmat=cor.w_t,factors = 2,rotation="none",n.obs=54,method="mle")

## 
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "none",     method = "mle")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m      0.926  -0.192 
## 200m      0.962  -0.263 
## 400m      0.905  -0.145 
## 800m      0.942   0.327 
## 1500m     0.889   0.209 
## Marathon  0.795   0.323 
## 
##                Factor1 Factor2
## SS loadings      4.910   0.382
## Proportion Var   0.818   0.064
## Cumulative Var   0.818   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

factanal(covmat=cor.w_t,factors = 2,rotation="varimax",n.obs=54,method="mle")

## 
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "varimax",     method = "mle")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m     0.824   0.464  
## 200m     0.898   0.433  
## 400m     0.778   0.485  
## 800m     0.495   0.865  
## 1500m    0.533   0.741  
## Marathon 0.387   0.766  
## 
##                Factor1 Factor2
## SS loadings      2.770   2.522
## Proportion Var   0.462   0.420
## Cumulative Var   0.462   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

Problem H

From the factor analysis, we see that the unrotated factor analysis explains about 78.3% of the data. The 200m has the highest factor loading. Factor 2 does not explain much and increases the cumulative variance explained to 86.4%.

It seems that both methods for the men’s utilize the 200m as the highest factor loading.

mlik.fa=factanal(m_t[,-1],factors=2,rotation="none")
mlik.fa

## 
## Call:
## factanal(x = m_t[, -1], factors = 2, rotation = "none")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.154    0.009    0.250    0.164    0.027    0.208 
## 
## Loadings:
##          Factor1 Factor2
## 100m      0.914  -0.102 
## 200m      0.983  -0.158 
## 400m      0.865         
## 800m      0.859   0.312 
## 1500m     0.881   0.445 
## Marathon  0.797   0.396 
## 
##                Factor1 Factor2
## SS loadings      4.700   0.488
## Proportion Var   0.783   0.081
## Cumulative Var   0.783   0.865
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 5.5 on 4 degrees of freedom.
## The p-value is 0.239

#specific variance
1-apply(mlik.fa$loadings^2,1,sum)

##        100m        200m        400m        800m       1500m    Marathon 
## 0.153832828 0.008933034 0.250451379 0.164335242 0.026552134 0.208375301

##Rotation (varimax)##
mlik.farot=factanal(m_t[,-1],factors=2,rotation="varimax") #rotmat
mlik.farot

## 
## Call:
## factanal(x = m_t[, -1], factors = 2, rotation = "varimax")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.154    0.009    0.250    0.164    0.027    0.208 
## 
## Loadings:
##          Factor1 Factor2
## 100m     0.806   0.443  
## 200m     0.895   0.436  
## 400m     0.691   0.522  
## 800m     0.523   0.750  
## 1500m    0.464   0.870  
## Marathon 0.424   0.782  
## 
##                Factor1 Factor2
## SS loadings      2.598   2.589
## Proportion Var   0.433   0.432
## Cumulative Var   0.433   0.865
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 5.5 on 4 degrees of freedom.
## The p-value is 0.239

#specific variance
1-apply(mlik.farot$loadings^2,1,sum) #same as no rotation.

##        100m        200m        400m        800m       1500m    Marathon 
## 0.153832828 0.008933034 0.250451379 0.164335242 0.026552134 0.208375301

##Or use the correlation mattrix R
cor.w_t=cor(w_t[,-1])
factanal(covmat=cor.w_t,factors = 2,rotation="none",n.obs=54,method="mle")

## 
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "none",     method = "mle")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m      0.926  -0.192 
## 200m      0.962  -0.263 
## 400m      0.905  -0.145 
## 800m      0.942   0.327 
## 1500m     0.889   0.209 
## Marathon  0.795   0.323 
## 
##                Factor1 Factor2
## SS loadings      4.910   0.382
## Proportion Var   0.818   0.064
## Cumulative Var   0.818   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

factanal(covmat=cor.w_t,factors = 2,rotation="varimax",n.obs=54,method="mle")

## 
## Call:
## factanal(factors = 2, covmat = cor.w_t, n.obs = 54, rotation = "varimax",     method = "mle")
## 
## Uniquenesses:
##     100m     200m     400m     800m    1500m Marathon 
##    0.106    0.005    0.160    0.006    0.167    0.264 
## 
## Loadings:
##          Factor1 Factor2
## 100m     0.824   0.464  
## 200m     0.898   0.433  
## 400m     0.778   0.485  
## 800m     0.495   0.865  
## 1500m    0.533   0.741  
## Marathon 0.387   0.766  
## 
##                Factor1 Factor2
## SS loadings      2.770   2.522
## Proportion Var   0.462   0.420
## Cumulative Var   0.462   0.882
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.41 on 4 degrees of freedom.
## The p-value is 0.171

Problem I

The first principal component is made up of equal parts of every variable. It makes up about 81% of the explained variance. The second prinicipal component is made up of two groups - short and long distances. This prinicpal component makes up 90.3% of the explained variance. It is easier to use the speed variable because it is standardized.

speedw <- w_t
speedw[,2]<-100/speedw[,2]
speedw[,3]<-200/speedw[,3]
speedw[,4]<-400/speedw[,4]
speedw[,5]<-800/speedw[,5]/60
speedw[,6]<-1500/speedw[,6]/60
speedw[,7]<-42195/speedw[,7]/60

head(speedw)

##   Country     100m     200m     400m     800m    1500m Marathon
## 1     ARG 8.643042 8.718396 7.619048 6.504065 5.882353 4.678353
## 2     AUS 8.992806 8.996851 8.225375 6.734007 6.218905 4.900355
## 3     AUT 8.968610 8.810573 7.902015 6.872852 6.172840 4.556203
## 4     BEL 8.976661 8.896797 7.774538 6.768190 6.127451 4.916113
## 5     BER 8.726003 8.676790 7.504690 6.441224 5.827506 4.037490
## 6     BRA 8.952551 8.849558 7.902015 6.768190 5.995204 4.770708

cov(speedw[,-1])

##                100m       200m       400m       800m      1500m   Marathon
## 100m     0.09053826 0.09560635 0.09667244 0.06506402 0.08221980 0.08109987
## 200m     0.09560635 0.11467144 0.11386990 0.07492487 0.09601895 0.09331033
## 400m     0.09667244 0.11386990 0.13778886 0.08094090 0.09544299 0.10188073
## 800m     0.06506402 0.07492487 0.08094090 0.07352284 0.08645423 0.09430563
## 1500m    0.08221980 0.09601895 0.09544299 0.08645423 0.12384050 0.11845777
## Marathon 0.08109987 0.09331033 0.10188073 0.09430563 0.11845777 0.16671409

prcomp((speedw[,-1]), scale=TRUE, center=TRUE)

## Standard deviations (1, .., p=6):
## [1] 2.2394797 0.7372135 0.4298389 0.3581696 0.2840273 0.2180105
## 
## Rotation (n x k) = (6 x 6):
##                PC1        PC2         PC3        PC4        PC5
## 100m     0.4110369 -0.3985261  0.19870602 -0.5452687  0.3871234
## 200m     0.4194769 -0.3817117  0.11579002 -0.1503919 -0.2699084
## 400m     0.4059147 -0.3822980 -0.55004814  0.4729653 -0.2383517
## 800m     0.4213789  0.2724519  0.07729831  0.4566153  0.6932049
## 1500m    0.4105739  0.3252716  0.61695997  0.1892135 -0.4742597
## Marathon 0.3797236  0.6076921 -0.50787892 -0.4605094 -0.1225483
##                  PC6
## 100m      0.43076577
## 200m     -0.75462556
## 400m      0.32560833
## 800m     -0.23066019
## 1500m     0.29028244
## Marathon -0.03863049

summary(prcomp((speedw[,-1]), scale=TRUE, center=TRUE))

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.2395 0.73721 0.42984 0.35817 0.28403 0.21801
## Proportion of Variance 0.8359 0.09058 0.03079 0.02138 0.01345 0.00792
## Cumulative Proportion  0.8359 0.92646 0.95725 0.97863 0.99208 1.00000

e <- eigen(cor(speedw[,-1]))$vec
l <- eigen(cor(speedw[,-1]))$val

e

##            [,1]       [,2]        [,3]       [,4]       [,5]        [,6]
## [1,] -0.4110369  0.3985261 -0.19870602  0.5452687 -0.3871234 -0.43076577
## [2,] -0.4194769  0.3817117 -0.11579002  0.1503919  0.2699084  0.75462556
## [3,] -0.4059147  0.3822980  0.55004814 -0.4729653  0.2383517 -0.32560833
## [4,] -0.4213789 -0.2724519 -0.07729831 -0.4566153 -0.6932049  0.23066019
## [5,] -0.4105739 -0.3252716 -0.61695997 -0.1892135  0.4742597 -0.29028244
## [6,] -0.3797236 -0.6076921  0.50787892  0.4605094  0.1225483  0.03863049

## [1] 5.01526917 0.54348379 0.18476151 0.12828548 0.08067149 0.04752856

Problem J

It seems that the first two principal components for women and men are about the same. The first principal component is made up of equal parts of every variable. It makes up about 81% of the explained variance. The second prinicipal component is made up of two groups - short and long distances. This prinicpal component makes up 90.3% of the explained variance. It is easier to use the speed variable because it is standardized.

speedm <- m_t
speedm[,2]<-100/speedm[,2]
speedm[,3]<-200/speedm[,3]
speedm[,4]<-400/speedm[,4]
speedm[,5]<-800/speedm[,5]/60
speedm[,6]<-1500/speedm[,6]/60
speedm[,7]<-42195/speedm[,7]/60

head(speedm)

##     Country      100m      200m     400m     800m    1500m Marathon
## 1 Argentina  9.775171  9.818360 8.661758 7.532957 6.793478 5.427568
## 2 Australia 10.070493  9.970090 9.013069 7.662835 7.082153 5.515254
## 3   Austria  9.852217  9.779951 8.733624 7.532957 6.983240 5.318787
## 4   Belgium  9.861933  9.905894 8.884940 7.707129 7.002801 5.528695
## 5   Bermuda  9.737098  9.852217 8.837826 7.448790 6.756757 4.804605
## 6    Brazil 10.000000 10.055304 9.031384 7.843137 7.002801 5.579135

cov(speedm[,-1])

##                100m       200m       400m       800m      1500m   Marathon
## 100m     0.04349790 0.04827718 0.04346323 0.03149513 0.04250343 0.04312562
## 200m     0.04827718 0.06484523 0.05586780 0.04323338 0.05352645 0.05629446
## 400m     0.04346323 0.05586780 0.06882169 0.04282214 0.05372066 0.05673423
## 800m     0.03149513 0.04323338 0.04282214 0.04688400 0.05230584 0.05419108
## 1500m    0.04250343 0.05352645 0.05372066 0.05230584 0.07291400 0.07365179
## Marathon 0.04312562 0.05629446 0.05673423 0.05419108 0.07365179 0.09792763

prcomp((speedm[,-1]), scale=TRUE, center=TRUE)

## Standard deviations (1, .., p=6):
## [1] 2.2135294 0.7218360 0.4710989 0.4387464 0.3252378 0.2429571
## 
## Rotation (n x k) = (6 x 6):
##                PC1        PC2         PC3         PC4        PC5
## 100m     0.4013532  0.5044835 -0.48337352  0.03846601  0.3388272
## 200m     0.4180942  0.3929087 -0.16824214  0.17014061 -0.5665582
## 400m     0.4026901  0.3010357  0.73492220 -0.43653013  0.1240380
## 800m     0.4111819 -0.3304516  0.30737166  0.63871947 -0.2199149
## 1500m    0.4217625 -0.3523522 -0.08502157  0.14181362  0.6319581
## Marathon 0.3936996 -0.5168620 -0.31020634 -0.59240216 -0.3179448
##                  PC6
## 100m     -0.48422984
## 200m      0.54090531
## 400m     -0.03411095
## 800m     -0.41343524
## 1500m     0.52082007
## Marathon -0.17203808

summary(prcomp((speedm[,-1]), scale=TRUE, center=TRUE))

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.2135 0.72184 0.47110 0.43875 0.32524 0.24296
## Proportion of Variance 0.8166 0.08684 0.03699 0.03208 0.01763 0.00984
## Cumulative Proportion  0.8166 0.90346 0.94045 0.97253 0.99016 1.00000

e <- eigen(cor(speedm[,-1]))$vec
l <- eigen(cor(speedm[,-1]))$val

e

##            [,1]       [,2]        [,3]        [,4]       [,5]        [,6]
## [1,] -0.4013532  0.5044835 -0.48337352 -0.03846601 -0.3388272 -0.48422984
## [2,] -0.4180942  0.3929087 -0.16824214 -0.17014061  0.5665582  0.54090531
## [3,] -0.4026901  0.3010357  0.73492220  0.43653013 -0.1240380 -0.03411095
## [4,] -0.4111819 -0.3304516  0.30737166 -0.63871947  0.2199149 -0.41343524
## [5,] -0.4217625 -0.3523522 -0.08502157 -0.14181362 -0.6319581  0.52082007
## [6,] -0.3936996 -0.5168620 -0.31020634  0.59240216  0.3179448 -0.17203808

## [1] 4.89971248 0.52104716 0.22193414 0.19249842 0.10577963 0.05902817

Problem K.

Hypothesis Test \(H_0 = \mu_1 - \mu_2 = 0\) \(H_A = \mu_1 - \mu_2 \neq 0\)

Where \(\mu\) are the two principal components.

Decision Rule: If Tsq < critical value, fail to reject \(H_0\)

Conclusion: As Tsq < critical value, we fail to reject \(H_0\). That is, using the first two principal components, the difference between the two populations is not significant.

n_1<-dim(speedw[,-1])[1]
n_2<-dim(speedm[,-1])[1]
p<-dim(speedw[,-1])[2]

# Testing H0:mu1-mu2=delta0
delta0=rep(0,p)
xbar_1 = colMeans(speedw[,-1]);xbar_1

##     100m     200m     400m     800m    1500m Marathon 
## 8.814772 8.664408 7.712067 6.604214 5.989687 4.620264

xbar_2 = colMeans(speedm[,-1]);xbar_2

##     100m     200m     400m     800m    1500m Marathon 
## 9.792182 9.743064 8.736156 7.547144 6.854053 5.289144

S_1 = var(speedw[,-1]);S_1

##                100m       200m       400m       800m      1500m   Marathon
## 100m     0.09053826 0.09560635 0.09667244 0.06506402 0.08221980 0.08109987
## 200m     0.09560635 0.11467144 0.11386990 0.07492487 0.09601895 0.09331033
## 400m     0.09667244 0.11386990 0.13778886 0.08094090 0.09544299 0.10188073
## 800m     0.06506402 0.07492487 0.08094090 0.07352284 0.08645423 0.09430563
## 1500m    0.08221980 0.09601895 0.09544299 0.08645423 0.12384050 0.11845777
## Marathon 0.08109987 0.09331033 0.10188073 0.09430563 0.11845777 0.16671409

S_2 = var(speedm[,-1]);S_2

##                100m       200m       400m       800m      1500m   Marathon
## 100m     0.04349790 0.04827718 0.04346323 0.03149513 0.04250343 0.04312562
## 200m     0.04827718 0.06484523 0.05586780 0.04323338 0.05352645 0.05629446
## 400m     0.04346323 0.05586780 0.06882169 0.04282214 0.05372066 0.05673423
## 800m     0.03149513 0.04323338 0.04282214 0.04688400 0.05230584 0.05419108
## 1500m    0.04250343 0.05352645 0.05372066 0.05230584 0.07291400 0.07365179
## Marathon 0.04312562 0.05629446 0.05673423 0.05419108 0.07365179 0.09792763

S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2);S_p

##                100m       200m       400m       800m      1500m   Marathon
## 100m     0.06701808 0.07194176 0.07006783 0.04827957 0.06236161 0.06211274
## 200m     0.07194176 0.08975834 0.08486885 0.05907912 0.07477270 0.07480239
## 400m     0.07006783 0.08486885 0.10330527 0.06188152 0.07458183 0.07930748
## 800m     0.04827957 0.05907912 0.06188152 0.06020342 0.06938003 0.07424836
## 1500m    0.06236161 0.07477270 0.07458183 0.06938003 0.09837725 0.09605478
## Marathon 0.06211274 0.07480239 0.07930748 0.07424836 0.09605478 0.13232086

(T2 = (n_1*n_2/(n_1+n_2))*t(xbar_1-xbar_2-delta0)%*%solve(S_p)%*%(xbar_1-xbar_2-delta0))

##          [,1]
## [1,] 676.7041

(crit = (n_1+n_2-2)*p/(n_1+n_2-p-1)*qf(.95,p,n_1+n_2-p-1))

## [1] 13.78843

STAT 488 Multi HW5

Kajal Chokshi

4/2/2019

Women’s and Men’s Track Data

Problem A.

Problem B.

Problem C.

Problem D.

Problem E.

Problem F.

Problem G.

Problem H

Problem I

Problem J

Problem K.