Profile Likelihood

The Class Lecture is here.

library(tidyverse)
library(dplyr)
library(tinytex)
library(knitr)
library(ggplot2)

Use the optim function to find the minimum of a quadratic function, f(x) = (x − 3)^2:

# Define the function we want to minimize
f <- function(x) {
  (x - 3)^2
}

# Use optim to find the minimum
result <- optim(par = 0,       # Initial guess for x
                fn = f)        # Function to minimize

# Print the result
result$par  # This gives the value of x that minimizes f(x)
## [1] 2.9
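
Note: with a single parameter, optim warns that the default Nelder-Mead method is unreliable and suggests method = "Brent" (or optimise()), which needs a finite search interval. A small sketch of the same minimisation with Brent (the bounds -10 and 10 are chosen arbitrarily for illustration):

optim(par = 0, fn = f, method = "Brent", lower = -10, upper = 10)$par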

Question: find the maximum likelihood estimate (MLE) of the mean parameter μ for a normally distributed dataset.

Maximizing the log-likelihood (our goal) is equivalent to minimizing the negative log-likelihood:

-sum(dnorm(data, mu, sig, log=TRUE))

Define the plik Function:

plik <- function(mu, data){
  # profile out sigma: for this mu, find the sigma that minimizes the negative log-likelihood
  sig <- optim(par = 1,  # the initial value for sigma is set at 1
               fn = function(sigma, data){
                 -sum(dnorm(data, mu, sigma, log = TRUE))},
               data = data)$par
  # if passing data = data through optim causes problems, drop the data argument from the
  # inner function and let it pick data up from the enclosing environment instead
  -sum(dnorm(data, mu, sig, log = TRUE))
}
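
As a quick check, the inner optim call can be avoided here because, for a fixed mu, the sigma that maximizes the normal likelihood has the closed form sqrt(mean((data - mu)^2)). A minimal sketch of this alternative (plik_closed is just an illustrative name):

plik_closed <- function(mu, data){
  sig <- sqrt(mean((data - mu)^2))  # closed-form profile MLE of sigma for a fixed mu
  -sum(dnorm(data, mu, sig, log = TRUE))
}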

Generate Random Data:

set.seed(47)
x <- round(rnorm(n = 50, mean = 35, sd = 4),2)

Call optim to find the value of mu that maximizes the profile likelihood for the data in x, starting the optimization from an initial guess of 15.

optim(15, plik, data = x)
## $par
## [1] 34.44287
## 
## $value
## [1] 134.3046
## 
## $counts
## function gradient 
##       32       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Define a grid of potential mu values over which to evaluate the profile likelihood:

m <- seq(15, 50, length.out = 250)

Calculate the Profile Log-Likelihood for Each mu:

mplik <- NULL
for(i in 1:250){
  mplik[i] = -plik(m[i], x)
}
mplik
##   [1] -220.1395 -219.7885 -219.4351 -219.0795 -218.7209 -218.3602 -217.9968
##   [8] -217.6309 -217.2625 -216.8914 -216.5176 -216.1411 -215.7619 -215.3799
##  [15] -214.9951 -214.6074 -214.2168 -213.8233 -213.4268 -213.0272 -212.6246
##  [22] -212.2188 -211.8099 -211.3978 -210.9825 -210.5638 -210.1417 -209.7163
##  [29] -209.2874 -208.8550 -208.4190 -207.9794 -207.5361 -207.0891 -206.6382
##  [36] -206.1840 -205.7249 -205.2624 -204.7956 -204.3248 -203.8499 -203.3707
##  [43] -202.8872 -202.3993 -201.9070 -201.4100 -200.9084 -200.4021 -199.8911
##  [50] -199.3752 -198.8545 -198.3287 -197.7976 -197.2614 -196.7199 -196.1731
##  [57] -195.6207 -195.0628 -194.4992 -193.9297 -193.3545 -192.7732 -192.1858
##  [64] -191.5922 -190.9923 -190.3859 -189.7731 -189.1535 -188.5271 -187.8939
##  [71] -187.2536 -186.6062 -185.9517 -185.2893 -184.6196 -183.9422 -183.2570
##  [78] -182.5639 -181.8628 -181.1535 -180.4357 -179.7095 -178.9747 -178.2312
##  [85] -177.4789 -176.7175 -175.9471 -175.1674 -174.3785 -173.5801 -172.7722
##  [92] -171.9547 -171.1275 -170.2906 -169.4439 -168.5875 -167.7213 -166.8452
##  [99] -165.9597 -165.0642 -164.1593 -163.2451 -162.3218 -161.3895 -160.4487
## [106] -159.4997 -158.5430 -157.5791 -156.6087 -155.6325 -154.6514 -153.6664
## [113] -152.6785 -151.6891 -150.6997 -149.7118 -148.7273 -147.7483 -146.7770
## [120] -145.8158 -144.8677 -143.9356 -143.0226 -142.1325 -141.2688 -140.4357
## [127] -139.6372 -138.8776 -138.1616 -137.4935 -136.8778 -136.3190 -135.8212
## [134] -135.3885 -135.0244 -134.7322 -134.5141 -134.3725 -134.3085 -134.3228
## [141] -134.4153 -134.5850 -134.8303 -135.1492 -135.5388 -135.9957 -136.5162
## [148] -137.0963 -137.7317 -138.4178 -139.1503 -139.9245 -140.7362 -141.5810
## [155] -142.4548 -143.3537 -144.2740 -145.2123 -146.1656 -147.1307 -148.1051
## [162] -149.0864 -150.0724 -151.0610 -152.0506 -153.0396 -154.0266 -155.0103
## [169] -155.9897 -156.9639 -157.9320 -158.8933 -159.8472 -160.7933 -161.7310
## [176] -162.6600 -163.5801 -164.4909 -165.3923 -166.2842 -167.1664 -168.0388
## [183] -168.9015 -169.7543 -170.5976 -171.4308 -172.2544 -173.0684 -173.8728
## [190] -174.6677 -175.4533 -176.2297 -176.9967 -177.7547 -178.5038 -179.2441
## [197] -179.9757 -180.6988 -181.4135 -182.1198 -182.8180 -183.5082 -184.1905
## [204] -184.8651 -185.5320 -186.1914 -186.8435 -187.4883 -188.1260 -188.7567
## [211] -189.3806 -189.9977 -190.6082 -191.2122 -191.8098 -192.4011 -192.9862
## [218] -193.5653 -194.1393 -194.7057 -195.2672 -195.8231 -196.3735 -196.9183
## [225] -197.4579 -197.9922 -198.5214 -199.0453 -199.5642 -200.0783 -200.5876
## [232] -201.0921 -201.5920 -202.0874 -202.5780 -203.0643 -203.5462 -204.0239
## [239] -204.4973 -204.9665 -205.4318 -205.8929 -206.3522 -206.8033 -207.2529
## [246] -207.6984 -208.1404 -208.5787 -209.0134 -209.4445

Find the Maximum Likelihood Estimate of mu:

mu_lik = as.data.frame(cbind(m, mplik)) %>%
  filter(mplik == max(mplik)) %>% #keep only the row where mplik is maximized.
  select(m)
mu_lik
ggplot()+
  geom_line(aes(x = m, y = mplik))+
  theme_bw()+
  xlab(expression(mu)) + ylab(expression(l(mu))) +
  geom_vline(xintercept = as.numeric(mu_lik)) # vertical line at the MLE of mu

Gamma Distribution Problem

y<-c(39.91,43.97,23.19,38.87,39.81,22.70,27.72,
     44.67,28.64,38.58,37.38,49.90,41.48,41.73,
     51.45,35.72,33.41,76.60,32.02,30.35,38.29,
     38.71,31.79,39.64,26.49)
mean(y)
## [1] 38.1208
length(y)  
## [1] 25
plik <- function(alpha, data){
  beta <- optim(par = 1, fn = function(beta_1){
    -sum(dgamma(data, shape = alpha, scale = beta_1, log = TRUE))})$par
  -sum(dgamma(data, shape = alpha, scale = beta, log = TRUE))
}
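
As with the normal example, the inner optim call has a closed-form check: for a fixed shape alpha, the MLE of the gamma scale parameter is mean(data)/alpha. A minimal sketch (plik_closed is an illustrative name):

plik_closed <- function(alpha, data){
  beta_hat <- mean(data) / alpha  # closed-form scale MLE for a fixed shape alpha
  -sum(dgamma(data, shape = alpha, scale = beta_hat, log = TRUE))
}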

optim(35,plik,data=y)
## $par
## [1] 37.91715
## 
## $value
## [1] 100.4767
## 
## $counts
## function gradient 
##       64       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
m<-seq(20,50,length.out=250)
mplike<-NULL
for(i in 1:250){
  mplike[i]=-plik(m[i],y)
}

mulike<- as.data.frame(cbind(m,mplike)) %>%
  filter(mplike==max(mplike)) %>%
  select(m)%>%
  as.numeric()

ggplot() + geom_line(aes(x = m, y = mplike)) +
  theme_bw() +
  xlab(expression(alpha)) + ylab(expression(l(alpha))) +
  geom_vline(xintercept = mulike, col = "green")

PCA

Class lecture

Fit a model with m = 3 factors using the PCA and MLE methods.

Set up the dimensions and the correlation matrix:

n <- 276
m <- 3
p <- 6 

crmat <- matrix(c(
1, 0.505, 0.569, 0.602, 0.621, 0.603,
0.505, 1, 0.422, 0.467, 0.482, 0.450,
0.569, 0.422, 1, 0.926, 0.877, 0.878,
0.602, 0.467, 0.926, 1, 0.874, 0.894,
0.621, 0.482, 0.877, 0.874, 1, 0.937,
0.603, 0.450, 0.878, 0.894, 0.937, 1
), nrow = 6, ncol = 6, byrow = TRUE)

Eigenvalues & Eigenvectors

eg <- eigen(crmat)
lmda <- eg$values
lmda
## [1] 4.45644850 0.78240991 0.45842506 0.16883257 0.07908774 0.05479622
eL <- eg$vectors[ , 1:m]
eL
##            [,1]       [,2]        [,3]
## [1,] -0.3507933  0.3956532  0.84666989
## [2,] -0.2862040  0.8146406 -0.50202982
## [3,] -0.4399784 -0.2632446 -0.11051604
## [4,] -0.4468851 -0.1972029 -0.09896293
## [5,] -0.4488806 -0.1614667 -0.06586761
## [6,] -0.4474933 -0.2134503 -0.06906640

Loadings: L_i = sqrt(λ_i) · e_i, i.e. each eigenvector scaled by the square root of its eigenvalue:

L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m) {
L[, i] <- sqrt(lmda[i]) * eL[, i]
}
L
##            [,1]       [,2]        [,3]
## [1,] -0.7405352  0.3499708  0.57325558
## [2,] -0.6041852  0.7205817 -0.33990980
## [3,] -0.9288076 -0.2328502 -0.07482720
## [4,] -0.9433880 -0.1744338 -0.06700492
## [5,] -0.9476006 -0.1428236 -0.04459705
## [6,] -0.9446719 -0.1888052 -0.04676285

Communalities and Specific Variance

cm <- apply(X = L^2,MARGIN = 1, FUN = sum)
cm
## [1] 0.9994939 0.9998164 0.9225019 0.9248977 0.9203343 0.9302392
sv <- 1 - cm
sv
## [1] 0.0005060766 0.0001835913 0.0774981111 0.0751022503 0.0796656638
## [6] 0.0697608285

Proportion Explained and Cumulative Proportion

prop <- diag(t(L) %*% L) / p # only the m = 3 retained columns are included
cumsum(prop)
## [1] 0.7427414 0.8731431 0.9495472
pr <- lmda / p # here all p eigenvalues are used, so every component is included, not just the m factors
cumsum(pr)
## [1] 0.7427414 0.8731431 0.9495472 0.9776860 0.9908673 1.0000000

Rotation

varimax(L)
## $loadings
## 
## Loadings:
##      [,1]   [,2]   [,3]  
## [1,] -0.354  0.244  0.903
## [2,] -0.234  0.949  0.211
## [3,] -0.921  0.166  0.218
## [4,] -0.903  0.214  0.252
## [5,] -0.887  0.229  0.284
## [6,] -0.907  0.192  0.264
## 
##                 [,1]  [,2]  [,3]
## SS loadings    3.454 1.123 1.120
## Proportion Var 0.576 0.187 0.187
## Cumulative Var 0.576 0.763 0.950
## 
## $rotmat
##           [,1]       [,2]       [,3]
## [1,] 0.8546542 -0.3385584 -0.3936299
## [2,] 0.4824845  0.7979214  0.3612896
## [3,] 0.1917680 -0.4986980  0.8452960
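
The rotated loadings returned by varimax are just the unrotated loadings post-multiplied by the rotation matrix, and an orthogonal rotation leaves the communalities unchanged. A quick check of the first point (assuming the conventions of stats::varimax):

rot <- varimax(L)
max(abs(unclass(rot$loadings) - L %*% rot$rotmat))  # should be essentially zero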

MLE: factanal performs maximum likelihood factor analysis on a covariance (or correlation) matrix.

fit <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "none")
fit
## 
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "none")
## 
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
## 
## Loadings:
##      Factor1 Factor2 Factor3
## [1,]  0.622   0.284   0.143 
## [2,]  0.484   0.660   0.121 
## [3,]  0.937                 
## [4,]  0.992          -0.105 
## [5,]  0.920           0.367 
## [6,]  0.926           0.232 
## 
##                Factor1 Factor2 Factor3
## SS loadings      4.187   0.520   0.236
## Proportion Var   0.698   0.087   0.039
## Cumulative Var   0.698   0.784   0.824
## 
## The degrees of freedom for the model is 0 and the fit was 8e-04

Numerical Optimization

Newton Raphson: One Dimensional Problem

Class Lecture Slide
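
In one dimension the method starts from a guess theta_0 and repeatedly applies the update theta_1 = theta_0 - l'(theta_0)/l''(theta_0) until |theta_1 - theta_0| is below a tolerance; in the loops below, l1 plays the role of the first derivative and l2 the second. A generic sketch of the same loop (l1_fun and l2_fun are assumed to be user-supplied derivative functions):

newton_1d <- function(theta0, l1_fun, l2_fun, tol = 0.001){
  eps <- Inf
  while(eps > tol){
    theta1 <- theta0 - l1_fun(theta0) / l2_fun(theta0)  # Newton-Raphson update
    eps <- abs(theta1 - theta0)
    theta0 <- theta1
  }
  theta0
}
# e.g. for Example 01 below: newton_1d(10, function(t) 257.59 - 15*t, function(t) -15)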

Example 01

theta0 <- 10
eps <- 10
result <- c()
while( eps > 0.001) {
  l1 <- 257.59 - 15* theta0
  l2 <- -15
  theta1 <- theta0 - l1/l2
  eps <- abs(theta1 - theta0)
  result <- rbind(result, c(theta0, theta1, l1, eps ))
  theta0 <- theta1
}
result
##          [,1]     [,2]   [,3]     [,4]
## [1,] 10.00000 17.17267 107.59 7.172667
## [2,] 17.17267 17.17267   0.00 0.000000

Example 02

sigma_0_sq<-2.5
eps<-10
result<-c()
while(eps>0.001){
  l1<- -15/(2*sigma_0_sq)+124.88/(2*((sigma_0_sq)^2))
  l2<-15/(2*sigma_0_sq^2)- 124.88/sigma_0_sq^3
  sigma1<-sigma_0_sq-l1/l2
  eps<-abs(sigma1-sigma_0_sq)
  result<-rbind(result,c(sigma_0_sq,sigma1,l1,eps))
  sigma_0_sq<-sigma1
}
result
##          [,1]     [,2]         [,3]         [,4]
## [1,] 2.500000 3.529162 6.990400e+00 1.029162e+00
## [2,] 3.529162 4.819141 2.888103e+00 1.289979e+00
## [3,] 4.819141 6.247261 1.132290e+00 1.428120e+00
## [4,] 6.247261 7.495147 3.993398e-01 1.247886e+00
## [5,] 7.495147 8.174777 1.108349e-01 6.796305e-01
## [6,] 8.174777 8.319985 1.689693e-02 1.452075e-01
## [7,] 8.319985 8.325326 5.795058e-04 5.341750e-03
## [8,] 8.325326 8.325333 7.431753e-07 6.868025e-06

Newton Raphson: Two Dimensional Problem

Slide Given
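
In more than one dimension the scalar derivatives are replaced by the gradient vector g and the Hessian matrix H of the log-likelihood, and the update becomes theta_1 = theta_0 - H^(-1) g (theta1 <- theta0 - solve(H) %*% g in the code below), iterated until the largest change in the parameters falls below the tolerance.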

dat3 <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/3. Newton_Raphson/NewtonMethod3.txt")

n <- nrow(dat3)
result <- c()
mu0 <- 30
sigma2_0 <- 5
eps <- 5

while (eps > 0.001){
#g matrix
g1 <- (- sum(dat3 - mu0)) / sigma2_0
g2 <- n/(2*sigma2_0) - sum((dat3 - mu0)^2) / (2 * sigma2_0^2)
g <- matrix(c(g1,g2), nrow = 2)

# H matrix
h11 <- n/ sigma2_0
h12 = h21 <- sum(dat3)/(sigma2_0^2) - (n * mu0) / (sigma2_0^2)
h22 <- (-n/(2 * sigma2_0^2)) + sum((dat3 - mu0)^2) / (sigma2_0^3)
H <-matrix(c(h11, h12, h21, h22), nrow = 2, byrow = T)

#Theta
theta0 <- matrix(c(mu0, sigma2_0), nrow = 2)
theta1 <- theta0 - solve(H) %*% g

eps <- abs(max(theta1 - theta0))
result <- rbind(result,c(theta1[1,1], theta1[2,1], eps))
mu0 <- theta1[1,1]
sigma2_0 <- theta1[2,1]
}

result
##           [,1]      [,2]         [,3]
##  [1,] 38.08833  1.690092 8.088330e+00
##  [2,] 35.07835  1.801138 1.110456e-01
##  [3,] 34.96474  2.635538 8.343997e-01
##  [4,] 34.91048  3.812803 1.177266e+00
##  [5,] 34.88506  5.412478 1.599675e+00
##  [6,] 34.87362  7.456511 2.044032e+00
##  [7,] 34.86886  9.800152 2.343641e+00
##  [8,] 34.86717 11.992387 2.192235e+00
##  [9,] 34.86673 13.361853 1.369466e+00
## [10,] 34.86668 13.744016 3.821623e-01
## [11,] 34.86668 13.767125 2.310881e-02
## [12,] 34.86668 13.767203 7.797036e-05

REML

Lecture Slide is Here.
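
Idea behind the code below: REML works with error contrasts w = A'y, where A = I - X(X'X)^(-1)X' is the projection onto the orthogonal complement of the design matrix, so that the contrasts have mean zero and do not depend on the fixed-effect coefficients; the variance parameter is then estimated by maximising the multivariate normal likelihood of w. The last column of A is dropped in the calls so that only n - 1 linearly independent contrasts are used and their covariance matrix is non-singular.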

library(mvtnorm)

set.seed(165)
y <- matrix(
data = rnorm(n = 25, mean = 20, sd = 5),
ncol = 1)
x <- matrix(
data = rep(x = 1, times = 25),
ncol = 1)
A <- diag(x = length(y)) - x %*% solve(t(x) %*% x) %*% t(x)

#residual likelihood
reml_lik <- function(par, y, A){
w <- t(A) %*% y
mu <- rep(x = 0, times = length(w))
cmat <- t(A) %*% (par * diag(length(y))) %*% A
sum(dmvnorm(x = as.numeric(w),
mean = mu, sigma = cmat,log = TRUE))
}
reml_lik(par = 25, y = y, A = A[,-length(y)])
## [1] -72.02488
optimise(f = reml_lik,interval = c(0, 1e+100),
y = y, A = A[, -length(y)], maximum = TRUE)
## $maximum
## [1] 26.98599
## 
## $objective
## [1] -71.98891

1st Incourse

Here is the Question

Solution:

Ans to Q no. 1(b)

x <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/plik.txt")
mle <- function(data,alpha){
  
  -sum(log(dgamma(x = data,shape = alpha,scale = 1)))
  
}
optim(par = 10,fn = mle,data=x$V1,method = "Brent",upper = 100,lower = 10)
## $par
## [1] 37.30919
## 
## $value
## [1] 100.3793
## 
## $counts
## function gradient 
##       NA       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Ans to Q no. 1(c)

xp <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/plik.txt")
plik <- function(alpha,data){
  
  bit <- optim(par = 1,fn = function(alpha,beta,data){
    -sum(log(dgamma(x = data,shape = alpha,scale = beta)))},
    data = data,
    alpha = alpha,
    method = "Brent",
    lower = 0.1,
    upper = 10)
  
  opt_beta <- bit$par
  
  -sum(log(dgamma(x = data,shape = alpha,scale = opt_beta)))
  
}

optim(par = 35,
      fn = plik,
      data = xp$V1,
      method = "Brent",
      lower = 10,
      upper = 100)
## $par
## [1] 14.63149
## 
## $value
## [1] 92.36291
## 
## $counts
## function gradient 
##       NA       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Ans to Q no. 2(a)

xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
cig2 <- 10
mu0 <- 10
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result1 <- c()

while (eps>0.001) {
  
  l1 <- sum(dat1 - mu0) / cig2
  l2 <- -n / cig2
  mu1 <- mu0 - (l1/l2)
  eps <- abs(mu1 - mu0)
  result1 <- rbind(result1,c(mu1,eps))
  mu0 <- mu1
}

result1
##            [,1]         [,2]
## [1,] -0.3989145 1.039891e+01
## [2,] -0.3989145 8.881784e-16

Ans to Q no. 2(b)

xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
sig2 <- 5
m0 <- 5
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result2 <- c()

while (eps>0.001) {
  
  l1 <- -n/(2*sig2) + sum((dat1-m0)^2)/(2*sig2^2)
  l2 <- n/(2*sig2^2) - sum((dat1-m0)^2)/(sig2^3)
  sig2_1 <- sig2 - (l1/l2)
  eps <- abs(sig2_1 - sig2)
  result2 <- rbind(result2,c(sig2_1,eps))
  sig2 <- sig2_1
}

result2
##            [,1]         [,2]
##  [1,]  7.320477 2.3204769638
##  [2,] 10.582627 3.2621505234
##  [3,] 14.999636 4.4170089816
##  [4,] 20.612890 5.6132536269
##  [5,] 26.986327 6.3734364086
##  [6,] 32.836524 5.8501976854
##  [7,] 36.354872 3.5183481331
##  [8,] 37.266356 0.9114833252
##  [9,] 37.314350 0.0479939880
## [10,] 37.314474 0.0001239378

Ans to Q no. 2(c)

xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
sig2 <- 2.5
m0 <- 30
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result3 <- c()

while (eps>0.001) {
  
  g1 <- -sum(dat1-m0)/sig2
  g2 <- n/(2*sig2) - sum((dat1-m0)^2)/(2*sig2^2)
  g <- matrix(c(g1,g2),nrow = 2)
  
  h11 <- n/sig2
  h12 = h21 <- sum(dat1-m0)/(sig2^2)
  h22 <- -n/(2*sig2^2) + sum((dat1-m0)^2)/(sig2^3)
  H <- matrix(c(h11,h12,h21,h22),nrow = 2,byrow = T)
  
  th0 <- matrix(c(m0,sig2),nrow = 2)
  th1 <- th0 - solve(H) %*% g
  
  eps <- abs(max(th1-th0))
  
  result3 <- rbind(result3,c(th1[1,1],th1[2,1],eps))
  m0 <- th1[1,1]
  sig2 <- th1[2,1]
}

result3
##             [,1]          [,2]         [,3]
##  [1,]  7.2470712     3.1288042 0.6288041689
##  [2,]  1.4093942     3.8687799 0.7399757344
##  [3,] -0.5213124     3.6069162 0.2618637492
##  [4,] -0.2871857     0.3144069 0.2341266301
##  [5,] -1.0715507    -1.5784046 0.7843649524
##  [6,]  2.1830360     4.4803868 6.0587914138
##  [7,] -0.1254516     4.9549194 0.4745325562
##  [8,] -0.5434621     2.3358374 0.4180105072
##  [9,] -0.1759433    -1.2672961 0.3675187451
## [10,]  6.3915973   -39.8623766 6.5675405985
## [11,] -0.4954643   -39.2955995 0.5667770907
## [12,] -0.5195186   -88.3812501 0.0240542955
## [13,] -0.5327113  -186.4303727 0.0131926795
## [14,] -0.5396167  -382.4825512 0.0069053506
## [15,] -0.5431488  -774.5669393 0.0035321855
## [16,] -0.5449351 -1558.7264315 0.0017862571
## [17,] -0.5458333 -3127.0409417 0.0008982058

Ans to Q no. 03

reml <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/reml.txt", quote="\"", comment.char="")

library(mvtnorm)

y<-as.matrix(read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/reml.txt"),ncol=1)

x<-matrix(data = rep(1,times=25),ncol = 1)
A<-diag(x = length(y))- x%*%solve(t(x)%*%x)%*%t(x)

reml_lik<-function(par,y,A){
  w=t(A)%*%y
  mu=rep(x = 0,times=length(w))
  cmat<-t(A)%*%(par*diag(x = length(y)))%*%A
  sum(dmvnorm(x = as.numeric(w),mean = mu,sigma = cmat,log = T))
}

optimise(f = reml_lik,interval = c(0,1e+100),y=y,
         A=A[,-length(y)],maximum = TRUE)
## $maximum
## [1] 27.18862
## 
## $objective
## [1] -72.07866

Factor Analysis

Class

Multivariate Analysis (PCA)

Set the working directory with setwd("~/AST430/Lecture - Factor Analysis"), or do it manually via Session -> Set Working Directory -> Choose Directory.

library(readr)
library(tidyverse)
library(corrr)
library(ggcorrplot)


protein <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/protein.csv")
head (protein)
prot_num <- protein %>% select(-Country)

prot_normalized <- scale(prot_num)  # subtracting the mean and dividing by the standard deviation

cor_mat <- cor(prot_normalized)

#Specifically designed to visualize correlation matrices
ggcorrplot::ggcorrplot(cor_mat)

#Eigenvalues (variances explained by each principal component).
#Eigenvectors (also called "loadings" or contributions of variables to each principal component).
pca.data <- princomp(prot_num, cor = T)
pca.data
## Call:
## princomp(x = prot_num, cor = T)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7    Comp.8 
## 2.0016087 1.2786710 1.0620355 0.9770691 0.6810568 0.5702026 0.5211586 0.3410160 
##    Comp.9 
## 0.3148204 
## 
##  9  variables and  25 observations.
summary(pca.data)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     2.0016087 1.2786710 1.0620355 0.9770691 0.6810568
## Proportion of Variance 0.4451597 0.1816666 0.1253244 0.1060738 0.0515376
## Cumulative Proportion  0.4451597 0.6268263 0.7521507 0.8582245 0.9097621
##                            Comp.6     Comp.7     Comp.8     Comp.9
## Standard deviation     0.57020257 0.52115865 0.34101599 0.31482043
## Proportion of Variance 0.03612566 0.03017848 0.01292132 0.01101243
## Cumulative Proportion  0.94588776 0.97606624 0.98898757 1.00000000
pca.data$loadings[, 1:2]
##               Comp.1      Comp.2
## RedMeat    0.3026094  0.05625165
## WhiteMeat  0.3105562  0.23685334
## Eggs       0.4266785  0.03533576
## Milk       0.3777273  0.18458877
## Fish       0.1356499 -0.64681970
## Cereals   -0.4377434  0.23348508
## Starch     0.2972477 -0.35282564
## Nuts      -0.4203344 -0.14331056
## Fr&Veg    -0.1104199 -0.53619004
#screeplot:

screeplot(pca.data)

screeplot(pca.data, type = 'lines')

Fit a model with 3 factors using PCA & MLE and then rotate the matrix.

crmat <- matrix(c(1,0.505,0.569,0.602,0.621,0.603,
                  0.505,1,0.422,0.467,0.482,0.450,
                  0.569,0.422,1,0.926,0.877,0.878,
                  0.602,0.467,0.926,1,0.874,0.894,
                  0.621,0.482,0.877,0.874,1,0.937,
                  0.603,0.450,0.878,0.894,0.937,1),
                nrow=6,ncol=6,byrow=TRUE)
crmat
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
## [1,] 1.000 0.505 0.569 0.602 0.621 0.603
## [2,] 0.505 1.000 0.422 0.467 0.482 0.450
## [3,] 0.569 0.422 1.000 0.926 0.877 0.878
## [4,] 0.602 0.467 0.926 1.000 0.874 0.894
## [5,] 0.621 0.482 0.877 0.874 1.000 0.937
## [6,] 0.603 0.450 0.878 0.894 0.937 1.000
#n<-276

### PCA :

#Eigen Values
p <- 6 #total variables
m <- 3  #since 3 factors
lmda <- eigen(crmat)$values[1:m]
lmda
## [1] 4.4564485 0.7824099 0.4584251
#Eigen Vectors
eL <- eigen(crmat)$vectors[,1:m]
eL
##            [,1]       [,2]        [,3]
## [1,] -0.3507933  0.3956532  0.84666989
## [2,] -0.2862040  0.8146406 -0.50202982
## [3,] -0.4399784 -0.2632446 -0.11051604
## [4,] -0.4468851 -0.1972029 -0.09896293
## [5,] -0.4488806 -0.1614667 -0.06586761
## [6,] -0.4474933 -0.2134503 -0.06906640
# Loadings
L <- matrix(NA, nrow = p, ncol = m)

for(i in 1:m){
  L[,i] <- sqrt(lmda[i] ) * eL[,i]
}

# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.9994939 0.9998164 0.9225019 0.9248977 0.9203343 0.9302392
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.0005060766 0.0001835913 0.0774981111 0.0751022503 0.0796656638
## [6] 0.0697608285
# Proportion explained: 
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.74274142 0.13040165 0.07640418
# Cumulative proportion: 
cumsum(prop)
## [1] 0.7427414 0.8731431 0.9495472
# So about 95% of the total variance is explained by the three factors. 


### Rotation: 

varimax(L)
## $loadings
## 
## Loadings:
##      [,1]   [,2]   [,3]  
## [1,] -0.354  0.244  0.903
## [2,] -0.234  0.949  0.211
## [3,] -0.921  0.166  0.218
## [4,] -0.903  0.214  0.252
## [5,] -0.887  0.229  0.284
## [6,] -0.907  0.192  0.264
## 
##                 [,1]  [,2]  [,3]
## SS loadings    3.454 1.123 1.120
## Proportion Var 0.576 0.187 0.187
## Cumulative Var 0.576 0.763 0.950
## 
## $rotmat
##           [,1]       [,2]       [,3]
## [1,] 0.8546542 -0.3385584 -0.3936299
## [2,] 0.4824845  0.7979214  0.3612896
## [3,] 0.1917680 -0.4986980  0.8452960
### MLE Method: 

#factanal performs MLE factor analysis on Covariance matrix: 
fit <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "none")
fit
## 
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "none")
## 
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
## 
## Loadings:
##      Factor1 Factor2 Factor3
## [1,]  0.622   0.284   0.143 
## [2,]  0.484   0.660   0.121 
## [3,]  0.937                 
## [4,]  0.992          -0.105 
## [5,]  0.920           0.367 
## [6,]  0.926           0.232 
## 
##                Factor1 Factor2 Factor3
## SS loadings      4.187   0.520   0.236
## Proportion Var   0.698   0.087   0.039
## Cumulative Var   0.698   0.784   0.824
## 
## The degrees of freedom for the model is 0 and the fit was 8e-04
fit2 <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "varimax")
fit2
## 
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "varimax")
## 
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
## 
## Loadings:
##      Factor1 Factor2 Factor3
## [1,]  0.466   0.502   0.139 
## [2,]  0.208   0.798         
## [3,]  0.889   0.289         
## [4,]  0.935   0.345         
## [5,]  0.823   0.359   0.418 
## [6,]  0.851   0.323   0.288 
## 
##                Factor1 Factor2 Factor3
## SS loadings      3.326   1.324   0.293
## Proportion Var   0.554   0.221   0.049
## Cumulative Var   0.554   0.775   0.824
## 
## The degrees of freedom for the model is 0 and the fit was 8e-04

Assignment

Perform factor analysis on the three datasets protein.csv, pulp_paper.csv, and crime.csv using both the PCA and MLE methods; find the loadings, communalities, specific variances, and proportion of variance explained, and suggest names for the factors before and after rotation.

protein.csv data

protein <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/protein.csv")
## Rows: 25 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (9): RedMeat, WhiteMeat, Eggs, Milk, Fish, Cereals, Starch, Nuts, Fr&Veg
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(protein)
protein<-protein %>% select(-Country)
protein_stand<-scale(protein)
protein_cor<-cor(protein_stand)
ggcorrplot(protein_cor)

pca_data<-princomp(protein_cor) # note: princomp is applied to the correlation matrix itself here, unlike the earlier princomp(prot_num, cor = TRUE) call on the raw data
pca_data
## Call:
## princomp(x = protein_cor)
## 
## Standard deviations:
##       Comp.1       Comp.2       Comp.3       Comp.4       Comp.5       Comp.6 
## 1.278539e+00 5.187769e-01 3.715581e-01 2.854996e-01 1.517555e-01 1.000892e-01 
##       Comp.7       Comp.8       Comp.9 
## 8.461374e-02 3.671406e-02 7.958511e-09 
## 
##  9  variables and  9 observations.
summary(pca_data)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3    Comp.4     Comp.5
## Standard deviation     1.2785390 0.5187769 0.37155811 0.2854996 0.15175548
## Proportion of Variance 0.7550709 0.1243143 0.06376953 0.0376505 0.01063772
## Cumulative Proportion  0.7550709 0.8793852 0.94315474 0.9808052 0.99144296
##                             Comp.6      Comp.7       Comp.8       Comp.9
## Standard deviation     0.100089157 0.084613737 0.0367140593 7.958511e-09
## Proportion of Variance 0.004627366 0.003307056 0.0006226222 2.925657e-17
## Cumulative Proportion  0.996070322 0.999377378 1.0000000000 1.000000e+00
pca_data$loadings[, 1:2]
##               Comp.1      Comp.2
## RedMeat    0.2902937  0.09510150
## WhiteMeat  0.3131864  0.26392299
## Eggs       0.4131713  0.07419838
## Milk       0.3852342  0.15393922
## Fish       0.1165053 -0.69780771
## Cereals   -0.4342505  0.27531862
## Starch     0.2796041 -0.36379635
## Nuts      -0.4400339 -0.07686551
## Fr&Veg    -0.1567571 -0.43715631
#screeplot
screeplot(pca_data, type = "l")

pulp_paper.csv

pulp<-read.table("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/pulp_paper.dat")
pulp<-pulp[-1,]
pulp <- data.frame(lapply(pulp, as.numeric))
head(pulp)
pulp_stand<-scale(pulp)
pulp_cor<-cor(pulp_stand)
ggcorrplot(pulp_cor)

pca<-princomp(pulp_cor)
summary(pca)
## Importance of components:
##                           Comp.1     Comp.2      Comp.3       Comp.4
## Standard deviation     1.4030875 0.28721942 0.098671244 0.0283030687
## Proportion of Variance 0.9547186 0.04000677 0.004721577 0.0003884839
## Cumulative Proportion  0.9547186 0.99472541 0.999446987 0.9998354709
##                              Comp.5       Comp.6       Comp.7 Comp.8
## Standard deviation     0.0167774417 7.384949e-03 1.800878e-03      0
## Proportion of Variance 0.0001365078 2.644849e-05 1.572804e-06      0
## Cumulative Proportion  0.9999719787 9.999984e-01 1.000000e+00      1
pca$loadings[, 1:2]
##        Comp.1     Comp.2
## V1  0.3359621  0.3387672
## V2  0.3273338  0.4609865
## V3  0.3473831  0.2988179
## V4  0.3425073  0.2342509
## V5  0.3467051 -0.5252258
## V6  0.3543481 -0.3885918
## V7 -0.3843774  0.3184464
## V8  0.3853347 -0.0169917
screeplot(pca, type = "l")

crime.csv

crime <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/Crime.csv")
## New names:
## Rows: 50 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (4): Murder, Assault, UrbanPop, Rape
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(crime)
crime<-crime %>% select(-...1)
crime_stand<-scale(crime)
crime_cor<-cor(crime_stand)
ggcorrplot(crime_cor)

pca<-princomp(crime_cor)
summary(pca)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3 Comp.4
## Standard deviation     0.5564763 0.2037617 0.10523382      0
## Proportion of Variance 0.8548193 0.1146110 0.03056973      0
## Cumulative Proportion  0.8548193 0.9694303 1.00000000      1
pca$loadings[,1:2]
##              Comp.1      Comp.2
## Murder    0.6164996  0.23245381
## Assault   0.4578583 -0.04103407
## UrbanPop -0.6216128 -0.04109728
## Rape      0.1545688 -0.97087200
screeplot(pca, type = "l")

Exercise: 9.3

data_1<-matrix(c(1.00, .02, .96, .42, .01,
                 .02, 1.00, .13, .71, .85,
                 .96, .13, 1.00, .50, .11,
                 .42, .71, .50, 1.00, .79,
                 .01, .85, .11, .79, 1.00), nrow = 5, byrow = TRUE)
n<-276
p <- 5 #total variables
m <- 2  #2 eigenvalues are greater than unity 
lmda <- eigen(data_1)$values[1:m]
lmda
## [1] 2.853090 1.806332
#Eigen Vectors
eL <- eigen(data_1)$vectors[,1:m]
eL
##           [,1]        [,2]
## [1,] 0.3314539 -0.60721643
## [2,] 0.4601593  0.39003172
## [3,] 0.3820572 -0.55650828
## [4,] 0.5559769  0.07806457
## [5,] 0.4725608  0.40418799
# Loadings
L <- matrix(NA, nrow = p, ncol = m)

for(i in 1:m){
  L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
##           [,1]       [,2]
## [1,] 0.5598618 -0.8160981
## [2,] 0.7772594  0.5242021
## [3,] 0.6453364 -0.7479464
## [4,] 0.9391057  0.1049187
## [5,] 0.7982069  0.5432281
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.9794614 0.8789200 0.9758829 0.8929275 0.9322311
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.02053865 0.12107998 0.02411712 0.10707250 0.06776888
# Proportion explained: 
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.5706181 0.3612665
# Cumulative proportion: 
cumsum(prop)
## [1] 0.5706181 0.9318846
# So about 93% of the total variance is explained by the two factors. 


### Rotation: 

varimax(L)
## $loadings
## 
## Loadings:
##      [,1]   [,2]  
## [1,]        -0.989
## [2,]  0.937       
## [3,]  0.130 -0.979
## [4,]  0.843 -0.427
## [5,]  0.965       
## 
##                 [,1]  [,2]
## SS loadings    2.539 2.121
## Proportion Var 0.508 0.424
## Cumulative Var 0.508 0.932
## 
## $rotmat
##           [,1]       [,2]
## [1,] 0.8365697 -0.5478605
## [2,] 0.5478605  0.8365697
### MLE Method: 

#factanal performs MLE factor analysis on Covariance matrix: 
fit <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "none")
fit
## 
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "none")
## 
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
## 
## Loadings:
##      Factor1 Factor2
## [1,]  0.976  -0.139 
## [2,]  0.150   0.860 
## [3,]  0.979         
## [4,]  0.535   0.738 
## [5,]  0.146   0.963 
## 
##                Factor1 Factor2
## SS loadings      2.241   2.232
## Proportion Var   0.448   0.446
## Cumulative Var   0.448   0.895
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
fit1 <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "varimax")
fit1
## 
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "varimax")
## 
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
## 
## Loadings:
##      Factor1 Factor2
## [1,]          0.985 
## [2,]  0.873         
## [3,]  0.131   0.971 
## [4,]  0.817   0.405 
## [5,]  0.973         
## 
##                Factor1 Factor2
## SS loadings      2.396   2.078
## Proportion Var   0.479   0.416
## Cumulative Var   0.479   0.895
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119

Exercise: 9.4

data_2<-matrix(c(1.000, .632, .511, .115, .155,
                 .632, 1.000, .574, .322, .213,
                 .511, .574, 1.000, .183, .146,
                 .115, .322, .183, 1.000, .683,
                 .155, .213, .146, .683, 1.000), nrow = 5, byrow = TRUE)
n<-276
p <- 5 #total variables
m <- 2  #2 eigenvalues are greater than unity 
lmda <- eigen(data_2)$values
lmda
## [1] 2.4376148 1.4062612 0.5001857 0.4000328 0.2559055
#Eigen Vectors
eL <- eigen(data_2)$vectors[,1:m]
eL
##            [,1]       [,2]
## [1,] -0.4692143  0.3677864
## [2,] -0.5322254  0.2364426
## [3,] -0.4652234  0.3154110
## [4,] -0.3873348 -0.5850179
## [5,] -0.3607117 -0.6058861
# Loadings
L <- matrix(NA, nrow = p, ncol = m)

for(i in 1:m){
  L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
##            [,1]       [,2]
## [1,] -0.7325779  0.4361428
## [2,] -0.8309562  0.2803875
## [3,] -0.7263470  0.3740330
## [4,] -0.6047405 -0.6937486
## [5,] -0.5631743 -0.7184954
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.7268909 0.7691054 0.6674806 0.8469982 0.8334010
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.2731091 0.2308946 0.3325194 0.1530018 0.1665990
# Proportion explained: 
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.4875230 0.2812522
# Cumulative proportion: 
cumsum(prop)
## [1] 0.4875230 0.7687752
# So about 77% of the total variance is explained by the two factors. 


### Rotation: 

varimax(L)
## $loadings
## 
## Loadings:
##      [,1]   [,2]  
## [1,] -0.852       
## [2,] -0.849 -0.220
## [3,] -0.813       
## [4,] -0.126 -0.912
## [5,]        -0.910
## 
##                 [,1]  [,2]
## SS loadings    2.128 1.715
## Proportion Var 0.426 0.343
## Cumulative Var 0.426 0.769
## 
## $rotmat
##            [,1]      [,2]
## [1,]  0.8367920 0.5475208
## [2,] -0.5475208 0.8367920
### MLE Method: 

#factanal performs MLE factor analysis on Covariance matrix: 
fit <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "none") # data_1 is re-used here (and in fit1 below); Exercise 9.4 presumably intends data_2
fit
## 
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "none")
## 
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
## 
## Loadings:
##      Factor1 Factor2
## [1,]  0.976  -0.139 
## [2,]  0.150   0.860 
## [3,]  0.979         
## [4,]  0.535   0.738 
## [5,]  0.146   0.963 
## 
##                Factor1 Factor2
## SS loadings      2.241   2.232
## Proportion Var   0.448   0.446
## Cumulative Var   0.448   0.895
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
fit1 <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "varimax")
fit1
## 
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "varimax")
## 
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
## 
## Loadings:
##      Factor1 Factor2
## [1,]          0.985 
## [2,]  0.873         
## [3,]  0.131   0.971 
## [4,]  0.817   0.405 
## [5,]  0.973         
## 
##                Factor1 Factor2
## SS loadings      2.396   2.078
## Proportion Var   0.479   0.416
## Cumulative Var   0.479   0.895
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119

Exercise: 9.6

data_3<-matrix(c(1.000, .6386, .4752, .3227, .5520, .3262, .3509, .4008, .1821, -.0352,
                 .6386, 1.000, .4953, .5668, .4706, .3520, .3998, .5167, .3102, .1012,  
                 .4752, .4953, 1.000, .4357, .2539, .2812, .7926, .4728, .4682, -.0120,
                 .3227, .5668, .4357, 1.000, .3449, .3503, .3657, .6040, .2344, .2380, 
                 .5520, .4706, .2539, .3449, 1.000, .1546, .2100, .4213, .2116, .4125,
                 .3262, .3520, .2812, .3503, .1546, 1.000, .2553, .4163, .1712, .0002,
                 .3509, .3998, .7926, .3657, .2100, .2553, 1.000, .4036, .4179, .0109,  
                 .4008, .5167, .4728, .6040, .4213, .4163, .4036, 1.000, .3151, .2395,
                 .1821, .3102, .4682, .2344, .2116, .1712, .4179, .3151, 1.000,.0983,
                 -.0352, .1012, -.0120, .2380, .4125, .0002, .0109, .2395, .0983,1.000),
               nrow = 10, byrow = TRUE)
n<-280
p <- 10 #total variables
m <- 4  # three eigenvalues exceed unity; m = 4 factors are fitted here
lmda <- eigen(data_3)$values[1:m]
lmda
## [1] 4.2129434 1.3895855 1.0594001 0.9177767
#Eigen Vectors
eL <- eigen(data_3)$vectors[,1:m]
eL
##             [,1]        [,2]         [,3]        [,4]
##  [1,] -0.3391615 -0.01874585  0.455010859  0.43461912
##  [2,] -0.3861285 -0.06376924  0.247452683  0.11964658
##  [3,] -0.3756562  0.36853015 -0.191728996  0.11708682
##  [4,] -0.3462877 -0.15328490 -0.004441842 -0.38355709
##  [5,] -0.2946590 -0.46544520  0.043795306  0.41438248
##  [6,] -0.2497538  0.07016977  0.361172572 -0.58568225
##  [7,] -0.3360646  0.38720230 -0.280390394  0.08112517
##  [8,] -0.3707459 -0.13775864 -0.017738766 -0.31756023
##  [9,] -0.2525672  0.21345999 -0.504151016  0.07679286
## [10,] -0.1070443 -0.63264530 -0.479048235 -0.08891610
# Loadings
L <- matrix(NA, nrow = p, ncol = m)

for(i in 1:m){
  L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
##             [,1]        [,2]         [,3]        [,4]
##  [1,] -0.6961444 -0.02209774  0.468329773  0.41636799
##  [2,] -0.7925464 -0.07517162  0.254696029  0.11462221
##  [3,] -0.7710515  0.43442586 -0.197341218  0.11216995
##  [4,] -0.7107713 -0.18069329 -0.004571862 -0.36745024
##  [5,] -0.6048010 -0.54866998  0.045077267  0.39698116
##  [6,] -0.5126311  0.08271661  0.371744685 -0.56108748
##  [7,] -0.6897880  0.45643672 -0.288597881  0.07771845
##  [8,] -0.7609729 -0.16239082 -0.018258009 -0.30422480
##  [9,] -0.5184056  0.25162810 -0.518908343  0.07356806
## [10,] -0.2197133 -0.74576659 -0.493070762 -0.08518221
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
##  [1] 0.8778004 0.7117888 0.8347719 0.6728865 0.8264490 0.7226459 0.7734708
##  [8] 0.6983366 0.6067392 0.8548165
#Specific Variance
sv <- 1 - cm
sv
##  [1] 0.1221996 0.2882112 0.1652281 0.3271135 0.1735510 0.2773541 0.2265292
##  [8] 0.3016634 0.3932608 0.1451835
# Proportion explained: 
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.42129434 0.13895855 0.10594001 0.09177767
# Cumulative proportion: 
cumsum(prop)
## [1] 0.4212943 0.5602529 0.6661929 0.7579706
# So about 76% of the total variance is explained by the four factors. 


### Rotation: 

varimax(L)
## $loadings
## 
## Loadings:
##       [,1]   [,2]   [,3]   [,4]  
##  [1,] -0.885  0.139 -0.182 -0.205
##  [2,] -0.664        -0.291 -0.429
##  [3,] -0.302        -0.819 -0.252
##  [4,] -0.221 -0.293 -0.267 -0.683
##  [5,] -0.746 -0.507              
##  [6,] -0.108  0.161        -0.826
##  [7,] -0.185        -0.832 -0.204
##  [8,] -0.277 -0.293 -0.324 -0.656
##  [9,]        -0.188 -0.754       
## [10,]        -0.921              
## 
##                 [,1]  [,2]  [,3]  [,4]
## SS loadings    2.045 1.376 2.234 1.924
## Proportion Var 0.205 0.138 0.223 0.192
## Cumulative Var 0.205 0.342 0.566 0.758
## 
## $rotmat
##            [,1]       [,2]       [,3]       [,4]
## [1,]  0.5683672 0.18381686  0.5850636  0.5485169
## [2,]  0.2499160 0.79875769 -0.5445967  0.0542454
## [3,] -0.4876786 0.57138291  0.5835154 -0.3085465
## [4,] -0.6137348 0.04146187 -0.1436133  0.7752327
### MLE Method: 

#factanal performs MLE factor analysis on Covariance matrix: 
fit <- factanal(covmat = data_3, n.obs = n, factors = m, rotation = "none")
fit
## 
## Call:
## factanal(factors = m, covmat = data_3, n.obs = n, rotation = "none")
## 
## Uniquenesses:
##  [1] 0.010 0.388 0.089 0.327 0.196 0.734 0.304 0.420 0.725 0.600
## 
## Loadings:
##       Factor1 Factor2 Factor3 Factor4
##  [1,]  0.993                         
##  [2,]  0.665   0.252   0.239   0.220 
##  [3,]  0.530   0.777  -0.141         
##  [4,]  0.363   0.428   0.421   0.425 
##  [5,]  0.571           0.620  -0.304 
##  [6,]  0.343   0.190           0.323 
##  [7,]  0.401   0.718  -0.102         
##  [8,]  0.439   0.407   0.390   0.263 
##  [9,]  0.218   0.461                 
## [10,]                  0.609  -0.145 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.686   1.794   1.187   0.539
## Proportion Var   0.269   0.179   0.119   0.054
## Cumulative Var   0.269   0.448   0.567   0.621
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 15.74 on 11 degrees of freedom.
## The p-value is 0.151
fit1 <- factanal(covmat = data_3, n.obs = n, factors = m, rotation = "varimax")
fit1
## 
## Call:
## factanal(factors = m, covmat = data_3, n.obs = n, rotation = "varimax")
## 
## Uniquenesses:
##  [1] 0.010 0.388 0.089 0.327 0.196 0.734 0.304 0.420 0.725 0.600
## 
## Loadings:
##       Factor1 Factor2 Factor3 Factor4
##  [1,]  0.205   0.296   0.928         
##  [2,]  0.280   0.554   0.451   0.155 
##  [3,]  0.883   0.278   0.228         
##  [4,]  0.254   0.739           0.242 
##  [5,]  0.142   0.151   0.519   0.701 
##  [6,]  0.136   0.465   0.173         
##  [7,]  0.794   0.220   0.133         
##  [8,]  0.314   0.612   0.168   0.279 
##  [9,]  0.477   0.160           0.139 
## [10,]          0.111           0.619 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      1.958   1.719   1.471   1.057
## Proportion Var   0.196   0.172   0.147   0.106
## Cumulative Var   0.196   0.368   0.515   0.621
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 15.74 on 11 degrees of freedom.
## The p-value is 0.151

Here is another pdf of the Assignment.

Bayesian Inference

The Slide from sir.

Class

Canonical Correlation

The class Lecture is Here.

Discrimination Function and Classification Rule

The Slide is given…

Example 01

Equal Variance
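
With equal covariance matrices the sample linear discriminant is based on a = t(xb1 - xb2) %*% Sp_inv, and a new observation x0 is allocated to population 1 if a %*% x0 is at least the midpoint m = (1/2) * t(xb1 - xb2) %*% Sp_inv %*% (xb1 + xb2), and to population 2 otherwise; the code below computes a, m, and the score for the given x0.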

xb1 <- matrix(c(-0.0065,-0.0390), ncol=1)
xb2 <- matrix(c(-0.2483,0.0262), ncol=1)
Sp_inv <- matrix(c(131.158,-90.423,-90.423,108.147), nrow = 2, byrow = T)

#Discriminant Function
t(xb1- xb2) %*% Sp_inv
##          [,1]      [,2]
## [1,] 37.60958 -28.91547
#Allocation Rule
(1/2) * t(xb1 - xb2) %*% Sp_inv %*% (xb1 + xb2)
##           [,1]
## [1,] -4.606402
#Allocate a woman for whom x1 = −0.210 and x2 = −0.044.
x0<-matrix(c(-.210, -.044), ncol = 1)
t(xb1- xb2) %*% Sp_inv %*% x0
##           [,1]
## [1,] -6.625732
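
Since the score for this observation, about −6.63, is below the cutpoint of about −4.61, the rule above allocates her to the second population.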

Example 02

library(readr)
library(tidyverse)

rmowners <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination/rmowners.csv")
#View(rmowners)

dat <- rmowners %>%
  group_split(Owner)
dat
## <list_of<
##   tbl_df<
##     Income  : double
##     Lot_Size: double
##     Owner   : character
##   >
## >[2]>
## [[1]]
## # A tibble: 12 × 3
##    Income Lot_Size Owner
##     <dbl>    <dbl> <chr>
##  1  105       19.6 No   
##  2   82.8     20.8 No   
##  3   94.8     17.2 No   
##  4   73.2     20.4 No   
##  5  114       17.6 No   
##  6   79.2     17.6 No   
##  7   89.4     16   No   
##  8   96       18.4 No   
##  9   77.4     16.4 No   
## 10   63       18.8 No   
## 11   81       14   No   
## 12   93       14.8 No   
## 
## [[2]]
## # A tibble: 12 × 3
##    Income Lot_Size Owner
##     <dbl>    <dbl> <chr>
##  1   90       18.4 Yes  
##  2  116.      16.8 Yes  
##  3   94.8     21.6 Yes  
##  4   91.5     20.8 Yes  
##  5  117       23.6 Yes  
##  6  140.      19.2 Yes  
##  7  138       17.6 Yes  
##  8  113.      22.4 Yes  
##  9   99       20   Yes  
## 10  123       20.8 Yes  
## 11   81       22   Yes  
## 12  111       20   Yes
#Creating two datasets, owner and non-owner:
dat_no<- dat[[1]] %>% 
  select(-Owner)
dat_yes<- dat[[2]] %>% 
  select(-Owner)

#Calculating mean and variance
x1_bar <- as.numeric(apply(dat_yes, 2, mean))
x2_bar <- as.numeric(apply(dat_no, 2, mean))

s_1 <- matrix(as.numeric(var(dat_yes)), nrow = 2, byrow = T)
s_2 <- matrix(as.numeric(var(dat_no)), nrow = 2, byrow = T)

n1 <- nrow(dat_yes)
n2 <- nrow(dat_no)

# Pooled variance
spinv <- solve( ( (n1-1) * s_1 + (n2-1) * s_2 ) / (n1 + n2 -2) )
a <- t(x1_bar - x2_bar) %*% spinv
a
##           [,1]      [,2]
## [1,] 0.1002303 0.7851847
m<- 0.5 * a %*% (x1_bar + x2_bar)
m
##          [,1]
## [1,] 24.74567
#(ii) Classify a person with income 150 USD and a lot size of 75 to one of the populations.
x1<-matrix(c(150, 75), ncol = 1)
a %*% x1
##         [,1]
## [1,] 73.9234
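
Since the discriminant score, about 73.92, exceeds the cutpoint m = 24.75, this person is classified into the owner group.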

Example 03

Classifying Normal population with unequal variances
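
With unequal covariance matrices the boundary is quadratic: x0 is allocated to population 1 (assuming equal priors and misclassification costs) when

-0.5 * t(x0) %*% (solve(s_1) - solve(s_2)) %*% x0 + (t(x1_bar) %*% solve(s_1) - t(x2_bar) %*% solve(s_2)) %*% x0 - k >= 0,

where k = 0.5 * log(det(s_1)/det(s_2)) + 0.5 * (t(x1_bar) %*% solve(s_1) %*% x1_bar - t(x2_bar) %*% solve(s_2) %*% x2_bar). The code below evaluates k, the quadratic coefficient matrix, and the linear coefficient vector separately.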

#calculating k
k <- 0.5 * log ( det(s_1)/ det(s_2) ) +
  0.5 *( (t(x1_bar) %*% solve(s_1) %*% x1_bar) -
           (t(x2_bar) %*% solve(s_2) %*% x2_bar) )
k
##          [,1]
## [1,] 36.14149
-0.5 * (solve(s_1) - solve(s_2))
##               [,1]         [,2]
## [1,]  0.0009397948 -0.003089851
## [2,] -0.0030898514 -0.022789795
t(x1_bar) %*% solve(s_1) - t(x2_bar) %*% solve(s_2)
##           [,1]     [,2]
## [1,] 0.0379567 2.258857
#(ii)Classify a person with income 150 USD and a lot size 75 to one of the popn.
#x2<-matrix(c(50, 75), ncol = 1)
#a %*% x2

For iris data

library(ggordiplots)
library(psych)

data("iris")
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
pairs.panels(iris[1:4],
             gap = 0,
             bg = c("red", "green", "blue")[iris$Species],
             pch = 21)

library(tidyverse)
set.seed(123)
head(iris)
iris %>% count(Species)

Dividing the dataset into training and test data:

# First create an indicator 

ind <- sample(x = c(1, 2),
              size = nrow(iris),
              replace = TRUE,
              prob = c(0.6,0.4))

ind
##   [1] 1 2 1 2 2 1 1 2 1 1 2 1 2 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 1 1 2
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 1 1 1 2 1 2 2 2 1 2 2 2 1
##  [75] 1 1 1 2 1 1 1 2 1 2 1 1 2 2 2 1 1 2 1 2 1 1 2 1 1 1 1 1 1 2 1 2 2 2 1 1 2
## [112] 1 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 1 1 1 2 1 1 1
## [149] 1 2
training <- iris[ind==1,]
testing <- iris[ind==2,]

#Linear Discrimination analysis
library(MASS)
linear <- lda(Species~., training)
linear
## Call:
## lda(Species ~ ., data = training)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3370787  0.3370787  0.3258427 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa         4.946667    3.380000     1.443333    0.250000
## versicolor     5.943333    2.803333     4.240000    1.316667
## virginica      6.527586    2.920690     5.489655    2.048276
## 
## Coefficients of linear discriminants:
##                     LD1         LD2
## Sepal.Length  0.3629008 -0.05215114
## Sepal.Width   2.2276982 -1.47580354
## Petal.Length -1.7854533  1.60918547
## Petal.Width  -3.9745504 -4.10534268
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9932 0.0068
#confusion matrix
p1 <- predict(linear, training)$class

tab <- table(Predicted = p1, Actual = training$Species)
tab
##             Actual
## Predicted    setosa versicolor virginica
##   setosa         30          0         0
##   versicolor      0         30         0
##   virginica       0          0        29
p2 <- predict(linear, testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)
tab1
##             Actual
## Predicted    setosa versicolor virginica
##   setosa         20          0         0
##   versicolor      0         19         1
##   virginica       0          1        20
#Error rate
err_rate_lda <- (1- sum(diag(tab1))/sum(tab1))*100
err_rate_lda
## [1] 3.278689
#Quadratic discriminant analysis
quad <- qda(Species~., training)
p3 <- predict(quad, testing)$class

tab2 <- table(Predicted = p3, Actual = testing$Species)
tab2
##             Actual
## Predicted    setosa versicolor virginica
##   setosa         20          0         0
##   versicolor      0         16         2
##   virginica       0          4        19
#Error rate qda
err_rate_qda <- (1- sum(diag(tab2))/sum(tab2))*100
err_rate_qda
## [1] 9.836066
# Hold Out Procedure:
cnt=0

for(i in 1:nrow(iris)){
  training <- iris[-i,]
  testing <- iris[i,]
  linear <- lda(Species~., training)
  
  #confusion matrix
  p1 <- predict(linear, testing)$class
  
  if(p1 == testing$Species) cnt=cnt+1
}
 
#Error rate:

(1-cnt/nrow(iris))*100
## [1] 2
## Assignment

# Perform this hold-out procedure by taking observations 1 to 150 and plot the difference.

Logistic Regression

data1 <- iris %>% filter(Species != "setosa")
data1$Species <- droplevels(data1$Species)

table(data1$Species)
## 
## versicolor  virginica 
##         50         50
ind <- sample(x = 2,
              size = nrow(data1),
              replace = TRUE,
              prob = c(0.6,0.4))


training <-data1[ind==1,]
testing <-data1[ind==2,]


linear <- lda(Species~., training)
linear
## Call:
## lda(Species ~ ., data = training)
## 
## Prior probabilities of groups:
## versicolor  virginica 
##   0.483871   0.516129 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## versicolor     6.030000    2.783333     4.323333    1.346667
## virginica      6.678125    2.981250     5.665625    2.009375
## 
## Coefficients of linear discriminants:
##                    LD1
## Sepal.Length -1.469118
## Sepal.Width  -1.159612
## Petal.Length  2.365995
## Petal.Width   2.849550
#confusion matrix
p1 <- predict(linear, training)$class

p2 <- predict(linear, testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)

Logistic model, for comparison

model_logistic = glm(Species~., data=training ,family=binomial)

logistic_probs = data.frame(probs = predict(model_logistic, testing, type="response"))

predictions_logistic = logistic_probs %>%
  mutate(class = ifelse(probs>.5, "virginica", "versicolor"))

table(testing$Species, predictions_logistic$class)
##             
##              versicolor virginica
##   versicolor         19         1
##   virginica           0        18

For rmowner data

library(tidyverse)
library(MASS)
set.seed(47)
data <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination/rmowners.csv")
data$Owner<-as.factor(data$Owner)
ind<-sample(x=2, size = nrow(data), replace = TRUE, prob = c(0.6, 0.4))
train<-data[ind==1, ]
test<-data[ind==2, ]
linear<-lda(Owner~., data = train)
p<-predict(linear, test)$class
table(Predicted=p, Actual=test$Owner)
##          Actual
## Predicted No Yes
##       No   1   2
##       Yes  0   4
#logistic
model<-glm(Owner~., data = train, family = binomial)
pred<-data.frame(prob=predict(model, test, type = "response"))
prob_log<-pred %>% 
  mutate(class=ifelse(prob>.5, "Yes", "No"))
table(Predicted=prob_log$class, Actual=test$Owner)
##          Actual
## Predicted No Yes
##       No   1   1
##       Yes  0   5

From the combined 3 datasets

setwd("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination")
load("assignment_classification (1).Rdata")

library(tidyverse)

dat<- rmowners1 %>% 
  group_split(Owner)
dat
## <list_of<
##   tbl_df<
##     Income  : double
##     Lot.size: double
##     Owner   : character
##   >
## >[2]>
## [[1]]
## # A tibble: 12 × 3
##    Income Lot.size Owner
##     <dbl>    <dbl> <chr>
##  1  139.      19.6 No   
##  2  109.      20.8 No   
##  3   91.8     17.2 No   
##  4  108.      20.4 No   
##  5   95.6     17.6 No   
##  6   89.2     17.6 No   
##  7  113.      16   No   
##  8   92.9     18.4 No   
##  9  110.      16.4 No   
## 10   88.2     18.8 No   
## 11   95.3     14   No   
## 12   83.4     14.8 No   
## 
## [[2]]
## # A tibble: 12 × 3
##    Income Lot.size Owner
##     <dbl>    <dbl> <chr>
##  1  139.      18.4 Yes  
##  2  109.      16.8 Yes  
##  3   91.8     21.6 Yes  
##  4  108.      20.8 Yes  
##  5   95.6     23.6 Yes  
##  6   89.2     19.2 Yes  
##  7  113.      17.6 Yes  
##  8   92.9     22.4 Yes  
##  9  110.      20   Yes  
## 10   88.2     20.8 Yes  
## 11   95.3     22   Yes  
## 12   83.4     20   Yes
dat_own <- dat[[1]]
no<-dat_own[, 1:2]
no
dat_nown <- dat[[2]]
yes<-dat_nown[, 1:2]
yes
x1_bar<-as.numeric(apply(no, 2, mean))
x2_bar<-as.numeric(apply(yes, 2, mean))

s1<-matrix(as.numeric(var(no)), nrow = 2, byrow = TRUE)
s2<-matrix(as.numeric(var(yes)), nrow = 2, byrow = TRUE)

n1<-nrow(no)
n2<-nrow(yes)

sp<- solve((((n1-1)*s1)+((n2-1)*s2))/(n1+n2-2))
sp
##             [,1]        [,2]
## [1,] 0.004252256 0.001586455
## [2,] 0.001586455 0.234601243
a<- t(x1_bar-x2_bar) %*% sp
a
##              [,1]       [,2]
## [1,] -0.004177664 -0.6177833
D<- 0.5 * a %*% (x1_bar + x2_bar)
D
##           [,1]
## [1,] -12.13021
table(as.numeric(a %*% t(dat1)) > as.numeric(D)) # dat1 and dat2 are presumably the other two datasets loaded from assignment_classification (1).Rdata
## 
## FALSE  TRUE 
##    49     1
table(as.numeric(a %*% t(dat2)) > as.numeric(D))
## 
## FALSE  TRUE 
##    26    24

2nd Incourse

The solution pdf is Here.

Required Packages

library(tidyverse)
library(knitr)
library(readr)
library(MASS)
library(psych)
library(CCA)
library(CCP)

Ans to the ques no 1

## 1 (a)

library(readr)
dat <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/Incourse/gardenerdata1.csv")

dat1<- dat %>% group_split(gardener)


no<- dat1[[1]]


yes<- dat1[[2]]


yess<- yes[,-3]

noo<- no[,-3]



x1_bar <- as.numeric(apply(yess,2,mean))
x2_bar <- as.numeric(apply(noo,2,mean))



s1<- matrix(as.numeric(var(yess)),nrow=2,byrow=T)

s2<- matrix(as.numeric(var(noo)),nrow=2,byrow=T)



k<- 0.5 * log(det(s1) / det(s2)) + 0.5 * ((t(x1_bar) %*% solve(s1) %*% x1_bar)- 
                                            (t(x2_bar) %*% solve(s2) %*% x2_bar))
## 1st part

- 0.5 * (solve(s1) - solve(s2))
##               [,1]         [,2]
## [1,]  0.0009397948 -0.003089851
## [2,] -0.0030898514 -0.022789795
##2nd part

t(x1_bar) %*% solve(s1) - t(x2_bar) %*% solve(s2)
##           [,1]     [,2]
## [1,] 0.0379567 2.258857
## 1(b)

x1<- dat$Income
x2<- dat$Garden_Size

## classification rule (the first quadratic coefficient below, 0.009, appears to be a typo for 0.0009 given the matrix above)


0.009 * x1^2 - 0.023 * x2^2 -0.006 * x1*x2 + 0.037 * x1 + 2.25 * x2 > 36.14
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
table(0.009 * x1^2 - 0.023 * x2^2 -0.006 * x1*x2 + 0.037 * x1 + 2.25 * x2 > 36.14)
## 
## TRUE 
##   24

Ans to the ques no 2

##a 

library(MASS)
data("Pima.tr")
b<- Pima.tr

ind<- sample(2,size = nrow(b),replace = T,prob = c(0.8,0.2))
table(ind)
## ind
##   1   2 
## 150  50
training<- b[ind == 1,]
testing<- b[ind == 2,]

head(training)
head(testing)
##b


linear <- lda(type~.,data=training)
linear
## Call:
## lda(type ~ ., data = training)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.7066667 0.2933333 
## 
## Group means:
##        npreg      glu       bp     skin      bmi       ped      age
## No  2.830189 113.1604 69.34906 27.62264 31.20000 0.4260566 29.57547
## Yes 4.909091 142.8182 76.00000 31.70455 34.53182 0.4981136 38.79545
## 
## Coefficients of linear discriminants:
##                LD1
## npreg  0.106500509
## glu    0.023126807
## bp     0.008133495
## skin  -0.016139760
## bmi    0.073329163
## ped    1.229751268
## age    0.029787738
p1<- predict(object = linear,testing)$class


tab<- table(actual=testing$type, predicted= p1)

err<- (1-sum(diag(tab))/sum(tab))*100
err
## [1] 26
##c

quad <- qda(type~.,data=training)


p2<- predict(object = quad,testing)$class


tab2<- table(actual=testing$type, predicted= p2)
tab2
##       predicted
## actual No Yes
##    No  23   3
##    Yes 11  13
error<- (1-sum(diag(tab2))/sum(tab2))*100
error
## [1] 28
## d
model_logistic <- glm(type~.,family = "binomial",data = training )
logistic_prob<- data.frame(prob=predict(model_logistic,newdata = testing,type = "response"))
predicted<-logistic_prob %>% 
  mutate(class= if_else(prob>0.5,"Yes","No"))

tab3<- table(actual=testing$type, predicted= predicted$class)
errorr<- (1-sum(diag(tab3))/sum(tab3))*100
errorr
## [1] 26

Ans to the ques no 3

## a

mm<- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/Incourse/mmreg.csv")
psy<- mm[1:3]
aca<- mm[4:8]
cc1<- cc(psy,aca)
cc1$cor
## [1] 0.4640861 0.1675092 0.1039911
##b

cc1$xcoef
##                        [,1]       [,2]       [,3]
## locus_of_control -1.2538339 -0.6214776 -0.6616896
## self_concept      0.3513499 -1.1876866  0.8267210
## motivation       -1.2624204  2.0272641  2.0002283
cc1$ycoef
##                 [,1]         [,2]         [,3]
## read    -0.044620600 -0.004910024  0.021380576
## write   -0.035877112  0.042071478  0.091307329
## math    -0.023417185  0.004229478  0.009398182
## science -0.005025152 -0.085162184 -0.109835014
## female  -0.632119234  1.084642326 -1.794647036
##c

a<-comput(X = psy, Y = aca, cc(psy,aca) )
a[3:6]
## $corr.X.xscores
##                         [,1]       [,2]       [,3]
## locus_of_control -0.90404631 -0.3896883 -0.1756227
## self_concept     -0.02084327 -0.7087386  0.7051632
## motivation       -0.56715106  0.3508882  0.7451289
## 
## $corr.Y.xscores
##               [,1]        [,2]        [,3]
## read    -0.3900402 -0.06010654  0.01407661
## write   -0.4067914  0.01086075  0.02647207
## math    -0.3545378 -0.04990916  0.01536585
## science -0.3055607 -0.11336980 -0.02395489
## female  -0.1689796  0.12645737 -0.05650916
## 
## $corr.X.yscores
##                          [,1]        [,2]        [,3]
## locus_of_control -0.419555307 -0.06527635 -0.01826320
## self_concept     -0.009673069 -0.11872021  0.07333073
## motivation       -0.263206910  0.05877699  0.07748681
## 
## $corr.Y.yscores
##               [,1]        [,2]       [,3]
## read    -0.8404480 -0.35882541  0.1353635
## write   -0.8765429  0.06483674  0.2545608
## math    -0.7639483 -0.29794884  0.1477611
## science -0.6584139 -0.67679761 -0.2303551
## female  -0.3641127  0.75492811 -0.5434036
##d 

rho<- cc1$cor
rho
## [1] 0.4640861 0.1675092 0.1039911
nrow(mm)
## [1] 600
library(CCP)

p.asym(rho = rho,N = 600,p = 3,q = 5,tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
##               stat    approx df1      df2     p.value
## 1 to 3:  0.7543611 11.715733  15 1634.653 0.000000000
## 2 to 3:  0.9614300  2.944459   8 1186.000 0.002905057
## 3 to 3:  0.9891858  2.164612   3  594.000 0.091092180

Clustering method

Related pdf

library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization


df <- USArrests
df <- na.omit(df)
df <- scale(df)
head(df)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207
# Dissimilarity matrix
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )


# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

#abline(h = 2)



# Agglomerative coefficient
coef.hclust(hc1)
## [1] 0.8531583
# Compute with agnes: Agglomerative Nesting
hc2 <- agnes(df, method = "complete")

# Agglomerative coefficient
hc2$ac
## [1] 0.8531583
# DIvisive ANAlysis (diana) clustering

# compute divisive hierarchical clustering
hc4 <- diana(df)


# Divisive coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8514345
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")

# Ward's method
hc5 <- hclust(d, method = "ward.D2" )

# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 4)

# Number of members in each cluster
table(sub_grp)
## sub_grp
##  1  2  3  4 
##  7 12 19 12
plot(hc5, cex = 0.6)
rect.hclust(hc5, k = 4, border = 2:5)

fviz_cluster(list(data = df, cluster = sub_grp))

####### K means clustering

k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
##  $ cluster     : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:2] 46.7 56.1
##  $ tot.withinss: num 103
##  $ betweenss   : num 93.1
##  $ size        : int [1:2] 20 30
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
k2
## K-means clustering with 2 clusters of sizes 20, 30
## 
## Cluster means:
##      Murder    Assault   UrbanPop       Rape
## 1  1.004934  1.0138274  0.1975853  0.8469650
## 2 -0.669956 -0.6758849 -0.1317235 -0.5646433
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              2              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              1              2              2              1              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              2              2              1              2              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              2              2              1              2              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              2              1              2              1              1 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              1              2              2 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              2              2 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              2              2              2              2              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              1              2              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              2              2              2              2              2 
## 
## Within cluster sum of squares by cluster:
## [1] 46.74796 56.11445
##  (between_SS / total_SS =  47.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
fviz_cluster(k2, data = df)

df %>%
  as_tibble() %>%
  mutate(cluster = k2$cluster,
         state = row.names(USArrests)) %>%
  ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
  geom_text()

k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

MCMC

The Slide is Here.