The Class Lecture is here.
library(tidyverse)
library(dplyr)
library(tinytex)
library(knitr)
library(ggplot2)
Using the optim function to find the minimum of a quadratic function, f(x) = (x - 3)^2:
# Define the function we want to minimize
f <- function(x) {
(x - 3)^2
}
# Use optim to find the minimum
result <- optim(par = 0, # Initial guess for x
fn = f) # Function to minimize
# Print the result
result$par # This gives the value of x that minimizes f(x)
## [1] 2.9
Maximizing the log-likelihood (our goal) becomes equivalent to minimizing the negative log-likelihood:
-sum(dnorm(data, mu, sig, log=TRUE))
Define the plik function, which profiles out sigma for a given mu:
plik <- function(mu, data){
sig <- optim(par = 1,#The initial value for sigma is set at 1
fn = function(sigma, data){
-sum(log(dnorm(data, mu, sigma)))},
data = data)$par # if passing data = data causes an error, try removing it; the inner function can then pick up data from the enclosing environment
-sum(dnorm(data, mu, sig, log = TRUE))
}
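As a sanity check, the inner optimization has a closed form for the normal model: for fixed mu, the likelihood is maximized at sigma = sqrt(mean((data - mu)^2)). A minimal sketch (plik_closed is a name introduced here):
# Closed-form profile: no inner optim call is needed, since for fixed mu
# the MLE of sigma is sqrt(mean((data - mu)^2)).
plik_closed <- function(mu, data){
sig <- sqrt(mean((data - mu)^2))
-sum(dnorm(data, mu, sig, log = TRUE))
}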
Generate Random Data:
set.seed(47)
x <- round(rnorm(n = 50, mean = 35, sd = 4),2)
Call optim to find the value of mu that maximizes the likelihood for the data in x, starting the optimization from an initial guess of 15.
optim(15, plik, data = x)
## $par
## [1] 34.44287
##
## $value
## [1] 134.3046
##
## $counts
## function gradient
## 32 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
Next, evaluate the profile likelihood across a range of candidate mu values.
m <- seq(15, 50, length.out = 250)
Calculate the Profile Likelihood for Each mu:
mplik <- numeric(250) # preallocate
for(i in 1:250){
mplik[i] = -plik(m[i], x)
}
mplik
## [1] -220.1395 -219.7885 -219.4351 -219.0795 -218.7209 -218.3602 -217.9968
## [8] -217.6309 -217.2625 -216.8914 -216.5176 -216.1411 -215.7619 -215.3799
## [15] -214.9951 -214.6074 -214.2168 -213.8233 -213.4268 -213.0272 -212.6246
## [22] -212.2188 -211.8099 -211.3978 -210.9825 -210.5638 -210.1417 -209.7163
## [29] -209.2874 -208.8550 -208.4190 -207.9794 -207.5361 -207.0891 -206.6382
## [36] -206.1840 -205.7249 -205.2624 -204.7956 -204.3248 -203.8499 -203.3707
## [43] -202.8872 -202.3993 -201.9070 -201.4100 -200.9084 -200.4021 -199.8911
## [50] -199.3752 -198.8545 -198.3287 -197.7976 -197.2614 -196.7199 -196.1731
## [57] -195.6207 -195.0628 -194.4992 -193.9297 -193.3545 -192.7732 -192.1858
## [64] -191.5922 -190.9923 -190.3859 -189.7731 -189.1535 -188.5271 -187.8939
## [71] -187.2536 -186.6062 -185.9517 -185.2893 -184.6196 -183.9422 -183.2570
## [78] -182.5639 -181.8628 -181.1535 -180.4357 -179.7095 -178.9747 -178.2312
## [85] -177.4789 -176.7175 -175.9471 -175.1674 -174.3785 -173.5801 -172.7722
## [92] -171.9547 -171.1275 -170.2906 -169.4439 -168.5875 -167.7213 -166.8452
## [99] -165.9597 -165.0642 -164.1593 -163.2451 -162.3218 -161.3895 -160.4487
## [106] -159.4997 -158.5430 -157.5791 -156.6087 -155.6325 -154.6514 -153.6664
## [113] -152.6785 -151.6891 -150.6997 -149.7118 -148.7273 -147.7483 -146.7770
## [120] -145.8158 -144.8677 -143.9356 -143.0226 -142.1325 -141.2688 -140.4357
## [127] -139.6372 -138.8776 -138.1616 -137.4935 -136.8778 -136.3190 -135.8212
## [134] -135.3885 -135.0244 -134.7322 -134.5141 -134.3725 -134.3085 -134.3228
## [141] -134.4153 -134.5850 -134.8303 -135.1492 -135.5388 -135.9957 -136.5162
## [148] -137.0963 -137.7317 -138.4178 -139.1503 -139.9245 -140.7362 -141.5810
## [155] -142.4548 -143.3537 -144.2740 -145.2123 -146.1656 -147.1307 -148.1051
## [162] -149.0864 -150.0724 -151.0610 -152.0506 -153.0396 -154.0266 -155.0103
## [169] -155.9897 -156.9639 -157.9320 -158.8933 -159.8472 -160.7933 -161.7310
## [176] -162.6600 -163.5801 -164.4909 -165.3923 -166.2842 -167.1664 -168.0388
## [183] -168.9015 -169.7543 -170.5976 -171.4308 -172.2544 -173.0684 -173.8728
## [190] -174.6677 -175.4533 -176.2297 -176.9967 -177.7547 -178.5038 -179.2441
## [197] -179.9757 -180.6988 -181.4135 -182.1198 -182.8180 -183.5082 -184.1905
## [204] -184.8651 -185.5320 -186.1914 -186.8435 -187.4883 -188.1260 -188.7567
## [211] -189.3806 -189.9977 -190.6082 -191.2122 -191.8098 -192.4011 -192.9862
## [218] -193.5653 -194.1393 -194.7057 -195.2672 -195.8231 -196.3735 -196.9183
## [225] -197.4579 -197.9922 -198.5214 -199.0453 -199.5642 -200.0783 -200.5876
## [232] -201.0921 -201.5920 -202.0874 -202.5780 -203.0643 -203.5462 -204.0239
## [239] -204.4973 -204.9665 -205.4318 -205.8929 -206.3522 -206.8033 -207.2529
## [246] -207.6984 -208.1404 -208.5787 -209.0134 -209.4445
Find the Maximum Likelihood Estimate of mu:
mu_lik = as.data.frame(cbind(m, mplik)) %>%
filter(mplik == max(mplik)) %>% #keep only the row where mplik is maximized.
select(m)
mu_lik
ggplot()+
geom_line(aes(x = m, y = mplik))+
theme_bw()+
xlab(expression(mu)) + ylab(expression(l(mu))) +
geom_vline(xintercept = 34) # vertical line at (approximately) the maximizing mu
y<-c(39.91,43.97,23.19,38.87,39.81,22.70,27.72,
44.67,28.64,38.58,37.38,49.90,41.48,41.73,
51.45,35.72,33.41,76.60,32.02,30.35,38.29,
38.71,31.79,39.64,26.49)
mean(y)
## [1] 38.1208
length(y)
## [1] 25
plik <- function(alpha, data){
beta <- optim(par = 1, fn = function(beta_1){
-sum(dgamma(data, shape = alpha, scale = beta_1, log = TRUE))})$par
# scale must be passed by name: dgamma's third positional argument is the
# rate, and the run shown below hit that pitfall, which is why $par lands
# near mean(y) rather than at the gamma shape MLE.
-sum(dgamma(data, shape = alpha, scale = beta, log = TRUE))
}
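The inner step has a closed form here as well: for a gamma distribution with known shape alpha, the MLE of the scale is mean(data)/alpha. A minimal sketch of the same profile likelihood (plik_gamma is a name introduced here):
# Closed-form profile for the gamma model: given alpha, the MLE of the
# scale parameter is mean(data) / alpha.
plik_gamma <- function(alpha, data){
beta_hat <- mean(data) / alpha
-sum(dgamma(data, shape = alpha, scale = beta_hat, log = TRUE))
}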
optim(35,plik,data=y)
## $par
## [1] 37.91715
##
## $value
## [1] 100.4767
##
## $counts
## function gradient
## 64 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
m<-seq(20,50,length.out=250)
mplike <- numeric(250) # preallocate
for(i in 1:250){
mplike[i]=-plik(m[i],y)
}
mulike<- as.data.frame(cbind(m,mplike)) %>%
filter(mplike==max(mplike)) %>%
select(m)%>%
as.numeric()
ggplot() + geom_line(aes(x = m, y = mplike)) +
theme_bw() +
xlab(expression(alpha)) + ylab(expression(l(alpha))) +
geom_vline(xintercept = mulike, col = "green") # line at the maximizing alpha
Class lecture
Rotate the matrix:
n <- 276
m <- 3
p <- 6
crmat <- matrix(c(
1, 0.505, 0.569, 0.602, 0.621, 0.603,
0.505, 1, 0.422, 0.467, 0.482, 0.450,
0.569, 0.422, 1, 0.926, 0.877, 0.878,
0.602, 0.467, 0.926, 1, 0.874, 0.894,
0.621, 0.482, 0.877, 0.874, 1, 0.937,
0.603, 0.450, 0.878, 0.894, 0.937, 1
), nrow = 6, ncol = 6, byrow = TRUE)
Eigenvalues & eigenvectors
eg <- eigen(crmat)
lmda <- eg$values
lmda
## [1] 4.45644850 0.78240991 0.45842506 0.16883257 0.07908774 0.05479622
eL <- eg$vectors[ , 1:m]
eL
## [,1] [,2] [,3]
## [1,] -0.3507933 0.3956532 0.84666989
## [2,] -0.2862040 0.8146406 -0.50202982
## [3,] -0.4399784 -0.2632446 -0.11051604
## [4,] -0.4468851 -0.1972029 -0.09896293
## [5,] -0.4488806 -0.1614667 -0.06586761
## [6,] -0.4474933 -0.2134503 -0.06906640
Loadings: each retained eigenvector is scaled by the square root of its eigenvalue, L[, i] = sqrt(lambda[i]) * e[i].
L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m) {
L[, i] <- sqrt(lmda[i]) * eL[, i]
}
L
## [,1] [,2] [,3]
## [1,] -0.7405352 0.3499708 0.57325558
## [2,] -0.6041852 0.7205817 -0.33990980
## [3,] -0.9288076 -0.2328502 -0.07482720
## [4,] -0.9433880 -0.1744338 -0.06700492
## [5,] -0.9476006 -0.1428236 -0.04459705
## [6,] -0.9446719 -0.1888052 -0.04676285
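The loop above can be replaced by a single matrix product, since post-multiplying by a diagonal matrix scales each column; a quick equivalence check:
# Vectorized equivalent: scale each eigenvector column by the square
# root of its eigenvalue.
L_alt <- eL %*% diag(sqrt(lmda[1:m]))
all.equal(L, L_alt)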
Communalities and Specific Variance
cm <- apply(X = L^2,MARGIN = 1, FUN = sum)
cm
## [1] 0.9994939 0.9998164 0.9225019 0.9248977 0.9203343 0.9302392
sv <- 1 - cm
sv
## [1] 0.0005060766 0.0001835913 0.0774981111 0.0751022503 0.0796656638
## [6] 0.0697608285
Proportion Explained and Cumulative Proportion
prop <- diag(t(L) %*% L) / p # uses only the m = 3 retained columns
cumsum(prop)
## [1] 0.7427414 0.8731431 0.9495472
pr <- lmda/p # this uses all p eigenvalues, so proportions for the components beyond the m retained factors appear as well
cumsum(pr)
## [1] 0.7427414 0.8731431 0.9495472 0.9776860 0.9908673 1.0000000
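Both routes agree on the retained factors because each eigenvector has unit length, so the squared column sums of L recover the first m eigenvalues:
# colSums(L^2) reproduces lmda[1:m], hence diag(t(L) %*% L) / p and
# lmda[1:m] / p give identical proportions for the retained factors.
colSums(L^2)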
Rotation
varimax(L)
## $loadings
##
## Loadings:
## [,1] [,2] [,3]
## [1,] -0.354 0.244 0.903
## [2,] -0.234 0.949 0.211
## [3,] -0.921 0.166 0.218
## [4,] -0.903 0.214 0.252
## [5,] -0.887 0.229 0.284
## [6,] -0.907 0.192 0.264
##
## [,1] [,2] [,3]
## SS loadings 3.454 1.123 1.120
## Proportion Var 0.576 0.187 0.187
## Cumulative Var 0.576 0.763 0.950
##
## $rotmat
## [,1] [,2] [,3]
## [1,] 0.8546542 -0.3385584 -0.3936299
## [2,] 0.4824845 0.7979214 0.3612896
## [3,] 0.1917680 -0.4986980 0.8452960
MLE: factanal performs maximum-likelihood factor analysis on a covariance (here, correlation) matrix.
fit <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "none")
fit
##
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "none")
##
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
##
## Loadings:
## Factor1 Factor2 Factor3
## [1,] 0.622 0.284 0.143
## [2,] 0.484 0.660 0.121
## [3,] 0.937
## [4,] 0.992 -0.105
## [5,] 0.920 0.367
## [6,] 0.926 0.232
##
## Factor1 Factor2 Factor3
## SS loadings 4.187 0.520 0.236
## Proportion Var 0.698 0.087 0.039
## Cumulative Var 0.698 0.784 0.824
##
## The degrees of freedom for the model is 0 and the fit was 8e-04
Class Lecture Slide
theta0 <- 10
eps <- 10
result <- c()
while( eps > 0.001) {
l1 <- 257.59 - 15* theta0 # first derivative (score) at theta0
l2 <- -15                 # second derivative
theta1 <- theta0 - l1/l2  # Newton-Raphson update
eps <- abs(theta1 - theta0)
result <- rbind(result, c(theta0, theta1, l1, eps ))
theta0 <- theta1
}
result
## [,1] [,2] [,3] [,4]
## [1,] 10.00000 17.17267 107.59 7.172667
## [2,] 17.17267 17.17267 0.00 0.000000
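Note the one-step convergence: the score l1 = 257.59 - 15*theta is linear with constant second derivative l2 = -15, so a single Newton-Raphson step lands exactly on the root theta = 257.59/15 ≈ 17.1727; the second pass only confirms convergence (eps = 0).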
sigma_0_sq<-2.5
eps<-10
result<-c()
while(eps>0.001){
l1 <- -15/(2*sigma_0_sq) + 124.88/(2*((sigma_0_sq)^2)) # score for sigma^2
l2 <- 15/(2*sigma_0_sq^2) - 124.88/sigma_0_sq^3        # second derivative
sigma1 <- sigma_0_sq - l1/l2 # Newton-Raphson update
eps <- abs(sigma1 - sigma_0_sq)
result <- rbind(result, c(sigma_0_sq, sigma1, l1, eps))
sigma_0_sq <- sigma1
}
result
## [,1] [,2] [,3] [,4]
## [1,] 2.500000 3.529162 6.990400e+00 1.029162e+00
## [2,] 3.529162 4.819141 2.888103e+00 1.289979e+00
## [3,] 4.819141 6.247261 1.132290e+00 1.428120e+00
## [4,] 6.247261 7.495147 3.993398e-01 1.247886e+00
## [5,] 7.495147 8.174777 1.108349e-01 6.796305e-01
## [6,] 8.174777 8.319985 1.689693e-02 1.452075e-01
## [7,] 8.319985 8.325326 5.795058e-04 5.341750e-03
## [8,] 8.325326 8.325333 7.431753e-07 6.868025e-06
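Here convergence takes eight steps because the score l1 = -15/(2*sigma^2) + 124.88/(2*(sigma^2)^2) is nonlinear in sigma^2; the iteration settles at its root sigma^2 = 124.88/15 ≈ 8.3253, the usual MLE with n = 15 and sum of squared deviations 124.88.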
Slide Given
dat3 <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/3. Newton_Raphson/NewtonMethod3.txt")
n <- nrow(dat3)
result <- c()
mu0 <- 30
sigma2_0 <- 5
eps <- 5
while (eps > 0.001){
#g matrix
g1 <- (- sum(dat3 - mu0)) / sigma2_0
g2 <- n/(2*sigma2_0) - sum((dat3 - mu0)^2) / (2 * sigma2_0^2)
g <- matrix(c(g1,g2), nrow = 2)
# H matrix
h11 <- n/ sigma2_0
h12 = h21 <- sum(dat3)/(sigma2_0^2) - (n * mu0) / (sigma2_0^2)
h22 <- (-n/(2 * sigma2_0^2)) + sum((dat3 - mu0)^2) / (sigma2_0^3)
H <-matrix(c(h11, h12, h21, h22), nrow = 2, byrow = T)
#Theta
theta0 <- matrix(c(mu0, sigma2_0), nrow = 2)
theta1 <- theta0 - solve(H) %*% g
eps <- abs(max(theta1 - theta0))
result <- rbind(result,c(theta1[1,1], theta1[2,1], eps))
mu0 <- theta1[1,1]
sigma2_0 <- theta1[2,1]
}
result
## [,1] [,2] [,3]
## [1,] 38.08833 1.690092 8.088330e+00
## [2,] 35.07835 1.801138 1.110456e-01
## [3,] 34.96474 2.635538 8.343997e-01
## [4,] 34.91048 3.812803 1.177266e+00
## [5,] 34.88506 5.412478 1.599675e+00
## [6,] 34.87362 7.456511 2.044032e+00
## [7,] 34.86886 9.800152 2.343641e+00
## [8,] 34.86717 11.992387 2.192235e+00
## [9,] 34.86673 13.361853 1.369466e+00
## [10,] 34.86668 13.744016 3.821623e-01
## [11,] 34.86668 13.767125 2.310881e-02
## [12,] 34.86668 13.767203 7.797036e-05
Lecture Slide is Here.
library(mvtnorm)
set.seed(165)
y <- matrix(
data = rnorm(n = 25, mean = 20, sd = 5),
ncol = 1)
x <- matrix(
data = rep(x = 1, times = 25),
ncol = 1)
A <- diag(x = length(y)) - x %*% solve(t(x) %*% x) %*% t(x)
# Residual (restricted) likelihood: w = A'y has mean zero regardless of mu.
# A has rank n - 1, so one column is dropped below (A[, -length(y)]) to keep
# the transformed covariance matrix nonsingular.
reml_lik <- function(par, y, A){
w <- t(A) %*% y
mu <- rep(x = 0, times = length(w))
cmat <- t(A) %*% (par * diag(length(y))) %*% A
sum(dmvnorm(x = as.numeric(w),
mean = mu, sigma = cmat,log = TRUE))
}
reml_lik(par = 25, y = y, A = A[,-length(y)])
## [1] -72.02488
optimise(f = reml_lik,interval = c(0, 1e+100),
y = y, A = A[, -length(y)], maximum = TRUE)
## $maximum
## [1] 26.98599
##
## $objective
## [1] -71.98891
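For this intercept-only model, the REML maximum can be checked analytically: REML applied to an i.i.d. normal sample estimates sigma^2 by the unbiased sample variance, sum((y - ybar)^2)/(n - 1).
# Analytic check: the REML estimate is the n - 1 sample variance, which
# should reproduce the $maximum found by optimise (about 26.99).
var(as.numeric(y))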
Here is the Question
Solution:
Ans to Q no. 1(b)
x <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/plik.txt")
mle <- function(data,alpha){
-sum(log(dgamma(x = data,shape = alpha,scale = 1)))
}
optim(par = 10,fn = mle,data=x$V1,method = "Brent",upper = 100,lower = 10)
## $par
## [1] 37.30919
##
## $value
## [1] 100.3793
##
## $counts
## function gradient
## NA NA
##
## $convergence
## [1] 0
##
## $message
## NULL
Ans to Q no. 1(c)
xp <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/plik.txt")
plik <- function(alpha,data){
# optim varies beta here: alpha and data are fixed by name below, so par is
# matched to the remaining formal argument, beta.
bit <- optim(par = 1,fn = function(alpha,beta,data){
-sum(log(dgamma(x = data,shape = alpha,scale = beta)))},
data = data,
alpha = alpha,
method = "Brent",
lower = 0.1,
upper = 10)
opt_beta <- bit$par
-sum(log(dgamma(x = data,shape = alpha,scale = opt_beta)))
}
optim(par = 35,
fn = plik,
data = xp$V1,
method = "Brent",
lower = 10,
upper = 100)
## $par
## [1] 14.63149
##
## $value
## [1] 92.36291
##
## $counts
## function gradient
## NA NA
##
## $convergence
## [1] 0
##
## $message
## NULL
Ans to Q no. 2(a)
xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
sig2 <- 10 # known sigma^2
mu0 <- 10
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result1 <- c()
while (eps>0.001) {
l1 <- sum(dat1 - mu0) / sig2 # score for mu
l2 <- -n / sig2              # second derivative
mu1 <- mu0 - (l1/l2)
eps <- abs(mu1 - mu0)
result1 <- rbind(result1,c(mu1,eps))
mu0 <- mu1
}
result1
## [,1] [,2]
## [1,] -0.3989145 1.039891e+01
## [2,] -0.3989145 8.881784e-16
xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
sig2 <- 5
m0 <- 5
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result2 <- c()
while (eps>0.001) {
l1 <- -n/(2*sig2) + sum((dat1-m0)^2)/(2*sig2^2)
l2 <- n/(2*sig2^2) - sum((dat1-m0)^2)/(sig2^3)
sig2_1 <- sig2 - (l1/l2)
eps <- abs(sig2_1 - sig2)
result2 <- rbind(result2,c(sig2_1,eps))
sig2 <- sig2_1
}
result2
## [,1] [,2]
## [1,] 7.320477 2.3204769638
## [2,] 10.582627 3.2621505234
## [3,] 14.999636 4.4170089816
## [4,] 20.612890 5.6132536269
## [5,] 26.986327 6.3734364086
## [6,] 32.836524 5.8501976854
## [7,] 36.354872 3.5183481331
## [8,] 37.266356 0.9114833252
## [9,] 37.314350 0.0479939880
## [10,] 37.314474 0.0001239378
Ans to Q no. 2(c)
xd <- read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/newton.txt")
sig2 <- 2.5
m0 <- 30
n <- length(xd$V1)
dat1 <- xd$V1
eps <- 5
result3 <- c()
while (eps>0.001) {
g1 <- -sum(dat1-m0)/sig2
g2 <- n/(2*sig2) - sum((dat1-m0)^2)/(2*sig2^2)
g <- matrix(c(g1,g2),nrow = 2)
h11 <- n/sig2
h12 = h21 <- sum(dat1-m0)/(sig2^2)
# note: (dat1 - m0)^2; the run shown below used sum((dat1 - m0^2)), a
# misplaced parenthesis, which is why the iterations diverge and sigma^2
# even goes negative.
h22 <- -n/(2*sig2^2) + sum((dat1-m0)^2)/(sig2^3)
H <- matrix(c(h11,h12,h21,h22),nrow = 2,byrow = T)
th0 <- matrix(c(m0,sig2),nrow = 2)
th1 <- th0 - solve(H) %*% g
eps <- abs(max(th1-th0))
result3 <- rbind(result3,c(th1[1,1],th1[2,1],eps))
m0 <- th1[1,1]
sig2 <- th1[2,1]
}
result3
## [,1] [,2] [,3]
## [1,] 7.2470712 3.1288042 0.6288041689
## [2,] 1.4093942 3.8687799 0.7399757344
## [3,] -0.5213124 3.6069162 0.2618637492
## [4,] -0.2871857 0.3144069 0.2341266301
## [5,] -1.0715507 -1.5784046 0.7843649524
## [6,] 2.1830360 4.4803868 6.0587914138
## [7,] -0.1254516 4.9549194 0.4745325562
## [8,] -0.5434621 2.3358374 0.4180105072
## [9,] -0.1759433 -1.2672961 0.3675187451
## [10,] 6.3915973 -39.8623766 6.5675405985
## [11,] -0.4954643 -39.2955995 0.5667770907
## [12,] -0.5195186 -88.3812501 0.0240542955
## [13,] -0.5327113 -186.4303727 0.0131926795
## [14,] -0.5396167 -382.4825512 0.0069053506
## [15,] -0.5431488 -774.5669393 0.0035321855
## [16,] -0.5449351 -1558.7264315 0.0017862571
## [17,] -0.5458333 -3127.0409417 0.0008982058
Ans to Q no. 03
library(mvtnorm)
y <- as.matrix(read.table("G:/My Drive/Turin/4th Year/AST 430/1st_inc/INC Soln/reml.txt"), ncol = 1)
x<-matrix(data = rep(1,times=25),ncol = 1)
A<-diag(x = length(y))- x%*%solve(t(x)%*%x)%*%t(x)
reml_lik<-function(par,y,A){
w=t(A)%*%y
mu=rep(x = 0,times=length(w))
cmat<-t(A)%*%(par*diag(x = length(y)))%*%A
sum(dmvnorm(x = as.numeric(w),mean = mu,sigma = cmat,log = T))
}
optimise(f = reml_lik,interval = c(0,1e+100),y=y,
A=A[,-length(y)],maximum = TRUE)
## $maximum
## [1] 27.18862
##
## $objective
## [1] -72.07866
Multivariate Analysis (PCA)
Set the working directory with setwd("~/AST430/Lecture - Factor Analysis"), or manually via Session -> Set Working Directory -> Choose Directory.
library(readr)
library(tidyverse)
library(corrr)
library(ggcorrplot)
protein <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/protein.csv")
head(protein)
prot_num <- protein %>% select(-Country)
prot_normalized <- scale(prot_num) # subtracting the mean and dividing by the standard deviation
cor_mat <- cor(prot_normalized)
#Specifically designed to visualize correlation matrices
ggcorrplot::ggcorrplot(cor_mat)
#Eigenvalues (variances explained by each principal component).
#Eigenvectors (also called "loadings" or contributions of variables to each principal component).
pca.data <- princomp(prot_num, cor = T)
pca.data
## Call:
## princomp(x = prot_num, cor = T)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 2.0016087 1.2786710 1.0620355 0.9770691 0.6810568 0.5702026 0.5211586 0.3410160
## Comp.9
## 0.3148204
##
## 9 variables and 25 observations.
summary(pca.data)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.0016087 1.2786710 1.0620355 0.9770691 0.6810568
## Proportion of Variance 0.4451597 0.1816666 0.1253244 0.1060738 0.0515376
## Cumulative Proportion 0.4451597 0.6268263 0.7521507 0.8582245 0.9097621
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.57020257 0.52115865 0.34101599 0.31482043
## Proportion of Variance 0.03612566 0.03017848 0.01292132 0.01101243
## Cumulative Proportion 0.94588776 0.97606624 0.98898757 1.00000000
pca.data$loadings[, 1:2]
## Comp.1 Comp.2
## RedMeat 0.3026094 0.05625165
## WhiteMeat 0.3105562 0.23685334
## Eggs 0.4266785 0.03533576
## Milk 0.3777273 0.18458877
## Fish 0.1356499 -0.64681970
## Cereals -0.4377434 0.23348508
## Starch 0.2972477 -0.35282564
## Nuts -0.4203344 -0.14331056
## Fr&Veg -0.1104199 -0.53619004
#screeplot:
screeplot(pca.data)
screeplot(pca.data, type = 'lines')
Fit a model with 3 factors using PCA & MLE and then rotate the matrix.
crmat <- matrix(c(1,0.505,0.569,0.602,0.621,0.603,
0.505,1,0.422,0.467,0.482,0.450,
0.569,0.422,1,0.926,0.877,0.878,
0.602,0.467,0.926,1,0.874,0.894,
0.621,0.482,0.877,0.874,1,0.937,
0.603,0.450,0.878,0.894,0.937,1),
nrow=6,ncol=6,byrow=TRUE)
crmat
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.000 0.505 0.569 0.602 0.621 0.603
## [2,] 0.505 1.000 0.422 0.467 0.482 0.450
## [3,] 0.569 0.422 1.000 0.926 0.877 0.878
## [4,] 0.602 0.467 0.926 1.000 0.874 0.894
## [5,] 0.621 0.482 0.877 0.874 1.000 0.937
## [6,] 0.603 0.450 0.878 0.894 0.937 1.000
#n<-276
### PCA :
#Eigen Values
p <- 6 #total variables
m <- 3 #since 3 factors
lmda <- eigen(crmat)$values[1:m]
lmda
## [1] 4.4564485 0.7824099 0.4584251
#Eigen Vectors
eL <- eigen(crmat)$vectors[,1:m]
eL
## [,1] [,2] [,3]
## [1,] -0.3507933 0.3956532 0.84666989
## [2,] -0.2862040 0.8146406 -0.50202982
## [3,] -0.4399784 -0.2632446 -0.11051604
## [4,] -0.4468851 -0.1972029 -0.09896293
## [5,] -0.4488806 -0.1614667 -0.06586761
## [6,] -0.4474933 -0.2134503 -0.06906640
# Loadings
L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m){
L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.9994939 0.9998164 0.9225019 0.9248977 0.9203343 0.9302392
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.0005060766 0.0001835913 0.0774981111 0.0751022503 0.0796656638
## [6] 0.0697608285
# Proportion explained:
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.74274142 0.13040165 0.07640418
# Cumulative proportion:
cumsum(prop)
## [1] 0.7427414 0.8731431 0.9495472
# So about 95% of the total variance is explained by the three factors.
### Rotation:
varimax(L)
## $loadings
##
## Loadings:
## [,1] [,2] [,3]
## [1,] -0.354 0.244 0.903
## [2,] -0.234 0.949 0.211
## [3,] -0.921 0.166 0.218
## [4,] -0.903 0.214 0.252
## [5,] -0.887 0.229 0.284
## [6,] -0.907 0.192 0.264
##
## [,1] [,2] [,3]
## SS loadings 3.454 1.123 1.120
## Proportion Var 0.576 0.187 0.187
## Cumulative Var 0.576 0.763 0.950
##
## $rotmat
## [,1] [,2] [,3]
## [1,] 0.8546542 -0.3385584 -0.3936299
## [2,] 0.4824845 0.7979214 0.3612896
## [3,] 0.1917680 -0.4986980 0.8452960
### MLE Method:
#factanal performs MLE factor analysis on Covariance matrix:
fit <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "none")
fit
##
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "none")
##
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
##
## Loadings:
## Factor1 Factor2 Factor3
## [1,] 0.622 0.284 0.143
## [2,] 0.484 0.660 0.121
## [3,] 0.937
## [4,] 0.992 -0.105
## [5,] 0.920 0.367
## [6,] 0.926 0.232
##
## Factor1 Factor2 Factor3
## SS loadings 4.187 0.520 0.236
## Proportion Var 0.698 0.087 0.039
## Cumulative Var 0.698 0.784 0.824
##
## The degrees of freedom for the model is 0 and the fit was 8e-04
fit2 <- factanal(covmat = crmat, n.obs = n, factors = m, rotation = "varimax")
fit2
##
## Call:
## factanal(factors = m, covmat = crmat, n.obs = n, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.512 0.317 0.117 0.005 0.019 0.088
##
## Loadings:
## Factor1 Factor2 Factor3
## [1,] 0.466 0.502 0.139
## [2,] 0.208 0.798
## [3,] 0.889 0.289
## [4,] 0.935 0.345
## [5,] 0.823 0.359 0.418
## [6,] 0.851 0.323 0.288
##
## Factor1 Factor2 Factor3
## SS loadings 3.326 1.324 0.293
## Proportion Var 0.554 0.221 0.049
## Cumulative Var 0.554 0.775 0.824
##
## The degrees of freedom for the model is 0 and the fit was 8e-04
Use the PCA and MLE methods on the three datasets (protein.csv, pulp_paper.csv, and crime.csv) for factor analysis; find the loadings, communalities, specific variances, and proportions explained, and suggest names for the factors before and after rotation.
protein <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/protein.csv")
## Rows: 25 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (9): RedMeat, WhiteMeat, Eggs, Milk, Fish, Cereals, Starch, Nuts, Fr&Veg
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(protein)
protein<-protein %>% select(-Country)
protein_stand<-scale(protein)
protein_cor<-cor(protein_stand)
ggcorrplot(protein_cor)
# Note: this runs PCA on the 9 x 9 correlation matrix itself rather than on
# the data; the standard call is princomp(protein, cor = TRUE) as above.
# The same applies to the pulp and crime analyses below.
pca_data<-princomp(protein_cor)
pca_data
## Call:
## princomp(x = protein_cor)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## 1.278539e+00 5.187769e-01 3.715581e-01 2.854996e-01 1.517555e-01 1.000892e-01
## Comp.7 Comp.8 Comp.9
## 8.461374e-02 3.671406e-02 7.958511e-09
##
## 9 variables and 9 observations.
summary(pca_data)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.2785390 0.5187769 0.37155811 0.2854996 0.15175548
## Proportion of Variance 0.7550709 0.1243143 0.06376953 0.0376505 0.01063772
## Cumulative Proportion 0.7550709 0.8793852 0.94315474 0.9808052 0.99144296
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.100089157 0.084613737 0.0367140593 7.958511e-09
## Proportion of Variance 0.004627366 0.003307056 0.0006226222 2.925657e-17
## Cumulative Proportion 0.996070322 0.999377378 1.0000000000 1.000000e+00
pca_data$loadings[, 1:2]
## Comp.1 Comp.2
## RedMeat 0.2902937 0.09510150
## WhiteMeat 0.3131864 0.26392299
## Eggs 0.4131713 0.07419838
## Milk 0.3852342 0.15393922
## Fish 0.1165053 -0.69780771
## Cereals -0.4342505 0.27531862
## Starch 0.2796041 -0.36379635
## Nuts -0.4400339 -0.07686551
## Fr&Veg -0.1567571 -0.43715631
#screeplot
screeplot(pca_data, type = "l")
pulp<-read.table("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/pulp_paper.dat")
pulp<-pulp[-1,] # drop the first row, which holds the variable names
pulp <- data.frame(lapply(pulp, as.numeric))
head(pulp)
pulp_stand<-scale(pulp)
pulp_cor<-cor(pulp_stand)
ggcorrplot(pulp_cor)
pca<-princomp(pulp_cor)
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.4030875 0.28721942 0.098671244 0.0283030687
## Proportion of Variance 0.9547186 0.04000677 0.004721577 0.0003884839
## Cumulative Proportion 0.9547186 0.99472541 0.999446987 0.9998354709
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.0167774417 7.384949e-03 1.800878e-03 0
## Proportion of Variance 0.0001365078 2.644849e-05 1.572804e-06 0
## Cumulative Proportion 0.9999719787 9.999984e-01 1.000000e+00 1
pca$loadings[, 1:2]
## Comp.1 Comp.2
## V1 0.3359621 0.3387672
## V2 0.3273338 0.4609865
## V3 0.3473831 0.2988179
## V4 0.3425073 0.2342509
## V5 0.3467051 -0.5252258
## V6 0.3543481 -0.3885918
## V7 -0.3843774 0.3184464
## V8 0.3853347 -0.0169917
screeplot(pca, type = "l")
crime <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_1/Data/Crime.csv")
## New names:
## Rows: 50 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (4): Murder, Assault, UrbanPop, Rape
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(crime)
crime<-crime %>% select(-...1)
crime_stand<-scale(crime)
crime_cor<-cor(crime_stand)
ggcorrplot(crime_cor)
pca<-princomp(crime_cor)
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.5564763 0.2037617 0.10523382 0
## Proportion of Variance 0.8548193 0.1146110 0.03056973 0
## Cumulative Proportion 0.8548193 0.9694303 1.00000000 1
pca$loadings[,1:2]
## Comp.1 Comp.2
## Murder 0.6164996 0.23245381
## Assault 0.4578583 -0.04103407
## UrbanPop -0.6216128 -0.04109728
## Rape 0.1545688 -0.97087200
screeplot(pca, type = "l")
data_1<-matrix(c(1.00, .02, .96, .42, .01,
.02, 1.00, .13, .71, .85,
.96, .13, 1.00, .50, .11,
.42, .71, .50, 1.00, .79,
.01, .85, .11, .79, 1.00), nrow = 5, byrow = TRUE)
n<-276
p <- 5 #total variables
m <- 2 #2 eigenvalues are greater than unity
lmda <- eigen(data_1)$values[1:m]
lmda
## [1] 2.853090 1.806332
#Eigen Vectors
eL <- eigen(data_1)$vectors[,1:m]
eL
## [,1] [,2]
## [1,] 0.3314539 -0.60721643
## [2,] 0.4601593 0.39003172
## [3,] 0.3820572 -0.55650828
## [4,] 0.5559769 0.07806457
## [5,] 0.4725608 0.40418799
# Loadings
L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m){
L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
## [,1] [,2]
## [1,] 0.5598618 -0.8160981
## [2,] 0.7772594 0.5242021
## [3,] 0.6453364 -0.7479464
## [4,] 0.9391057 0.1049187
## [5,] 0.7982069 0.5432281
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.9794614 0.8789200 0.9758829 0.8929275 0.9322311
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.02053865 0.12107998 0.02411712 0.10707250 0.06776888
# Proportion explained:
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.5706181 0.3612665
# Cumulative proportion:
cumsum(prop)
## [1] 0.5706181 0.9318846
# So about 93% of the total variance is explained by the two factors.
### Rotation:
varimax(L)
## $loadings
##
## Loadings:
## [,1] [,2]
## [1,] -0.989
## [2,] 0.937
## [3,] 0.130 -0.979
## [4,] 0.843 -0.427
## [5,] 0.965
##
## [,1] [,2]
## SS loadings 2.539 2.121
## Proportion Var 0.508 0.424
## Cumulative Var 0.508 0.932
##
## $rotmat
## [,1] [,2]
## [1,] 0.8365697 -0.5478605
## [2,] 0.5478605 0.8365697
### MLE Method:
#factanal performs MLE factor analysis on Covariance matrix:
fit <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "none")
fit
##
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "none")
##
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
##
## Loadings:
## Factor1 Factor2
## [1,] 0.976 -0.139
## [2,] 0.150 0.860
## [3,] 0.979
## [4,] 0.535 0.738
## [5,] 0.146 0.963
##
## Factor1 Factor2
## SS loadings 2.241 2.232
## Proportion Var 0.448 0.446
## Cumulative Var 0.448 0.895
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
fit1 <- factanal(covmat = data_1, n.obs = n, factors = m, rotation = "varimax")
fit1
##
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
##
## Loadings:
## Factor1 Factor2
## [1,] 0.985
## [2,] 0.873
## [3,] 0.131 0.971
## [4,] 0.817 0.405
## [5,] 0.973
##
## Factor1 Factor2
## SS loadings 2.396 2.078
## Proportion Var 0.479 0.416
## Cumulative Var 0.479 0.895
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
data_2<-matrix(c(1.000, .632, .511, .115, .155,
.632, 1.000, .574, .322, .213,
.511, .574, 1.000, .183, .146,
.115, .322, .183, 1.000, .683,
.155, .213, .146, .683, 1.000), nrow = 5, byrow = TRUE)
n<-276
p <- 5 #total variables
m <- 2 #2 eigenvalues are greater than unity
lmda <- eigen(data_2)$values
lmda
## [1] 2.4376148 1.4062612 0.5001857 0.4000328 0.2559055
#Eigen Vectors
eL <- eigen(data_2)$vectors[,1:m]
eL
## [,1] [,2]
## [1,] -0.4692143 0.3677864
## [2,] -0.5322254 0.2364426
## [3,] -0.4652234 0.3154110
## [4,] -0.3873348 -0.5850179
## [5,] -0.3607117 -0.6058861
# Loadings
L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m){
L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
## [,1] [,2]
## [1,] -0.7325779 0.4361428
## [2,] -0.8309562 0.2803875
## [3,] -0.7263470 0.3740330
## [4,] -0.6047405 -0.6937486
## [5,] -0.5631743 -0.7184954
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.7268909 0.7691054 0.6674806 0.8469982 0.8334010
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.2731091 0.2308946 0.3325194 0.1530018 0.1665990
# Proportion explained:
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.4875230 0.2812522
# Cumulative proportion:
cumsum(prop)
## [1] 0.4875230 0.7687752
# So about 77% of the total variance is explained by the two factors.
### Rotation:
varimax(L)
## $loadings
##
## Loadings:
## [,1] [,2]
## [1,] -0.852
## [2,] -0.849 -0.220
## [3,] -0.813
## [4,] -0.126 -0.912
## [5,] -0.910
##
## [,1] [,2]
## SS loadings 2.128 1.715
## Proportion Var 0.426 0.343
## Cumulative Var 0.426 0.769
##
## $rotmat
## [,1] [,2]
## [1,] 0.8367920 0.5475208
## [2,] -0.5475208 0.8367920
### MLE Method:
#factanal performs MLE factor analysis on the covariance matrix.
# Note: the original run used covmat = data_1 here, so the output below
# repeats the data_1 results; the call should reference data_2.
fit <- factanal(covmat = data_2, n.obs = n, factors = m, rotation = "none")
fit
##
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "none")
##
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
##
## Loadings:
## Factor1 Factor2
## [1,] 0.976 -0.139
## [2,] 0.150 0.860
## [3,] 0.979
## [4,] 0.535 0.738
## [5,] 0.146 0.963
##
## Factor1 Factor2
## SS loadings 2.241 2.232
## Proportion Var 0.448 0.446
## Cumulative Var 0.448 0.895
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
fit1 <- factanal(covmat = data_2, n.obs = n, factors = m, rotation = "varimax")
fit1
##
## Call:
## factanal(factors = m, covmat = data_1, n.obs = n, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.028 0.237 0.040 0.168 0.052
##
## Loadings:
## Factor1 Factor2
## [1,] 0.985
## [2,] 0.873
## [3,] 0.131 0.971
## [4,] 0.817 0.405
## [5,] 0.973
##
## Factor1 Factor2
## SS loadings 2.396 2.078
## Proportion Var 0.479 0.416
## Cumulative Var 0.479 0.895
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 6.32 on 1 degree of freedom.
## The p-value is 0.0119
data_3<-matrix(c(1.000, .6386, .4752, .3227, .5520, .3262, .3509, .4008, .1821, -.0352,
.6386, 1.000, .4953, .5668, .4706, .3520, .3998, .5167, .3102, .1012,
.4752, .4953, 1.000, .4357, .2539, .2812, .7926, .4728, .4682, -.0120,
.3227, .5668, .4357, 1.000, .3449, .3503, .3657, .6040, .2344, .2380,
.5520, .4706, .2539, .3449, 1.000, .1546, .2100, .4213, .2116, .4125,
.3262, .3520, .2812, .3503, .1546, 1.000, .2553, .4163, .1712, .0002,
.3509, .3998, .7926, .3657, .2100, .2553, 1.000, .4036, .4179, .0109,
.4008, .5167, .4728, .6040, .4213, .4163, .4036, 1.000, .3151, .2395,
.1821, .3102, .4682, .2344, .2116, .1712, .4179, .3151, 1.000,.0983,
-.0352, .1012, -.0120, .2380, .4125, .0002, .0109, .2395, .0983,1.000),
nrow = 10, byrow = TRUE)
n<-280
p <- 10 #total variables
m <- 4 #retain four factors (three eigenvalues exceed unity and a fourth is close to 1)
lmda <- eigen(data_3)$values[1:m]
lmda
## [1] 4.2129434 1.3895855 1.0594001 0.9177767
#Eigen Vectors
eL <- eigen(data_3)$vectors[,1:m]
eL
## [,1] [,2] [,3] [,4]
## [1,] -0.3391615 -0.01874585 0.455010859 0.43461912
## [2,] -0.3861285 -0.06376924 0.247452683 0.11964658
## [3,] -0.3756562 0.36853015 -0.191728996 0.11708682
## [4,] -0.3462877 -0.15328490 -0.004441842 -0.38355709
## [5,] -0.2946590 -0.46544520 0.043795306 0.41438248
## [6,] -0.2497538 0.07016977 0.361172572 -0.58568225
## [7,] -0.3360646 0.38720230 -0.280390394 0.08112517
## [8,] -0.3707459 -0.13775864 -0.017738766 -0.31756023
## [9,] -0.2525672 0.21345999 -0.504151016 0.07679286
## [10,] -0.1070443 -0.63264530 -0.479048235 -0.08891610
# Loadings
L <- matrix(NA, nrow = p, ncol = m)
for(i in 1:m){
L[,i] <- sqrt(lmda[i] ) * eL[,i]
}
L
## [,1] [,2] [,3] [,4]
## [1,] -0.6961444 -0.02209774 0.468329773 0.41636799
## [2,] -0.7925464 -0.07517162 0.254696029 0.11462221
## [3,] -0.7710515 0.43442586 -0.197341218 0.11216995
## [4,] -0.7107713 -0.18069329 -0.004571862 -0.36745024
## [5,] -0.6048010 -0.54866998 0.045077267 0.39698116
## [6,] -0.5126311 0.08271661 0.371744685 -0.56108748
## [7,] -0.6897880 0.45643672 -0.288597881 0.07771845
## [8,] -0.7609729 -0.16239082 -0.018258009 -0.30422480
## [9,] -0.5184056 0.25162810 -0.518908343 0.07356806
## [10,] -0.2197133 -0.74576659 -0.493070762 -0.08518221
# Communalities
cm <- apply(X = L^2, MARGIN = 1, FUN = sum)
cm
## [1] 0.8778004 0.7117888 0.8347719 0.6728865 0.8264490 0.7226459 0.7734708
## [8] 0.6983366 0.6067392 0.8548165
#Specific Variance
sv <- 1 - cm
sv
## [1] 0.1221996 0.2882112 0.1652281 0.3271135 0.1735510 0.2773541 0.2265292
## [8] 0.3016634 0.3932608 0.1451835
# Proportion explained:
prop <- diag(t(L) %*% L)/p
prop
## [1] 0.42129434 0.13895855 0.10594001 0.09177767
# Cumulative proportion:
cumsum(prop)
## [1] 0.4212943 0.5602529 0.6661929 0.7579706
# So about 76% of the total variance is explained by the four factors.
### Rotation:
varimax(L)
## $loadings
##
## Loadings:
## [,1] [,2] [,3] [,4]
## [1,] -0.885 0.139 -0.182 -0.205
## [2,] -0.664 -0.291 -0.429
## [3,] -0.302 -0.819 -0.252
## [4,] -0.221 -0.293 -0.267 -0.683
## [5,] -0.746 -0.507
## [6,] -0.108 0.161 -0.826
## [7,] -0.185 -0.832 -0.204
## [8,] -0.277 -0.293 -0.324 -0.656
## [9,] -0.188 -0.754
## [10,] -0.921
##
## [,1] [,2] [,3] [,4]
## SS loadings 2.045 1.376 2.234 1.924
## Proportion Var 0.205 0.138 0.223 0.192
## Cumulative Var 0.205 0.342 0.566 0.758
##
## $rotmat
## [,1] [,2] [,3] [,4]
## [1,] 0.5683672 0.18381686 0.5850636 0.5485169
## [2,] 0.2499160 0.79875769 -0.5445967 0.0542454
## [3,] -0.4876786 0.57138291 0.5835154 -0.3085465
## [4,] -0.6137348 0.04146187 -0.1436133 0.7752327
### MLE Method:
#factanal performs MLE factor analysis on Covariance matrix:
fit <- factanal(covmat = data_3, n.obs = n, factors = m, rotation = "none")
fit
##
## Call:
## factanal(factors = m, covmat = data_3, n.obs = n, rotation = "none")
##
## Uniquenesses:
## [1] 0.010 0.388 0.089 0.327 0.196 0.734 0.304 0.420 0.725 0.600
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## [1,] 0.993
## [2,] 0.665 0.252 0.239 0.220
## [3,] 0.530 0.777 -0.141
## [4,] 0.363 0.428 0.421 0.425
## [5,] 0.571 0.620 -0.304
## [6,] 0.343 0.190 0.323
## [7,] 0.401 0.718 -0.102
## [8,] 0.439 0.407 0.390 0.263
## [9,] 0.218 0.461
## [10,] 0.609 -0.145
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.686 1.794 1.187 0.539
## Proportion Var 0.269 0.179 0.119 0.054
## Cumulative Var 0.269 0.448 0.567 0.621
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 15.74 on 11 degrees of freedom.
## The p-value is 0.151
fit1 <- factanal(covmat = data_3, n.obs = n, factors = m, rotation = "varimax")
fit1
##
## Call:
## factanal(factors = m, covmat = data_3, n.obs = n, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.010 0.388 0.089 0.327 0.196 0.734 0.304 0.420 0.725 0.600
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## [1,] 0.205 0.296 0.928
## [2,] 0.280 0.554 0.451 0.155
## [3,] 0.883 0.278 0.228
## [4,] 0.254 0.739 0.242
## [5,] 0.142 0.151 0.519 0.701
## [6,] 0.136 0.465 0.173
## [7,] 0.794 0.220 0.133
## [8,] 0.314 0.612 0.168 0.279
## [9,] 0.477 0.160 0.139
## [10,] 0.111 0.619
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 1.958 1.719 1.471 1.057
## Proportion Var 0.196 0.172 0.147 0.106
## Cumulative Var 0.196 0.368 0.515 0.621
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 15.74 on 11 degrees of freedom.
## The p-value is 0.151
Here is another pdf of the Assignment.
The class Lecture is Here.
The Slide is given…
Equal Variance
xb1 <- matrix(c(-0.0065,-0.0390), ncol=1)
xb2 <- matrix(c(-0.2483,0.0262), ncol=1)
Sp_inv <- matrix(c(131.158,-90.423,-90.423,108.147), nrow = 2, byrow = T)
#Discriminant Function
t(xb1- xb2) %*% Sp_inv
## [,1] [,2]
## [1,] 37.60958 -28.91547
#Allocation Rule
(1/2) * t(xb1 - xb2) %*% Sp_inv %*% (xb1 + xb2)
## [,1]
## [1,] -4.606402
#Allocate a woman for whom x1 = -0.210 and x2 = -0.044: her score below (-6.63) falls short of the cutoff (-4.61), so she is allocated to the second population.
x0<-matrix(c(-.210, -.044), ncol = 1)
t(xb1- xb2) %*% Sp_inv %*% x0
## [,1]
## [1,] -6.625732
library(readr)
library(tidyverse)
rmowners <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination/rmowners.csv")
#View(rmowners)
dat <- rmowners %>%
group_split(Owner)
dat
## <list_of<
## tbl_df<
## Income : double
## Lot_Size: double
## Owner : character
## >
## >[2]>
## [[1]]
## # A tibble: 12 × 3
## Income Lot_Size Owner
## <dbl> <dbl> <chr>
## 1 105 19.6 No
## 2 82.8 20.8 No
## 3 94.8 17.2 No
## 4 73.2 20.4 No
## 5 114 17.6 No
## 6 79.2 17.6 No
## 7 89.4 16 No
## 8 96 18.4 No
## 9 77.4 16.4 No
## 10 63 18.8 No
## 11 81 14 No
## 12 93 14.8 No
##
## [[2]]
## # A tibble: 12 × 3
## Income Lot_Size Owner
## <dbl> <dbl> <chr>
## 1 90 18.4 Yes
## 2 116. 16.8 Yes
## 3 94.8 21.6 Yes
## 4 91.5 20.8 Yes
## 5 117 23.6 Yes
## 6 140. 19.2 Yes
## 7 138 17.6 Yes
## 8 113. 22.4 Yes
## 9 99 20 Yes
## 10 123 20.8 Yes
## 11 81 22 Yes
## 12 111 20 Yes
#Creating two datasets, owners and non-owners:
dat_no<- dat[[1]] %>%
select(-Owner)
dat_yes<- dat[[2]] %>%
select(-Owner)
#Calculating mean and variance
x1_bar <- as.numeric(apply(dat_yes, 2, mean))
x2_bar <- as.numeric(apply(dat_no, 2, mean))
s_1 <- matrix(as.numeric(var(dat_yes)), nrow = 2, byrow = T)
s_2 <- matrix(as.numeric(var(dat_no)), nrow = 2, byrow = T)
n1 <- nrow(dat_yes)
n2 <- nrow(dat_no)
# Inverse of the pooled covariance matrix
spinv <- solve( ( (n1-1) * s_1 + (n2-1) * s_2 ) / (n1 + n2 -2) )
a <- t(x1_bar - x2_bar) %*% spinv
a
## [,1] [,2]
## [1,] 0.1002303 0.7851847
m<- 0.5 * a %*% (x1_bar + x2_bar)
m
## [,1]
## [1,] 24.74567
#(ii) Classify a person with income 150 USD and lot size 75: the score below (73.92) exceeds the cutoff m (24.75), so this person is classified as an owner.
x1<-matrix(c(150, 75), ncol = 1)
a %*% x1
## [,1]
## [1,] 73.9234
Classifying normal populations with unequal variances
#calculating k
k <- 0.5 * log ( det(s_1)/ det(s_2) ) +
0.5 *( (t(x1_bar) %*% solve(s_1) %*% x1_bar) -
(t(x2_bar) %*% solve(s_2) %*% x2_bar) )
k
## [,1]
## [1,] 36.14149
# quadratic coefficient: -0.5 * (S1^-1 - S2^-1)
-0.5 * (solve(s_1) - solve(s_2))
## [,1] [,2]
## [1,] 0.0009397948 -0.003089851
## [2,] -0.0030898514 -0.022789795
# linear coefficient: x1_bar' S1^-1 - x2_bar' S2^-1
t(x1_bar) %*% solve(s_1) - t(x2_bar) %*% solve(s_2)
## [,1] [,2]
## [1,] 0.0379567 2.258857
# Classifying the same person under the unequal-variance rule requires the
# full quadratic score rather than the linear score a %*% x.
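A sketch assembling that score, with quad_score a name introduced here (allocate to the first population when the score is at least k):
# Quadratic discriminant score:
# -0.5 * x0'(S1^-1 - S2^-1) x0 + (x1_bar' S1^-1 - x2_bar' S2^-1) x0
quad_score <- function(x0){
as.numeric(
-0.5 * t(x0) %*% (solve(s_1) - solve(s_2)) %*% x0 +
(t(x1_bar) %*% solve(s_1) - t(x2_bar) %*% solve(s_2)) %*% x0
)
}
# e.g. quad_score(c(150, 75)) >= k would allocate the person to the owner population.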
library(ggordiplots)
library(psych)
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
pairs.panels(iris[1:4],
gap = 0,
bg = c("red", "green", "blue")[iris$Species],
pch = 21)
library(tidyverse)
set.seed(123)
head(iris)
iris %>% count(Species)
Dividing the dataset into training and test data:
# First create a indicator
ind <- sample(x = c(1, 2),
size = nrow(iris),
replace = TRUE,
prob = c(0.6,0.4))
ind
## [1] 1 2 1 2 2 1 1 2 1 1 2 1 2 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 1 1 2
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 1 1 1 2 1 2 2 2 1 2 2 2 1
## [75] 1 1 1 2 1 1 1 2 1 2 1 1 2 2 2 1 1 2 1 2 1 1 2 1 1 1 1 1 1 2 1 2 2 2 1 1 2
## [112] 1 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 1 1 1 2 1 1 1
## [149] 1 2
training <- iris[ind==1,]
testing <- iris[ind==2,]
#Linear Discrimination analysis
library(MASS)
linear <- lda(Species~., training)
linear
## Call:
## lda(Species ~ ., data = training)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3370787 0.3370787 0.3258427
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 4.946667 3.380000 1.443333 0.250000
## versicolor 5.943333 2.803333 4.240000 1.316667
## virginica 6.527586 2.920690 5.489655 2.048276
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.3629008 -0.05215114
## Sepal.Width 2.2276982 -1.47580354
## Petal.Length -1.7854533 1.60918547
## Petal.Width -3.9745504 -4.10534268
##
## Proportion of trace:
## LD1 LD2
## 0.9932 0.0068
#confusion matrix
p1 <- predict(linear, training)$class
tab <- table(Predicted = p1, Actual = training$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 30 0 0
## versicolor 0 30 0
## virginica 0 0 29
p2 <- predict(linear, testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)
tab1
## Actual
## Predicted setosa versicolor virginica
## setosa 20 0 0
## versicolor 0 19 1
## virginica 0 1 20
#Error rate
err_rate_lda <- (1- sum(diag(tab1))/sum(tab1))*100
err_rate_lda
## [1] 3.278689
#Quadratic discriminant analysis
quad <- qda(Species~., training)
p3 <- predict(quad, testing)$class
tab2 <- table(Predicted = p3, Actual = testing$Species)
tab2
## Actual
## Predicted setosa versicolor virginica
## setosa 20 0 0
## versicolor 0 16 2
## virginica 0 4 19
#Error rate qda
err_rate_qda <- (1- sum(diag(tab2))/sum(tab2))*100
err_rate_qda
## [1] 9.836066
# Hold-out (leave-one-out) procedure:
cnt <- 0
for(i in 1:nrow(iris)){
training <- iris[-i,]
testing <- iris[i,]
linear <- lda(Species~., training)
# classify the held-out observation and count correct classifications
p1 <- predict(linear, testing)$class
if(p1 == testing$Species) cnt <- cnt+1
}
#Error rate:
(1-cnt/nrow(iris))*100
## [1] 2
## Assignment
# Perform this hold-out procedure, taking each of the 150 observations in turn, and plot the difference.
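A hedged sketch of one reading of this assignment: hold out each of the 150 observations in turn, record whether it is classified correctly, and plot the running error rate (the names correct and fit_i are introduced here):
# Leave-one-out over all observations, keeping a per-observation record
# so the running error rate can be plotted.
correct <- logical(nrow(iris))
for(i in 1:nrow(iris)){
fit_i <- lda(Species ~ ., data = iris[-i, ])
correct[i] <- predict(fit_i, iris[i, ])$class == iris$Species[i]
}
plot(cumsum(!correct) / seq_along(correct), type = "l",
xlab = "Observations held out so far", ylab = "Running error rate")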
Logistic Regression
data1 <- iris %>% filter(Species != "setosa")
data1$Species <- droplevels(data1$Species)
table(data1$Species)
##
## versicolor virginica
## 50 50
ind <- sample(x = 2,
size = nrow(data1),
replace = TRUE,
prob = c(0.6,0.4))
training <-data1[ind==1,]
testing <-data1[ind==2,]
linear <- lda(Species~., training)
linear
## Call:
## lda(Species ~ ., data = training)
##
## Prior probabilities of groups:
## versicolor virginica
## 0.483871 0.516129
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## versicolor 6.030000 2.783333 4.323333 1.346667
## virginica 6.678125 2.981250 5.665625 2.009375
##
## Coefficients of linear discriminants:
## LD1
## Sepal.Length -1.469118
## Sepal.Width -1.159612
## Petal.Length 2.365995
## Petal.Width 2.849550
#confusion matrix
p1 <- predict(linear, training)$class
p2 <- predict(linear, testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)
Logistic model, for comparison
model_logistic = glm(Species~., data=training ,family=binomial)
logistic_probs = data.frame(probs = predict(model_logistic, testing, type="response"))
predictions_logistic = logistic_probs %>%
mutate(class = ifelse(probs>.5, "virginica", "versicolor"))
table(testing$Species, predictions_logistic$class)
##
## versicolor virginica
## versicolor 19 1
## virginica 0 18
For the rmowners data
library(tidyverse)
library(MASS)
set.seed(47)
data <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination/rmowners.csv")
data$Owner<-as.factor(data$Owner)
ind<-sample(x=2, size = nrow(data), replace = TRUE, prob = c(0.6, 0.4))
train<-data[ind==1, ]
test<-data[ind==2, ]
linear<-lda(Owner~., data = train)
p<-predict(linear, test)$class
table(Predicted=p, Actual=test$Owner)
## Actual
## Predicted No Yes
## No 1 2
## Yes 0 4
#logistic
model<-glm(Owner~., data = train, family = binomial)
pred<-data.frame(prob=predict(model, test, type = "response"))
prob_log<-pred %>%
mutate(class=ifelse(prob>.5, "Yes", "No"))
table(Predicted=prob_log$class, Actual=test$Owner)
## Actual
## Predicted No Yes
## No 1 1
## Yes 0 5
From the combined assignment dataset
setwd("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/class_4_Discrimination")
load("assignment_classification (1).Rdata")
library(tidyverse)
dat<- rmowners1 %>%
group_split(Owner)
dat
## <list_of<
## tbl_df<
## Income : double
## Lot.size: double
## Owner : character
## >
## >[2]>
## [[1]]
## # A tibble: 12 × 3
## Income Lot.size Owner
## <dbl> <dbl> <chr>
## 1 139. 19.6 No
## 2 109. 20.8 No
## 3 91.8 17.2 No
## 4 108. 20.4 No
## 5 95.6 17.6 No
## 6 89.2 17.6 No
## 7 113. 16 No
## 8 92.9 18.4 No
## 9 110. 16.4 No
## 10 88.2 18.8 No
## 11 95.3 14 No
## 12 83.4 14.8 No
##
## [[2]]
## # A tibble: 12 × 3
## Income Lot.size Owner
## <dbl> <dbl> <chr>
## 1 139. 18.4 Yes
## 2 109. 16.8 Yes
## 3 91.8 21.6 Yes
## 4 108. 20.8 Yes
## 5 95.6 23.6 Yes
## 6 89.2 19.2 Yes
## 7 113. 17.6 Yes
## 8 92.9 22.4 Yes
## 9 110. 20 Yes
## 10 88.2 20.8 Yes
## 11 95.3 22 Yes
## 12 83.4 20 Yes
dat_own <- dat[[1]]
no<-dat_own[, 1:2]
no
dat_nown <- dat[[2]]
yes<-dat_nown[, 1:2]
yes
x1_bar<-as.numeric(apply(no, 2, mean))
x2_bar<-as.numeric(apply(yes, 2, mean))
s1<-matrix(as.numeric(var(no)), nrow = 2, byrow = TRUE)
s2<-matrix(as.numeric(var(yes)), nrow = 2, byrow = TRUE)
n1<-nrow(no)
n2<-nrow(yes)
sp<- solve((((n1-1)*s1)+((n2-1)*s2))/(n1+n2-2))
sp
## [,1] [,2]
## [1,] 0.004252256 0.001586455
## [2,] 0.001586455 0.234601243
a<- t(x1_bar-x2_bar) %*% sp
a
## [,1] [,2]
## [1,] -0.004177664 -0.6177833
D<- 0.5 * a %*% (x1_bar + x2_bar)
D
## [,1]
## [1,] -12.13021
# dat1 and dat2 presumably come from the .Rdata file loaded above
table(as.numeric(a %*% t(dat1)) > as.numeric(D))
##
## FALSE TRUE
## 49 1
table(as.numeric(a %*% t(dat2)) > as.numeric(D))
##
## FALSE TRUE
## 26 24
The solution pdf is Here.
Required Packages
library(tidyverse)
library(knitr)
library(readr)
library(MASS)
library(psych)
library(CCA)
library(CCP)
Ans to the ques no 1
## 1 (a)
library(readr)
dat <- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/Incourse/gardenerdata1.csv")
dat1<- dat %>% group_split(gardener)
no<- dat1[[1]]
yes<- dat1[[2]]
yess<- yes[,-3]
noo<- no[,-3]
x1_bar <- as.numeric(apply(yess,2,mean))
x2_bar <- as.numeric(apply(noo,2,mean))
s1<- matrix(as.numeric(var(yess)),nrow=2,byrow=T)
s2<- matrix(as.numeric(var(noo)),nrow=2,byrow=T)
k<- 0.5 * log(det(s1) / det(s2)) + 0.5 * ((t(x1_bar) %*% solve(s1) %*% x1_bar)-
(t(x2_bar) %*% solve(s2) %*% x2_bar))
## 1st part
- 0.5 * (solve(s1) - solve(s2))
## [,1] [,2]
## [1,] 0.0009397948 -0.003089851
## [2,] -0.0030898514 -0.022789795
##2nd part
t(x1_bar) %*% solve(s1) - t(x2_bar) %*% solve(s2)
## [,1] [,2]
## [1,] 0.0379567 2.258857
## 1(b)
x1<- dat$Income
x2<- dat$Garden_Size
## classification rule
0.009 * x1^2 - 0.023 * x2^2 -0.006 * x1*x2 + 0.037 * x1 + 2.25 * x2 > 36.14
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
table(0.009 * x1^2 - 0.023 * x2^2 -0.006 * x1*x2 + 0.037 * x1 + 2.25 * x2 > 36.14)
##
## TRUE
## 24
Ans to the ques no 2
##a
library(MASS)
data("Pima.tr")
b<- Pima.tr
ind<- sample(2,size = nrow(b),replace = T,prob = c(0.8,0.2))
table(ind)
## ind
## 1 2
## 150 50
training<- b[ind == 1,]
testing<- b[ind == 2,]
head(training)
head(testing)
##b
linear <- lda(type~.,data=training)
linear
## Call:
## lda(type ~ ., data = training)
##
## Prior probabilities of groups:
## No Yes
## 0.7066667 0.2933333
##
## Group means:
## npreg glu bp skin bmi ped age
## No 2.830189 113.1604 69.34906 27.62264 31.20000 0.4260566 29.57547
## Yes 4.909091 142.8182 76.00000 31.70455 34.53182 0.4981136 38.79545
##
## Coefficients of linear discriminants:
## LD1
## npreg 0.106500509
## glu 0.023126807
## bp 0.008133495
## skin -0.016139760
## bmi 0.073329163
## ped 1.229751268
## age 0.029787738
p1<- predict(object = linear,testing)$class
tab<- table(actual=testing$type, predicted= p1)
err<- (1-sum(diag(tab))/sum(tab))*100
err
## [1] 26
##c
quad <- qda(type~.,data=training)
p2<- predict(object = quad,testing)$class
tab2<- table(actual=testing$type, predicted= p2)
tab2
## predicted
## actual No Yes
## No 23 3
## Yes 11 13
error<- (1-sum(diag(tab2))/sum(tab2))*100
error
## [1] 28
## d
model_logistic <- glm(type~.,family = "binomial",data = training )
logistic_prob<- data.frame(prob=predict(model_logistic,newdata = testing,type = "response"))
predicted<-logistic_prob %>%
mutate(class= if_else(prob>0.5,"Yes","No"))
tab3<- table(actual=testing$type, predicted= predicted$class)
errorr<- (1-sum(diag(tab3))/sum(tab3))*100
errorr
## [1] 26
Ans to the ques no 3
## a
mm<- read_csv("G:/My Drive/Turin/4th Year/AST 430/2nd_inc/Incourse/mmreg.csv")
psy<- mm[1:3]
aca<- mm[4:8]
cc1<- cc(psy,aca)
cc1$cor
## [1] 0.4640861 0.1675092 0.1039911
##b
cc1$xcoef
## [,1] [,2] [,3]
## locus_of_control -1.2538339 -0.6214776 -0.6616896
## self_concept 0.3513499 -1.1876866 0.8267210
## motivation -1.2624204 2.0272641 2.0002283
cc1$ycoef
## [,1] [,2] [,3]
## read -0.044620600 -0.004910024 0.021380576
## write -0.035877112 0.042071478 0.091307329
## math -0.023417185 0.004229478 0.009398182
## science -0.005025152 -0.085162184 -0.109835014
## female -0.632119234 1.084642326 -1.794647036
##c
a<-comput(X = psy, Y = aca, cc(psy,aca) )
a[3:6]
## $corr.X.xscores
## [,1] [,2] [,3]
## locus_of_control -0.90404631 -0.3896883 -0.1756227
## self_concept -0.02084327 -0.7087386 0.7051632
## motivation -0.56715106 0.3508882 0.7451289
##
## $corr.Y.xscores
## [,1] [,2] [,3]
## read -0.3900402 -0.06010654 0.01407661
## write -0.4067914 0.01086075 0.02647207
## math -0.3545378 -0.04990916 0.01536585
## science -0.3055607 -0.11336980 -0.02395489
## female -0.1689796 0.12645737 -0.05650916
##
## $corr.X.yscores
## [,1] [,2] [,3]
## locus_of_control -0.419555307 -0.06527635 -0.01826320
## self_concept -0.009673069 -0.11872021 0.07333073
## motivation -0.263206910 0.05877699 0.07748681
##
## $corr.Y.yscores
## [,1] [,2] [,3]
## read -0.8404480 -0.35882541 0.1353635
## write -0.8765429 0.06483674 0.2545608
## math -0.7639483 -0.29794884 0.1477611
## science -0.6584139 -0.67679761 -0.2303551
## female -0.3641127 0.75492811 -0.5434036
##d
rho<- cc1$cor
rho
## [1] 0.4640861 0.1675092 0.1039911
nrow(mm)
## [1] 600
library(CCP)
p.asym(rho = rho,N = 600,p = 3,q = 5,tstat = "Wilks")
## Wilks' Lambda, using F-approximation (Rao's F):
## stat approx df1 df2 p.value
## 1 to 3: 0.7543611 11.715733 15 1634.653 0.000000000
## 2 to 3: 0.9614300 2.944459 8 1186.000 0.002905057
## 3 to 3: 0.9891858 2.164612 3 594.000 0.091092180
Related pdf…
library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
df <- USArrests
df <- na.omit(df)
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# Dissimilarity matrix
d <- dist(df, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
#abline(h = 2)
# Agglomerative coefficient
coef.hclust(hc1)
## [1] 0.8531583
# Compute with agnes: Agglomerative Nesting
hc2 <- agnes(df, method = "complete")
# Agglomerative coefficient
hc2$ac
## [1] 0.8531583
# DIvisive ANAlysis (diana) clustering
# compute divisive hierarchical clustering
hc4 <- diana(df)
# Divisive coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8514345
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Ward's method
hc5 <- hclust(d, method = "ward.D2" )
# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 4)
# Number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3 4
## 7 12 19 12
plot(hc5, cex = 0.6)
rect.hclust(hc5, k = 4, border = 2:5)
fviz_cluster(list(data = df, cluster = sub_grp))
####### K means clustering
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 46.7 56.1
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 20 30
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
k2
## K-means clustering with 2 clusters of sizes 20, 30
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.004934 1.0138274 0.1975853 0.8469650
## 2 -0.669956 -0.6758849 -0.1317235 -0.5646433
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 46.74796 56.11445
## (between_SS / total_SS = 47.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(k2, data = df)
df %>%
as_tibble() %>%
mutate(cluster = k2$cluster,
state = row.names(USArrests)) %>%
ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
geom_text()
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
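To choose k less arbitrarily, factoextra also provides an elbow plot of the total within-cluster sum of squares; a minimal sketch:
# Elbow method: plot total within-cluster SS across k = 1..10 and look
# for the bend.
fviz_nbclust(df, kmeans, method = "wss")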
The Slide is Here.