library(MASS)
library(expm)
## Loading required package: Matrix
##
## Attaching package: 'expm'
## The following object is masked from 'package:Matrix':
##
## expm
fin <- read.csv("C:/Users/riema/Downloads/T11-4.csv", header=FALSE)
X <- as.matrix(fin[,1:4])
# Center the columns
Xc <- scale(X, center = TRUE, scale = FALSE)
# SVD
sv <- svd(Xc)
Z <- sv$u %*% diag(sv$d)
plot(Z[,1], Z[,2],
pch = 19,
xlab = "Z1 (1st SVD component)",
ylab = "Z2 (2nd SVD component)",
main = "SVD: First Two Transformed Variables")
grid()
It does not look strongly multivariate normal due to its asymmetry and extreme points.
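As a numeric complement to the plot, the singular values give each component's share of the total variance; a quick sketch using the sv object above:
# Variance share of each SVD component of the centered data
# (the same decomposition PCA of Xc would give)
prop_var <- sv$d^2 / sum(sv$d^2)
round(prop_var, 3)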
mu <- colMeans(X)
mu
## V1 V2 V3 V4
## 0.096304348 -0.006956522 2.033478261 0.431739130
sigma <- cov(X)
sigma
## V1 V2 V3 V4
## V1 0.068183816 0.027767053 0.14996870 -0.002566763
## V2 0.027767053 0.015363865 0.05878251 0.001252367
## V3 0.149968696 0.058782512 1.01309874 0.028607150
## V4 -0.002566763 0.001252367 0.02860715 0.033912464
# Squared Mahalanobis distance for each of the first four observations
d <- NULL
for (i in 1:4){
d <- c(d, t(X[i,] - mu) %*% solve(sigma) %*% (X[i,] - mu))
}
d
## [1] 12.726465 11.224681 1.692702 1.347885
p <- ((1:4)-0.5)/4
p
## [1] 0.125 0.375 0.625 0.875
chsq <- qchisq(p, df = 4, lower.tail = TRUE)
plot(sort(d), chsq)
abline(a = 0, b = 1, lwd = 2)
This does not appear to be multivariate normal: there is a large gap between the points and they do not fall along a relatively straight line.
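The loop above only covers the first four observations; as a sketch, the same diagnostic over all rows can be built with the base mahalanobis() function:
# Squared Mahalanobis distances for every observation at once
d_all <- mahalanobis(X, center = mu, cov = sigma)
p_all <- (seq_len(nrow(X)) - 0.5) / nrow(X)
qqplot(qchisq(p_all, df = 4), sort(d_all),
xlab = "Chi-square quantiles (df = 4)",
ylab = "Ordered squared distances")
abline(a = 0, b = 1, lwd = 2)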
library(mvtnorm)
id <- sample(1:nrow(fin), size=floor(0.8*nrow(fin)))
first <- fin[id,]
secn <- fin[-id,]
first.x <- first[,-5]
# Now estimate the two group means and covariances (mu0, mu1, sigma0, sigma1) from the TRAINING data
xbars <- lapply(split(first.x, first[,5]), colMeans)
xbars
## $`0`
## V1 V2 V3 V4
## -0.06857143 -0.08071429 1.35928571 0.47285714
##
## $`1`
## V1 V2 V3 V4
## 0.26272727 0.05909091 2.63363636 0.42454545
Ses <- lapply(split(first.x, first[,5]), cov)
Ses
## $`0`
## V1 V2 V3 V4
## V1 0.017767033 0.014508791 0.03658571 0.004049451
## V2 0.014508791 0.013160989 0.03261484 0.006471429
## V3 0.036585714 0.032614835 0.19874560 0.047671429
## V4 0.004049451 0.006471429 0.04767143 0.050545055
##
## $`1`
## V1 V2 V3 V4
## V1 0.044516017 0.0081597403 0.076203896 -0.0061034632
## V2 0.008159740 0.0024086580 0.008732035 0.0005995671
## V3 0.076203896 0.0087320346 1.177319481 0.0357731602
## V4 -0.006103463 0.0005995671 0.035773160 0.0287116883
# Density of each held-out observation under each group's estimated normal
D <- sapply(1:2, function(g){
dmvnorm(secn[,1:4], xbars[[g]], Ses[[g]])
}
)
D
## [,1] [,2]
## 1 3.280255e-01 6.592180e-35
## 2 1.738087e-12 3.299116e-14
## 3 1.015295e+01 7.384961e+00
## 5 5.057953e+01 2.239037e-03
## 9 2.130736e+01 5.045240e+00
## 16 4.908495e-06 1.053357e+01
## 20 1.026040e+00 2.307014e-02
## 29 5.955514e-01 6.521291e+00
## 30 2.153007e-01 1.429793e+01
## 33 2.977523e-03 4.631060e+00
# D's columns follow the split() level order, so map back through names(xbars)
predicted <- as.numeric(names(xbars))[apply(D, 1, which.max)]
predicted
## [1] 0 0 0 0 0 1 0 1 1 1
table(predicted=predicted, actual=secn$V5)
## actual
## predicted 0 1
## 0 6 0
## 1 1 3
mean(predicted == secn$V5)
## [1] 0.9
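Comparing raw densities implicitly assumes equal priors; a sketch of the same rule weighted by the training-group proportions, reusing the objects above:
# Training-proportion priors, in the same (sorted) order as xbars
pr <- as.numeric(table(first$V5)) / nrow(first)
# Score each held-out row by prior * density and take the larger one
Dp <- sapply(1:2, function(g) pr[g] * dmvnorm(secn[, 1:4], xbars[[g]], Ses[[g]]))
as.numeric(names(xbars))[apply(Dp, 1, which.max)]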
first$V5 <- factor(first$V5,
levels = c(0, 1),
labels = c("Bankrupt", "Not_Bankrupt"))
Y <- model.matrix(~V5-1, data=first)
X <- model.matrix(~V1+V2+V3+V4, data=first)
Beta <- solve(t(X) %*% X) %*% t(X)%*%Y
Beta
## V5Bankrupt V5Not_Bankrupt
## (Intercept) 0.5471378 0.4528622
## V1 -0.2587195 0.2587195
## V2 -1.8322060 1.8322060
## V3 -0.1429223 0.1429223
## V4 0.4299670 -0.4299670
test.X <- model.matrix(~V1+V2+V3+V4, data=secn)
test.X
## (Intercept) V1 V2 V3 V4
## 1 1 -0.45 -0.41 1.09 0.45
## 2 1 -0.56 -0.31 1.51 0.16
## 3 1 0.06 0.02 1.01 0.40
## 5 1 -0.10 -0.09 1.56 0.67
## 9 1 0.07 -0.01 1.37 0.34
## 16 1 0.37 0.11 1.99 0.38
## 20 1 0.12 0.11 1.14 0.17
## 29 1 -0.02 0.02 2.05 0.35
## 30 1 0.22 0.08 2.35 0.40
## 33 1 -0.10 -0.01 2.50 0.58
## attr(,"assign")
## [1] 0 1 2 3 4
predict1 <- test.X%*%Beta
predict1
## V5Bankrupt V5Not_Bankrupt
## 1 1.4524659 -0.4524659
## 2 1.1129866 -0.1129866
## 3 0.5226058 0.4773942
## 5 0.8030274 0.1969726
## 9 0.4977347 0.5022653
## 16 0.1288410 0.8711590
## 20 0.2247117 0.7752883
## 29 0.3731658 0.6268342
## 30 0.1797624 0.8202376
## 33 0.4834069 0.5165931
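Because each row of the indicator matrix Y sums to 1 and the design contains an intercept, the two scores in every row of predict1 sum to exactly 1, so the rule amounts to thresholding a single column; a quick check:
# Every row of fitted scores sums to one
rowSums(predict1)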
# Beta's columns are in factor-level order: 0 (Bankrupt), then 1 (Not_Bankrupt)
predictgr1 <- c(0, 1)[apply(predict1, 1, which.max)]
table(secn$V5, predictgr1)
## predictgr1
## 0 1
## 0 4 3
## 1 0 3
mean(predictgr1 == secn$V5)
## [1] 0.7
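For comparison, a related linear rule is available through MASS's lda() (already loaded), fit on the same split; a sketch:
# Fisher LDA on the training rows, evaluated on the held-out rows
lda_fit <- lda(V5 ~ V1 + V2 + V3 + V4, data = first)
lda_pred <- predict(lda_fit, newdata = secn)$class
table(Predicted = lda_pred, Actual = secn$V5)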
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
names(fin)
## [1] "V1" "V2" "V3" "V4" "V5"
table(fin$V5)
##
## 0 1
## 21 25
id <- sample(1:nrow(fin), size=floor(0.8*nrow(fin)))
first <- fin[id,]
secn <- fin[-id,]
model <- glm(V5 ~ V1 + V2 + V3 + V4, data = first, family = "binomial")
summary(model)
##
## Call:
## glm(formula = V5 ~ V1 + V2 + V3 + V4, family = "binomial", data = first)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.818 2.403 -2.005 0.0449 *
## V1 6.202 7.460 0.831 0.4058
## V2 -2.263 14.968 -0.151 0.8798
## V3 2.955 1.305 2.264 0.0236 *
## V4 -2.259 2.932 -0.771 0.4410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 49.795 on 35 degrees of freedom
## Residual deviance: 25.950 on 31 degrees of freedom
## AIC: 35.95
##
## Number of Fisher Scoring iterations: 6
# Fitted probabilities on the training data, thresholded at 0.5
pred <- predict(model, type = "response")
first$pred <- pred
pred_classification <- ifelse(first$pred > 0.5, 1, 0)
table(first$V5, pred_classification)
## pred_classification
## 0 1
## 0 14 3
## 1 1 18
mean(first$V5==pred_classification)
## [1] 0.8888889
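The table above is resubstitution (training) accuracy; a sketch of the held-out accuracy on secn from the same split:
# Predicted probabilities and classes for the 20% held-out rows
test_pred <- predict(model, newdata = secn, type = "response")
mean(ifelse(test_pred > 0.5, 1, 0) == secn$V5)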
library(Morpho)
crude <- read.csv("C:/Users/riema/Downloads/T11-7.csv", header=FALSE)
X <- as.matrix(crude[,1:5])
id <- sample(1:nrow(crude), size=floor(0.8*nrow(crude)))
train <- crude[id,]
test <- crude[-id,]
train.x <- train[, -6]
xbar <- lapply(split(train.x, train[,6]), colMeans)
xbar
## $SubMuli
## V1 V2 V3 V4 V5
## 4.38750 37.50000 0.16375 6.82875 6.30000
##
## $Upper
## V1 V2 V3 V4 V5
## 7.4206897 22.3793103 0.4534483 4.6475862 6.1024138
##
## $Wilhelm
## V1 V2 V3 V4 V5
## 3.2285714 43.5714286 0.1171429 6.7957143 11.5400000
S <- lapply(split(train.x, train[,6]), cov)
S
## $SubMuli
## V1 V2 V3 V4 V5
## V1 1.7526786 0.7214286 0.07762500 1.0612679 2.62785714
## V2 0.7214286 42.2857143 -0.28500000 1.8835714 -10.18000000
## V3 0.0776250 -0.2850000 0.02716964 0.1810911 0.04558571
## V4 1.0612679 1.8835714 0.18109107 1.9461839 0.68214286
## V5 2.6278571 -10.1800000 0.04558571 0.6821429 13.72600000
##
## $Upper
## V1 V2 V3 V4 V5
## V1 3.1066995 -5.354557 -0.1568596 -0.75059113 1.30541256
## V2 -5.3545567 79.172414 1.1825739 1.84523399 -9.34023399
## V3 -0.1568596 1.182574 0.1194520 0.21238362 -0.13286576
## V4 -0.7505911 1.845234 0.2123836 1.06340468 0.06366318
## V5 1.3054126 -9.340234 -0.1328658 0.06366318 4.78170468
##
## $Wilhelm
## V1 V2 V3 V4 V5
## V1 0.289047619 1.6642857 0.005428571 0.10930952 0.14533333
## V2 1.664285714 37.2857143 -0.014761905 3.34785714 6.25833333
## V3 0.005428571 -0.0147619 0.010023810 0.03896905 0.04241667
## V4 0.109309524 3.3478571 0.038969048 0.76029524 1.15885000
## V5 0.145333333 6.2583333 0.042416667 1.15885000 2.00186667
pools <- covW(train.x, train[,6])
## Warning in covW(train.x, train[, 6]): groups coerced to factors
pools
## V1 V2 V3 V4 V5
## V1 2.46318590 -3.2900457 -0.09307616 -0.3154102 1.36142809
## V2 -3.29004566 66.7449237 0.75679262 2.0716755 -7.20089151
## V3 -0.09307616 0.7567926 0.08768257 0.1816632 -0.07674735
## V4 -0.31541023 2.0716755 0.18166325 1.1697656 0.32952851
## V5 1.36142809 -7.2008915 -0.07674735 0.3295285 5.90197393
## attr(,"means")
## V1 V2 V3 V4 V5
## SubMuli 4.387500 37.50000 0.1637500 6.828750 6.300000
## Upper 7.420690 22.37931 0.4534483 4.647586 6.102414
## Wilhelm 3.228571 43.57143 0.1171429 6.795714 11.540000
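Assuming covW pools with the usual n - k divisor, the same matrix can be rebuilt from the per-group covariances as a check:
# Pooled covariance as the (n_g - 1)-weighted average of the group covariances
ng <- table(train[, 6])
W_hand <- Reduce(`+`, Map(function(Sg, m) (m - 1) * Sg, S, as.numeric(ng))) /
(sum(ng) - length(ng))
all.equal(W_hand, pools[1:5, 1:5], check.attributes = FALSE)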
set.seed(1)
n <- nrow(crude)
id <- sample(seq_len(n), size = floor(0.7 * n))
train <- crude[id, ]
test <- crude[-id, ]
Xtrain <- train[, 1:5]
Xtest <- test[, 1:5]
p <- ncol(Xtrain)
xbar <- lapply(split(Xtrain, train$V6), colMeans)
xbarO <- colMeans(Xtrain)
xbarO
## V1 V2 V3 V4 V5
## 6.1487179 27.3743590 0.3220513 5.1741026 6.4717949
S <- lapply(split(Xtrain, train$V6), cov)
S
## $SubMuli
## V1 V2 V3 V4 V5
## V1 1.43476190 12.8928571 0.08159524 0.8786190 0.18364286
## V2 12.89285714 121.2857143 0.48119048 5.5935714 2.08095238
## V3 0.08159524 0.4811905 0.03059524 0.2148357 -0.06494048
## V4 0.87861905 5.5935714 0.21483571 1.9840571 -0.47922143
## V5 0.18364286 2.0809524 -0.06494048 -0.4792214 8.28634762
##
## $Upper
## V1 V2 V3 V4 V5
## V1 4.8285755 -2.5124501 -0.28048860 -0.9253718 1.80571368
## V2 -2.5124501 90.5580627 0.27167806 0.2811282 -9.36417094
## V3 -0.2804886 0.2716781 0.08126268 0.1319244 0.04753291
## V4 -0.9253718 0.2811282 0.13192436 0.9235538 0.62692692
## V5 1.8057137 -9.3641709 0.04753291 0.6269269 5.69489487
##
## $Wilhelm
## V1 V2 V3 V4 V5
## V1 0.34200 3.1500 0.002250 0.158100 0.25810
## V2 3.15000 44.0000 0.077500 3.405000 7.09000
## V3 0.00225 0.0775 0.014200 0.071225 0.08215
## V4 0.15810 3.4050 0.071225 0.761330 1.28158
## V5 0.25810 7.0900 0.082150 1.281580 2.42833
n_g <- table(train$V6)
k <- length(unique(train$V6))
# Between-groups SSP matrix: sum over groups of n_g (xbar_g - xbar)(xbar_g - xbar)'
B <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
n_g[i] * tcrossprod(xbar[[i]] - xbarO)
})
)
B
## [,1] [,2] [,3] [,4] [,5]
## [1,] 129.537902 -461.68472 8.9792348 -59.853242 -56.047223
## [2,] -461.684721 1820.27044 -31.7327212 214.291341 337.598525
## [3,] 8.979235 -31.73272 0.6228348 -4.147376 -3.672056
## [4,] -59.853242 214.29134 -4.1473758 27.660681 26.660921
## [5,] -56.047223 337.59853 -3.6720563 26.660921 132.956302
# Within-groups SSP matrix: sum over groups of (n_g - 1) S_g
W <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
(n_g[i] - 1) * S[[i]]
})
)
W
## V1 V2 V3 V4 V5
## V1 135.519534 24.63344 -6.794132 -18.155552 49.082813
## V2 24.633439 3258.22392 10.260772 54.490762 -202.622730
## V3 -6.794132 10.26077 2.353201 5.003948 1.174813
## V4 -18.155552 54.49076 5.003948 38.962063 18.551091
## V5 49.082813 -202.62273 1.174813 18.551091 207.498672
E <- eigen(solve(W) %*% B)
E
## eigen() decomposition
## $values
## [1] 4.962526e+00 6.748817e-01 -1.128130e-16 8.165398e-17 -9.711466e-18
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.12780662 0.01018909 0.11710917 0.173494411 -0.1002438799
## [2,] 0.01697638 -0.02091672 -0.01601134 0.006783666 0.0008024519
## [3,] -0.98245418 -0.90196511 -0.96985993 -0.967136338 0.9923531636
## [4,] 0.11506721 0.37145893 0.21206556 0.185571531 -0.0719700475
## [5,] 0.07014690 -0.21894263 0.02071211 -0.008011376 -0.0024559439
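Only min(p, k - 1) = 2 of the eigenvalues are nonzero up to floating-point noise, so two discriminants carry all of the between-group separation; a quick check:
# Number of numerically nonzero eigenvalues (usable discriminants)
sum(abs(E$values) > 1e-8)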
newobs <- data.frame(V1 = 9.50, V2 = 17.00, V3 = 0.50, V4 = 3.52, V5 = 5.71)
# Project the new observation onto the two discriminants and assign the group
# whose projected mean is nearest
ynew <- as.matrix(newobs) %*% E$vectors[, 1:2]
ybar <- lapply(split(as.data.frame(as.matrix(Xtrain) %*% E$vectors[, 1:2]),
train$V6), colMeans)
names(ybar)[which.min(sapply(ybar, function(m) sum((ynew - m)^2)))]
library(class)
set.seed(123)
trainsc <- scale(Xtrain[,1:5])
# Scale the test set with the training means and SDs (not its own)
testsc <- scale(Xtest[,1:5],
center = attr(trainsc, "scaled:center"),
scale = attr(trainsc, "scaled:scale"))
trainlab <- train$V6
testlab <- test$V6
# Rule-of-thumb k is near sqrt(n); k = 5 is used below
nc <- nrow(trainsc)
sqrt(nc)
## [1] 6.244998
predictions <- knn(train = trainsc,
test = newobs,
cl = trainlab,
k = 5)
predictions
## [1] Wilhelm
## Levels: SubMuli Upper Wilhelm
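Note that trainsc is standardized while newobs is passed to knn on the original measurement scale; a sketch that puts the query on the training scale first:
# Standardize the new observation with the training means and SDs
newsc <- scale(newobs,
center = attr(trainsc, "scaled:center"),
scale = attr(trainsc, "scaled:scale"))
knn(train = trainsc, test = newsc, cl = trainlab, k = 5)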
gpa <- read.csv("C:/Users/riema/Downloads/T11-6.csv", header=FALSE)
X <- as.matrix(gpa[,1:2])
id <- sample(1:nrow(gpa), size=floor(0.8*nrow(gpa)))
train <- gpa[id,]
test <- gpa[-id,]
train.x <- train[, -3]
xbar <- lapply(split(train.x, train[,3]), colMeans)
xbar
## $`1`
## V1 V2
## 3.433462 570.115385
##
## $`2`
## V1 V2
## 2.460455 448.590909
##
## $`3`
## V1 V2
## 3.005 446.550
S <- lapply(split(train.x, train[,3]), cov)
S
## $`1`
## V1 V2
## V1 0.04150354 -0.3888154
## V2 -0.38881538 4410.1861538
##
## $`2`
## V1 V2
## V1 0.0261855 1.273528
## V2 1.2735281 4098.158009
##
## $`3`
## V1 V2
## V1 0.02998421 -5.234474
## V2 -5.23447368 2368.681579
pools <- covW(train.x, train[,3])
## Warning in covW(train.x, train[, 3]): groups coerced to factors
pools
## V1 V2
## V1 0.03318744 -1.268174
## V2 -1.26817375 3712.629570
## attr(,"means")
## V1 V2
## 1 3.433462 570.1154
## 2 2.460455 448.5909
## 3 3.005000 446.5500
set.seed(1)
n <- nrow(gpa)
id1 <- sample(seq_len(n), size = floor(0.7 * n))
train <- gpa[id1,]
test <- gpa[-id1,]
Xtrain <- train[, 1:2]
Xtest <- test[, 1:2]
p <- ncol(Xtrain)
xbar <- lapply(split(Xtrain, train$V3), colMeans)
xbarO <- colMeans(Xtrain)
xbarO
## V1 V2
## 2.891695 477.016949
S <- lapply(split(Xtrain, train$V3), cov)
S
## $`1`
## V1 V2
## V1 0.03957574 0.6380882
## V2 0.63808824 3477.4705882
##
## $`2`
## V1 V2
## V1 0.03309112 -2.007663
## V2 -2.00766304 3813.027174
##
## $`3`
## V1 V2
## V1 0.03711634 -6.913856
## V2 -6.91385621 2867.006536
n_g <- table(train$V3)
k <- length(unique(train$V3))
B <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
n_g[i] * tcrossprod(xbar[[i]] - xbarO)
})
)
W <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
(n_g[i] - 1) * S[[i]]
})
)
E <- eigen(solve(W) %*% B)
E
## eigen() decomposition
## $values
## [1] 5.4622522 0.2072421
##
## $vectors
## [,1] [,2]
## [1,] 0.999998110 -0.999969167
## [2,] 0.001944019 0.007852673
r <- k - 1
# Project the training and test data onto the r discriminants
Ytrain <- as.data.frame(as.matrix(Xtrain) %*% E$vectors[, 1:r])
ybar <- lapply(split(Ytrain, train$V3), colMeans)
Ytest <- as.matrix(Xtest) %*% E$vectors[, 1:r]
# Squared distance from each projected test point to each group's projected mean
D <- sapply(seq_len(k), function(j) {
rowSums((Ytest - matrix(ybar[[j]], nrow(Ytest), r, byrow = TRUE))^2)
})
predicted_group <- unique(train$V3)[apply(D, 1, which.min)]
table(Predicted = predicted_group, Actual = test$V3)
## Actual
## Predicted 1 2 3
## 1 3 1 7
## 2 0 3 1
## 3 11 0 0
mean(predicted_group==test$V3)
## [1] 0.2307692
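The low accuracy here is at least partly a labelling artifact: the columns of D follow the sorted level order that split() used for ybar, while unique(train$V3) follows the shuffled row order, so the predicted labels can come out permuted. A sketch of the aligned mapping:
# Map each minimizing column back through the split() level names
predicted_group <- as.numeric(names(ybar))[apply(D, 1, which.min)]
table(Predicted = predicted_group, Actual = test$V3)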
library(heplots)
## Loading required package: broom
X <- gpa[, 1:2]
group <- gpa$V3
box_test <- heplots::boxM(X, group)
box_test
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X by group
## Chi-Sq (approx.) = 16.0745, df = 6, p-value = 0.01336
The p-value is less than 0.05, so we reject equality of the group covariance matrices; LDA’s common-covariance assumption does not hold, and QDA is the more appropriate choice.
qda_model <- qda(V3 ~ ., data = gpa)
qda_model
## Call:
## qda(V3 ~ ., data = gpa)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3647059 0.3294118 0.3058824
##
## Group means:
## V1 V2
## 1 3.403871 561.2258
## 2 2.482500 447.0714
## 3 2.992692 446.2308
predictions <- predict(qda_model, gpa)
conf_matrix <- table(Predicted = predictions$class, Actual = gpa$V3)
conf_matrix
## Actual
## Predicted 1 2 3
## 1 30 0 1
## 2 0 27 0
## 3 1 1 25
mean(predictions$class == gpa$V3)
## [1] 0.9647059
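The 0.96 above is resubstitution accuracy on the full data set; qda’s built-in leave-one-out option gives a less optimistic estimate:
# Leave-one-out cross-validated QDA
qda_cv <- qda(V3 ~ ., data = gpa, CV = TRUE)
mean(qda_cv$class == gpa$V3)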