library(MASS)
library(expm)
## Loading required package: Matrix
##
## Attaching package: 'expm'
## The following object is masked from 'package:Matrix':
##
## expm
fin <- read.csv("C:/Users/riema/Downloads/T11-4.csv", header=FALSE)
X <- as.matrix(fin[,1:4])
# Center the columns
Xc <- scale(X, center = TRUE, scale = FALSE)
# SVD
sv <- svd(Xc)
Z <- sv$u %*% diag(sv$d)
plot(Z[,1], Z[,2],
pch = 19,
xlab = "Z1 (1st SVD component)",
ylab = "Z2 (2nd SVD component)",
main = "SVD: First Two Transformed Variables")
grid()
It does not look strongly multivariate normal due to its asymmetry and extreme points.
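As a numeric complement to the plot, the singular values give each component's share of the total variance; a quick sketch using the sv object above:
# Variance share of each SVD component of the centered data
# (the same decomposition PCA of Xc would give)
prop_var <- sv$d^2 / sum(sv$d^2)
round(prop_var, 3)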
mu <- colMeans(X)
mu
## V1 V2 V3 V4
## 0.096304348 -0.006956522 2.033478261 0.431739130
sigma <- cov(X)
sigma
## V1 V2 V3 V4
## V1 0.068183816 0.027767053 0.14996870 -0.002566763
## V2 0.027767053 0.015363865 0.05878251 0.001252367
## V3 0.149968696 0.058782512 1.01309874 0.028607150
## V4 -0.002566763 0.001252367 0.02860715 0.033912464
# Squared Mahalanobis distance for each of the first four observations
d <- NULL
for (i in 1:4){
d <- c(d, t(X[i,] - mu) %*% solve(sigma) %*% (X[i,] - mu))
}
d
## [1] 12.726465 11.224681 1.692702 1.347885
p <- ((1:4)-0.5)/4
p
## [1] 0.125 0.375 0.625 0.875
chsq <- qchisq(p, df = 4, lower.tail = TRUE)
plot(sort(d), chsq)
abline(a = 0, b = 1, lwd = 2)
This does not appear to be multivariate normal: there is a large gap between the points and they do not fall along a relatively straight line.
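The loop above only covers the first four observations; as a sketch, the same diagnostic over all rows can be built with the base mahalanobis() function:
# Squared Mahalanobis distances for every observation at once
d_all <- mahalanobis(X, center = mu, cov = sigma)
p_all <- (seq_len(nrow(X)) - 0.5) / nrow(X)
qqplot(qchisq(p_all, df = 4), sort(d_all),
xlab = "Chi-square quantiles (df = 4)",
ylab = "Ordered squared distances")
abline(a = 0, b = 1, lwd = 2)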
library(mvtnorm)
id <- sample(1:nrow(fin), size=floor(0.8*nrow(fin)))
first <- fin[id,]
secn <- fin[-id,]
first.x <- first[,-5]
# Now estimate the two group means and covariances (mu0, mu1, sigma0, sigma1) from the TRAINING data
xbars <- lapply(split(first.x, first[,5]), colMeans)
xbars
## $`0`
## V1 V2 V3 V4
## -0.06857143 -0.08071429 1.35928571 0.47285714
##
## $`1`
## V1 V2 V3 V4
## 0.26272727 0.05909091 2.63363636 0.42454545
Ses <- lapply(split(first.x, first[,5]), cov)
Ses
## $`0`
## V1 V2 V3 V4
## V1 0.017767033 0.014508791 0.03658571 0.004049451
## V2 0.014508791 0.013160989 0.03261484 0.006471429
## V3 0.036585714 0.032614835 0.19874560 0.047671429
## V4 0.004049451 0.006471429 0.04767143 0.050545055
##
## $`1`
## V1 V2 V3 V4
## V1 0.044516017 0.0081597403 0.076203896 -0.0061034632
## V2 0.008159740 0.0024086580 0.008732035 0.0005995671
## V3 0.076203896 0.0087320346 1.177319481 0.0357731602
## V4 -0.006103463 0.0005995671 0.035773160 0.0287116883
# Density of each held-out observation under each group's estimated normal
D <- sapply(1:2, function(g){
dmvnorm(secn[,1:4], xbars[[g]], Ses[[g]])
}
)
D
## [,1] [,2]
## 1 3.280255e-01 6.592180e-35
## 2 1.738087e-12 3.299116e-14
## 3 1.015295e+01 7.384961e+00
## 5 5.057953e+01 2.239037e-03
## 9 2.130736e+01 5.045240e+00
## 16 4.908495e-06 1.053357e+01
## 20 1.026040e+00 2.307014e-02
## 29 5.955514e-01 6.521291e+00
## 30 2.153007e-01 1.429793e+01
## 33 2.977523e-03 4.631060e+00
# D's columns follow the split() level order, so map back through names(xbars)
predicted <- as.numeric(names(xbars))[apply(D, 1, which.max)]
predicted
## [1] 0 0 0 0 0 1 0 1 1 1
table(predicted=predicted, actual=secn$V5)
## actual
## predicted 0 1
## 0 6 0
## 1 1 3
mean(predicted == secn$V5)
## [1] 0.9
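Comparing raw densities implicitly assumes equal priors; a sketch of the same rule weighted by the training-group proportions, reusing the objects above:
# Training-proportion priors, in the same (sorted) order as xbars
pr <- as.numeric(table(first$V5)) / nrow(first)
# Score each held-out row by prior * density and take the larger one
Dp <- sapply(1:2, function(g) pr[g] * dmvnorm(secn[, 1:4], xbars[[g]], Ses[[g]]))
as.numeric(names(xbars))[apply(Dp, 1, which.max)]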
first$V5 <- factor(first$V5,
levels = c(0, 1),
labels = c("Bankrupt", "Not_Bankrupt"))
Y <- model.matrix(~V5-1, data=first)
X <- model.matrix(~V1+V2+V3+V4, data=first)
Beta <- solve(t(X) %*% X) %*% t(X)%*%Y
Beta
## V5Bankrupt V5Not_Bankrupt
## (Intercept) 0.5471378 0.4528622
## V1 -0.2587195 0.2587195
## V2 -1.8322060 1.8322060
## V3 -0.1429223 0.1429223
## V4 0.4299670 -0.4299670
test.X <- model.matrix(~V1+V2+V3+V4, data=secn)
test.X
## (Intercept) V1 V2 V3 V4
## 1 1 -0.45 -0.41 1.09 0.45
## 2 1 -0.56 -0.31 1.51 0.16
## 3 1 0.06 0.02 1.01 0.40
## 5 1 -0.10 -0.09 1.56 0.67
## 9 1 0.07 -0.01 1.37 0.34
## 16 1 0.37 0.11 1.99 0.38
## 20 1 0.12 0.11 1.14 0.17
## 29 1 -0.02 0.02 2.05 0.35
## 30 1 0.22 0.08 2.35 0.40
## 33 1 -0.10 -0.01 2.50 0.58
## attr(,"assign")
## [1] 0 1 2 3 4
predict1 <- test.X%*%Beta
predict1
## V5Bankrupt V5Not_Bankrupt
## 1 1.4524659 -0.4524659
## 2 1.1129866 -0.1129866
## 3 0.5226058 0.4773942
## 5 0.8030274 0.1969726
## 9 0.4977347 0.5022653
## 16 0.1288410 0.8711590
## 20 0.2247117 0.7752883
## 29 0.3731658 0.6268342
## 30 0.1797624 0.8202376
## 33 0.4834069 0.5165931
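Because each row of the indicator matrix Y sums to 1 and the design contains an intercept, the two scores in every row of predict1 sum to exactly 1, so the rule amounts to thresholding a single column; a quick check:
# Every row of fitted scores sums to one
rowSums(predict1)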
# Beta's columns are in factor-level order: 0 (Bankrupt), then 1 (Not_Bankrupt)
predictgr1 <- c(0, 1)[apply(predict1, 1, which.max)]
table(secn$V5, predictgr1)
## predictgr1
## 0 1
## 0 4 3
## 1 0 3
mean(predictgr1 == secn$V5)
## [1] 0.7
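For comparison, a related linear rule is available through MASS's lda() (already loaded), fit on the same split; a sketch:
# Fisher LDA on the training rows, evaluated on the held-out rows
lda_fit <- lda(V5 ~ V1 + V2 + V3 + V4, data = first)
lda_pred <- predict(lda_fit, newdata = secn)$class
table(Predicted = lda_pred, Actual = secn$V5)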
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
names(fin)
## [1] "V1" "V2" "V3" "V4" "V5"
table(fin$V5)
##
## 0 1
## 21 25
id <- sample(1:nrow(fin), size=floor(0.8*nrow(fin)))
first <- fin[id,]
secn <- fin[-id,]
model <- glm(V5 ~ V1 + V2 + V3 + V4, data = first, family = "binomial")
summary(model)
##
## Call:
## glm(formula = V5 ~ V1 + V2 + V3 + V4, family = "binomial", data = first)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.818 2.403 -2.005 0.0449 *
## V1 6.202 7.460 0.831 0.4058
## V2 -2.263 14.968 -0.151 0.8798
## V3 2.955 1.305 2.264 0.0236 *
## V4 -2.259 2.932 -0.771 0.4410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 49.795 on 35 degrees of freedom
## Residual deviance: 25.950 on 31 degrees of freedom
## AIC: 35.95
##
## Number of Fisher Scoring iterations: 6
# Fitted probabilities on the training data, thresholded at 0.5
pred <- predict(model, type = "response")
first$pred <- pred
pred_classification <- ifelse(first$pred > 0.5, 1, 0)
table(first$V5, pred_classification)
## pred_classification
## 0 1
## 0 14 3
## 1 1 18
mean(first$V5==pred_classification)
## [1] 0.8888889
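The table above is resubstitution (training) accuracy; a sketch of the held-out accuracy on secn from the same split:
# Predicted probabilities and classes for the 20% held-out rows
test_pred <- predict(model, newdata = secn, type = "response")
mean(ifelse(test_pred > 0.5, 1, 0) == secn$V5)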
library(Morpho)
crude <- read.csv("C:/Users/riema/Downloads/T11-7.csv", header=FALSE)
X <- as.matrix(crude[,1:5])
id <- sample(1:nrow(crude), size=floor(0.8*nrow(crude)))
train <- crude[id,]
test <- crude[-id,]
train.x <- train[, -6]
xbar <- lapply(split(train.x, train[,6]), colMeans)
xbar
## $SubMuli
## V1 V2 V3 V4 V5
## 4.38750 37.50000 0.16375 6.82875 6.30000
##
## $Upper
## V1 V2 V3 V4 V5
## 7.4206897 22.3793103 0.4534483 4.6475862 6.1024138
##
## $Wilhelm
## V1 V2 V3 V4 V5
## 3.2285714 43.5714286 0.1171429 6.7957143 11.5400000
S <- lapply(split(train.x, train[,6]), cov)
S
## $SubMuli
## V1 V2 V3 V4 V5
## V1 1.7526786 0.7214286 0.07762500 1.0612679 2.62785714
## V2 0.7214286 42.2857143 -0.28500000 1.8835714 -10.18000000
## V3 0.0776250 -0.2850000 0.02716964 0.1810911 0.04558571
## V4 1.0612679 1.8835714 0.18109107 1.9461839 0.68214286
## V5 2.6278571 -10.1800000 0.04558571 0.6821429 13.72600000
##
## $Upper
## V1 V2 V3 V4 V5
## V1 3.1066995 -5.354557 -0.1568596 -0.75059113 1.30541256
## V2 -5.3545567 79.172414 1.1825739 1.84523399 -9.34023399
## V3 -0.1568596 1.182574 0.1194520 0.21238362 -0.13286576
## V4 -0.7505911 1.845234 0.2123836 1.06340468 0.06366318
## V5 1.3054126 -9.340234 -0.1328658 0.06366318 4.78170468
##
## $Wilhelm
## V1 V2 V3 V4 V5
## V1 0.289047619 1.6642857 0.005428571 0.10930952 0.14533333
## V2 1.664285714 37.2857143 -0.014761905 3.34785714 6.25833333
## V3 0.005428571 -0.0147619 0.010023810 0.03896905 0.04241667
## V4 0.109309524 3.3478571 0.038969048 0.76029524 1.15885000
## V5 0.145333333 6.2583333 0.042416667 1.15885000 2.00186667
pools <- covW(train.x, train[,6])
## Warning in covW(train.x, train[, 6]): groups coerced to factors
pools
## V1 V2 V3 V4 V5
## V1 2.46318590 -3.2900457 -0.09307616 -0.3154102 1.36142809
## V2 -3.29004566 66.7449237 0.75679262 2.0716755 -7.20089151
## V3 -0.09307616 0.7567926 0.08768257 0.1816632 -0.07674735
## V4 -0.31541023 2.0716755 0.18166325 1.1697656 0.32952851
## V5 1.36142809 -7.2008915 -0.07674735 0.3295285 5.90197393
## attr(,"means")
## V1 V2 V3 V4 V5
## SubMuli 4.387500 37.50000 0.1637500 6.828750 6.300000
## Upper 7.420690 22.37931 0.4534483 4.647586 6.102414
## Wilhelm 3.228571 43.57143 0.1171429 6.795714 11.540000
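Assuming covW pools with the usual n - k divisor, the same matrix can be rebuilt from the per-group covariances as a check:
# Pooled covariance as the (n_g - 1)-weighted average of the group covariances
ng <- table(train[, 6])
W_hand <- Reduce(`+`, Map(function(Sg, m) (m - 1) * Sg, S, as.numeric(ng))) /
(sum(ng) - length(ng))
all.equal(W_hand, pools[1:5, 1:5], check.attributes = FALSE)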
set.seed(1)
n <- nrow(crude)
id <- sample(seq_len(n), size = floor(0.7 * n))
train <- crude[id, ]
test <- crude[-id, ]
Xtrain <- train[, 1:5]
Xtest <- test[, 1:5]
p <- ncol(Xtrain)
xbar <- lapply(split(Xtrain, train$V6), colMeans)
xbarO <- colMeans(Xtrain)
xbarO
## V1 V2 V3 V4 V5
## 6.1487179 27.3743590 0.3220513 5.1741026 6.4717949
S <- lapply(split(Xtrain, train$V6), cov)
S
## $SubMuli
## V1 V2 V3 V4 V5
## V1 1.43476190 12.8928571 0.08159524 0.8786190 0.18364286
## V2 12.89285714 121.2857143 0.48119048 5.5935714 2.08095238
## V3 0.08159524 0.4811905 0.03059524 0.2148357 -0.06494048
## V4 0.87861905 5.5935714 0.21483571 1.9840571 -0.47922143
## V5 0.18364286 2.0809524 -0.06494048 -0.4792214 8.28634762
##
## $Upper
## V1 V2 V3 V4 V5
## V1 4.8285755 -2.5124501 -0.28048860 -0.9253718 1.80571368
## V2 -2.5124501 90.5580627 0.27167806 0.2811282 -9.36417094
## V3 -0.2804886 0.2716781 0.08126268 0.1319244 0.04753291
## V4 -0.9253718 0.2811282 0.13192436 0.9235538 0.62692692
## V5 1.8057137 -9.3641709 0.04753291 0.6269269 5.69489487
##
## $Wilhelm
## V1 V2 V3 V4 V5
## V1 0.34200 3.1500 0.002250 0.158100 0.25810
## V2 3.15000 44.0000 0.077500 3.405000 7.09000
## V3 0.00225 0.0775 0.014200 0.071225 0.08215
## V4 0.15810 3.4050 0.071225 0.761330 1.28158
## V5 0.25810 7.0900 0.082150 1.281580 2.42833
n_g <- table(train$V6)
k <- length(unique(train$V6))
# Between-groups SSP matrix: sum over groups of n_g (xbar_g - xbar)(xbar_g - xbar)'
B <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
n_g[i] * tcrossprod(xbar[[i]] - xbarO)
})
)
B
## [,1] [,2] [,3] [,4] [,5]
## [1,] 129.537902 -461.68472 8.9792348 -59.853242 -56.047223
## [2,] -461.684721 1820.27044 -31.7327212 214.291341 337.598525
## [3,] 8.979235 -31.73272 0.6228348 -4.147376 -3.672056
## [4,] -59.853242 214.29134 -4.1473758 27.660681 26.660921
## [5,] -56.047223 337.59853 -3.6720563 26.660921 132.956302
# Within-groups SSP matrix: sum over groups of (n_g - 1) S_g
W <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
(n_g[i] - 1) * S[[i]]
})
)
W
## V1 V2 V3 V4 V5
## V1 135.519534 24.63344 -6.794132 -18.155552 49.082813
## V2 24.633439 3258.22392 10.260772 54.490762 -202.622730
## V3 -6.794132 10.26077 2.353201 5.003948 1.174813
## V4 -18.155552 54.49076 5.003948 38.962063 18.551091
## V5 49.082813 -202.62273 1.174813 18.551091 207.498672
E <- eigen(solve(W) %*% B)
E
## eigen() decomposition
## $values
## [1] 4.962526e+00 6.748817e-01 -1.128130e-16 8.165398e-17 -9.711466e-18
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.12780662 0.01018909 0.11710917 0.173494411 -0.1002438799
## [2,] 0.01697638 -0.02091672 -0.01601134 0.006783666 0.0008024519
## [3,] -0.98245418 -0.90196511 -0.96985993 -0.967136338 0.9923531636
## [4,] 0.11506721 0.37145893 0.21206556 0.185571531 -0.0719700475
## [5,] 0.07014690 -0.21894263 0.02071211 -0.008011376 -0.0024559439
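Only min(p, k - 1) = 2 of the eigenvalues are nonzero up to floating-point noise, so two discriminants carry all of the between-group separation; a quick check:
# Number of numerically nonzero eigenvalues (usable discriminants)
sum(abs(E$values) > 1e-8)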
newobs <- data.frame(V1 = 9.50, V2 = 17.00, V3 = 0.50, V4 = 3.52, V5 = 5.71)
# Project the new observation onto the two discriminants and assign the group
# whose projected mean is nearest
ynew <- as.matrix(newobs) %*% E$vectors[, 1:2]
ybar <- lapply(split(as.data.frame(as.matrix(Xtrain) %*% E$vectors[, 1:2]),
train$V6), colMeans)
names(ybar)[which.min(sapply(ybar, function(m) sum((ynew - m)^2)))]
library(class)
set.seed(123)
trainsc <- scale(Xtrain[,1:5])
# Scale the test set with the training means and SDs (not its own)
testsc <- scale(Xtest[,1:5],
center = attr(trainsc, "scaled:center"),
scale = attr(trainsc, "scaled:scale"))
trainlab <- train$V6
testlab <- test$V6
# Rule-of-thumb k is near sqrt(n); k = 5 is used below
nc <- nrow(trainsc)
sqrt(nc)
## [1] 6.244998
predictions <- knn(train = trainsc,
test = newobs,
cl = trainlab,
k = 5)
predictions
## [1] Wilhelm
## Levels: SubMuli Upper Wilhelm
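Note that trainsc is standardized while newobs is passed to knn on the original measurement scale; a sketch that puts the query on the training scale first:
# Standardize the new observation with the training means and SDs
newsc <- scale(newobs,
center = attr(trainsc, "scaled:center"),
scale = attr(trainsc, "scaled:scale"))
knn(train = trainsc, test = newsc, cl = trainlab, k = 5)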
gpa <- read.csv("C:/Users/riema/Downloads/T11-6.csv", header=FALSE)
X <- as.matrix(gpa[,1:2])
id <- sample(1:nrow(gpa), size=floor(0.8*nrow(gpa)))
train <- gpa[id,]
test <- gpa[-id,]
train.x <- train[, -3]
xbar <- lapply(split(train.x, train[,3]), colMeans)
xbar
## $`1`
## V1 V2
## 3.433462 570.115385
##
## $`2`
## V1 V2
## 2.460455 448.590909
##
## $`3`
## V1 V2
## 3.005 446.550
S <- lapply(split(train.x, train[,3]), cov)
S
## $`1`
## V1 V2
## V1 0.04150354 -0.3888154
## V2 -0.38881538 4410.1861538
##
## $`2`
## V1 V2
## V1 0.0261855 1.273528
## V2 1.2735281 4098.158009
##
## $`3`
## V1 V2
## V1 0.02998421 -5.234474
## V2 -5.23447368 2368.681579
pools <- covW(train.x, train[,3])
## Warning in covW(train.x, train[, 3]): groups coerced to factors
pools
## V1 V2
## V1 0.03318744 -1.268174
## V2 -1.26817375 3712.629570
## attr(,"means")
## V1 V2
## 1 3.433462 570.1154
## 2 2.460455 448.5909
## 3 3.005000 446.5500
set.seed(1)
n <- nrow(gpa)
id1 <- sample(seq_len(n), size = floor(0.7 * n))
train <- gpa[id1,]
test <- gpa[-id1,]
Xtrain <- train[, 1:2]
Xtest <- test[, 1:2]
p <- ncol(Xtrain)
xbar <- lapply(split(Xtrain, train$V3), colMeans)
xbarO <- colMeans(Xtrain)
xbarO
## V1 V2
## 2.891695 477.016949
S <- lapply(split(Xtrain, train$V3), cov)
S
## $`1`
## V1 V2
## V1 0.03957574 0.6380882
## V2 0.63808824 3477.4705882
##
## $`2`
## V1 V2
## V1 0.03309112 -2.007663
## V2 -2.00766304 3813.027174
##
## $`3`
## V1 V2
## V1 0.03711634 -6.913856
## V2 -6.91385621 2867.006536
n_g <- table(train$V3)
k <- length(unique(train$V3))
B <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
n_g[i] * tcrossprod(xbar[[i]] - xbarO)
})
)
W <- Reduce(
`+`,
lapply(seq_len(k), function(i) {
(n_g[i] - 1) * S[[i]]
})
)
E <- eigen(solve(W) %*% B)
E
## eigen() decomposition
## $values
## [1] 5.4622522 0.2072421
##
## $vectors
## [,1] [,2]
## [1,] 0.999998110 -0.999969167
## [2,] 0.001944019 0.007852673
r <- k - 1
# Project the training and test data onto the r discriminants
Ytrain <- as.data.frame(as.matrix(Xtrain) %*% E$vectors[, 1:r])
ybar <- lapply(split(Ytrain, train$V3), colMeans)
Ytest <- as.matrix(Xtest) %*% E$vectors[, 1:r]
# Squared distance from each projected test point to each group's projected mean
D <- sapply(seq_len(k), function(j) {
rowSums((Ytest - matrix(ybar[[j]], nrow(Ytest), r, byrow = TRUE))^2)
})
predicted_group <- unique(train$V3)[apply(D, 1, which.min)]
table(Predicted = predicted_group, Actual = test$V3)
## Actual
## Predicted 1 2 3
## 1 3 1 7
## 2 0 3 1
## 3 11 0 0
mean(predicted_group==test$V3)
## [1] 0.2307692
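The low accuracy here is at least partly a labelling artifact: the columns of D follow the sorted level order that split() used for ybar, while unique(train$V3) follows the shuffled row order, so the predicted labels can come out permuted. A sketch of the aligned mapping:
# Map each minimizing column back through the split() level names
predicted_group <- as.numeric(names(ybar))[apply(D, 1, which.min)]
table(Predicted = predicted_group, Actual = test$V3)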
library(heplots)
## Loading required package: broom
X <- gpa[, 1:2]
group <- gpa$V3
box_test <- heplots::boxM(X, group)
box_test
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X by group
## Chi-Sq (approx.) = 16.0745, df = 6, p-value = 0.01336
The p-value is less than 0.05, so we reject equality of the group covariance matrices; LDA’s common-covariance assumption does not hold, and QDA is the more appropriate choice.
qda_model <- qda(V3 ~ ., data = gpa)
qda_model
## Call:
## qda(V3 ~ ., data = gpa)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3647059 0.3294118 0.3058824
##
## Group means:
## V1 V2
## 1 3.403871 561.2258
## 2 2.482500 447.0714
## 3 2.992692 446.2308
predictions <- predict(qda_model, gpa)
conf_matrix <- table(Predicted = predictions$class, Actual = gpa$V3)
conf_matrix
## Actual
## Predicted 1 2 3
## 1 30 0 1
## 2 0 27 0
## 3 1 1 25
mean(predictions$class == gpa$V3)
## [1] 0.9647059
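The 0.96 above is resubstitution accuracy on the full data set; qda’s built-in leave-one-out option gives a less optimistic estimate:
# Leave-one-out cross-validated QDA
qda_cv <- qda(V3 ~ ., data = gpa, CV = TRUE)
mean(qda_cv$class == gpa$V3)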