dat<-read.csv(file="~/SEMESTER 5/STATISTIKA MULTIVARIAT/Placekick (1).csv")
N.all<-nrow(dat)
N.all
## [1] 1425
# TRAIN TEST SPLIT
#split into training data & validation data
set.seed(9819)
unif.split<-runif(n = N.all, min = 0, max = 1)
head(unif.split)
## [1] 0.01471709 0.07885069 0.68931571 0.02955178 0.89506013 0.02344578
data.training<-dat[unif.split <= 0.8, ]
data.validation<-dat[unif.split > 0.8, ]
head(data.training)
## week distance change elap30 PAT type field wind good
## 1 1 21 1 24.7167 0 1 1 0 1
## 2 1 21 0 15.8500 0 1 1 0 1
## 3 1 20 0 0.4500 1 1 1 0 1
## 4 1 28 0 13.5500 0 1 1 0 1
## 6 1 25 0 17.6833 0 0 0 0 1
## 7 1 20 0 12.6833 1 0 0 0 1
nrow(data.training)
## [1] 1174
nrow(data.training)/N.all
## [1] 0.8238596
#RESUBSTITUTION
library(MASS)
DA1<-lda(formula = good ~ distance + change + elap30 + PAT + type + field + wind, data = data.training, CV = FALSE,
prior = c(1,1)/2)
DA1
## Call:
## lda(good ~ distance + change + elap30 + PAT + type + field +
## wind, data = data.training, CV = FALSE, prior = c(1, 1)/2)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## distance change elap30 PAT type field wind
## 0 40.46565 0.4427481 10.35026 0.08396947 0.6641221 0.4580153 0.11450382
## 1 25.78811 0.2301055 12.51144 0.59348035 0.7277085 0.4755513 0.07094919
##
## Coefficients of linear discriminants:
## LD1
## distance -0.109592263
## change -0.392639367
## elap30 0.009227895
## PAT -0.278736593
## type 0.323834769
## field -0.263267037
## wind -0.549303608
##posterior probability estimation
pred.resub<-predict(object = DA1)
names(pred.resub)
## [1] "class" "posterior" "x"
head(pred.resub$posterior)
## 0 1
## 1 0.11066329 0.8893367
## 2 0.07015194 0.9298481
## 3 0.11073231 0.8892677
## 4 0.21161924 0.7883808
## 6 0.14083472 0.8591653
## 7 0.10270625 0.8972937
head(pred.resub$class)
## [1] 1 1 1 1 1 1
## Levels: 0 1
head(pred.resub$x)
## LD1
## 1 1.2945125
## 2 1.6053309
## 3 1.2940770
## 4 0.8169609
## 6 1.1233116
## 7 1.3463969
class.lin.discrim.rule<-ifelse(test = pred.resub$x> 0, yes = 1, no = 0)
rule2<-ifelse(test = pred.resub$posterior[,1] < 0.5, yes = 1, no = 0)
table(class.lin.discrim.rule, rule2)
## rule2
## class.lin.discrim.rule 0 1
## 0 322 0
## 1 0 852
summarize.class<-function(original, classify) {
class.table<-table(original, classify)
numb<-rowSums(class.table)
prop<-round(class.table/numb,4)
overall<-round(sum(diag(class.table))/sum(class.table), 4)
list(class.table = class.table, prop = prop,
overall.correct = overall)
}
## discriminant rule accuracy with training data
summarize.class(original = data.training$good, classify = pred.resub$class)
## $class.table
## classify
## original 0 1
## 0 100 31
## 1 222 821
##
## $prop
## classify
## original 0 1
## 0 0.7634 0.2366
## 1 0.2128 0.7872
##
## $overall.correct
## [1] 0.7845
## discriminant rule accuracy with validation data
pred.validation<-predict(object = DA1, newdata = data.validation)
head(pred.validation$posterior)
## 0 1
## 5 0.09079756 0.90920244
## 13 0.96381136 0.03618864
## 14 0.07515083 0.92484917
## 17 0.18675371 0.81324629
## 19 0.08395358 0.91604642
## 22 0.14893681 0.85106319
summarize.class(original = data.validation$good, classify = pred.validation$class)
## $class.table
## classify
## original 0 1
## 0 24 8
## 1 46 173
##
## $prop
## classify
## original 0 1
## 0 0.75 0.25
## 1 0.21 0.79
##
## $overall.correct
## [1] 0.7849
#CROSS VALIDATION
DA2<-lda(formula = good ~ week + distance +
change + elap30 + PAT + type + field + wind, data =
dat, CV = TRUE, prior = c(1,1)/2)
head(DA2$posterior)
## 0 1
## 1 0.11038820 0.8896118
## 2 0.07755802 0.9224420
## 3 0.08775029 0.9122497
## 4 0.21572455 0.7842755
## 5 0.08586327 0.9141367
## 6 0.15151182 0.8484882
head(DA2$class)
## [1] 1 1 1 1 1 1
## Levels: 0 1
## Discriminant rule accuracy
summarize.class(original = dat$good, classify = DA2$class)
## $class.table
## classify
## original 0 1
## 0 124 39
## 1 274 988
##
## $prop
## classify
## original 0 1
## 0 0.7607 0.2393
## 1 0.2171 0.7829
##
## $overall.correct
## [1] 0.7804
## apriori probablity
mean(dat$good)
## [1] 0.885614
## prior probability is taking from sample
DA3<-lda(formula = good ~ week + distance + change + elap30 + PAT + type + field + wind, data = data.training, CV =
FALSE)
pred.valid3<-predict(object = DA3, newdata = data.validation)
head(pred.valid3$posterior)
## 0 1
## 5 0.009582986 0.9904170
## 13 0.725675354 0.2743246
## 14 0.007980778 0.9920192
## 17 0.022014374 0.9779856
## 19 0.008954717 0.9910453
## 22 0.016999808 0.9830002
summarize.class(original = data.validation$good, classify =pred.valid3$class)
## $class.table
## classify
## original 0 1
## 0 10 22
## 1 19 200
##
## $prop
## classify
## original 0 1
## 0 0.3125 0.6875
## 1 0.0868 0.9132
##
## $overall.correct
## [1] 0.8367
DA4<-lda(formula = good ~ week + distance + change + elap30 + PAT + type + field + wind, data = dat, CV = TRUE)
head(DA4$posterior)
## 0 1
## 1 0.01577412 0.9842259
## 2 0.01074299 0.9892570
## 3 0.01227158 0.9877284
## 4 0.03430818 0.9656918
## 5 0.01198636 0.9880136
## 6 0.02254376 0.9774562
summarize.class(original = dat$good,classify = DA4$class)
## $class.table
## classify
## original 0 1
## 0 60 103
## 1 77 1185
##
## $prop
## classify
## original 0 1
## 0 0.3681 0.6319
## 1 0.0610 0.9390
##
## $overall.correct
## [1] 0.8737
# BOX M-TEST FOR HOMOGENEITY
library(biotools)
## Warning: package 'biotools' was built under R version 4.5.2
## ---
## biotools version 4.3
X <- as.matrix(dat[,1:8])
boxM(X,dat$good)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X
## Chi-Sq (approx.) = 186.74, df = 36, p-value < 2.2e-16
# Quadratic linear discriminant (resubstitution)
DA5<-qda(formula = good ~ week + distance + change + elap30 + PAT + type + field + wind, data =
data.training, CV = FALSE)
DA5
## Call:
## qda(good ~ week + distance + change + elap30 + PAT + type + field +
## wind, data = data.training, CV = FALSE)
##
## Prior probabilities of groups:
## 0 1
## 0.1115843 0.8884157
##
## Group means:
## week distance change elap30 PAT type field
## 0 9.977099 40.46565 0.4427481 10.35026 0.08396947 0.6641221 0.4580153
## 1 9.187919 25.78811 0.2301055 12.51144 0.59348035 0.7277085 0.4755513
## wind
## 0 0.11450382
## 1 0.07094919
pred.valid5<-predict(object = DA5, newdata = data.validation)
head(pred.valid5$posterior)
## 0 1
## 5 0.0007644247 0.99923558
## 13 0.9269004271 0.07309957
## 14 0.0011802877 0.99881971
## 17 0.0008784251 0.99912157
## 19 0.0006554284 0.99934457
## 22 0.0329021882 0.96709781
summarize.class(original = data.validation$good, classify = pred.valid5$class)
## $class.table
## classify
## original 0 1
## 0 14 18
## 1 25 194
##
## $prop
## classify
## original 0 1
## 0 0.4375 0.5625
## 1 0.1142 0.8858
##
## $overall.correct
## [1] 0.8287
# Quadratic linear discriminant (cross validatio)
DA6<-qda(formula = good ~ week + distance + change + elap30 + PAT + type + field + wind, data = dat, CV = TRUE)
head(DA6$posterior)
## 0 1
## 1 0.2175043430 0.7824957
## 2 0.0684966384 0.9315034
## 3 0.0001330365 0.9998670
## 4 0.0553394298 0.9446606
## 5 0.0003076925 0.9996923
## 6 0.0420854539 0.9579145
summarize.class(original = dat$good, classify = DA6$class)
## $class.table
## classify
## original 0 1
## 0 73 90
## 1 114 1148
##
## $prop
## classify
## original 0 1
## 0 0.4479 0.5521
## 1 0.0903 0.9097
##
## $overall.correct
## [1] 0.8568