1 Classification
1.1 Data Preparation
# pick savant_t_all.csv in the dialog
PitchSample <- read.csv(file.choose(), header = T, stringsAsFactors = F)
X <- PitchSample
X$pitch_type <- as.factor(X$pitch_type)
# keep pitch_type (column 1); z-score the 18 numeric features
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
set.seed(1)
indexes = sample(1:nrow(X), size = 0.7*nrow(X))
train = X[indexes,]
test = X[-indexes,]
train_labels <- train[,1]
test_labels <- test[,1]
table(test_labels)
## test_labels
## CH CU EP FC FF FS FT KN PO SI SL
## 19734 22712 51 11804 23868 3056 18305 218 24 17274 21164
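read.csv(file.choose()) opens an interactive file picker; for a scripted, non-interactive run the same load works with an explicit path. A minimal sketch, assuming the CSV sits in the working directory (the path is a placeholder):
# non-interactive alternative; adjust the placeholder path as needed
PitchSample <- read.csv("savant_t_all.csv", header = TRUE, stringsAsFactors = FALSE)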
# pick savant_p_all.csv in the dialog (verification sample)
PitchSample2 <- read.csv(file.choose(), header = T, stringsAsFactors = F)
X <- PitchSample2
X$pitch_type <- as.factor(X$pitch_type)
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
# --------------------
# adjust `size` here to draw a smaller sample for spot checks
set.seed(1)
indexes = sample(1:nrow(X), size = nrow(X))
Rtest = X[indexes,]
# --------------------
Rtest_labels <- Rtest[,1]
# force all 11 pitch-type levels so classes absent from this file
# still appear as zero rows/columns in later tables
Rtest_labels <- factor(Rtest_labels,
                       levels = c("CH", "CU", "EP", "FC",
                                  "FF", "FS", "FT", "KN",
                                  "PO", "SI", "SL"))
table(Rtest_labels)
## Rtest_labels
## CH CU EP FC FF FS FT KN PO SI SL
## 500 500 0 500 500 129 500 0 0 362 500
1.2 Package Installation
pkg_name_vec = c("e1071", "class", "rpart", "randomForest", "nnet", "ipred")
for (pkg_name in pkg_name_vec) {
if (!requireNamespace(package = pkg_name, quietly = TRUE)) {
install.packages(pkgs = pkg_name, dependencies = TRUE)
}
library(package = pkg_name, character.only = TRUE)
}
## Warning: package 'class' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Warning: package 'nnet' was built under R version 3.5.1
2 Two-Stage Classification
2.1 Building the Coarse-Level Model
set.seed(1)
RFm <- randomForest(pitch_type ~., data = train, ntree = 100)
set.seed(1)
RFtestm <- predict(RFm, newdata = Rtest, type = "class")
table(Rtest_labels, RFtestm)
## RFtestm
## Rtest_labels CH CU EP FC FF FS FT KN PO SI SL
## CH 489 0 0 0 1 1 1 0 0 7 1
## CU 1 468 0 0 0 0 0 0 0 0 31
## EP 0 0 0 0 0 0 0 0 0 0 0
## FC 1 0 0 418 12 0 0 0 0 0 69
## FF 1 0 0 7 452 0 17 0 0 21 2
## FS 56 1 0 0 0 71 1 0 0 0 0
## FT 21 0 0 0 34 1 357 0 0 87 0
## KN 0 0 0 0 0 0 0 0 0 0 0
## PO 0 0 0 0 0 0 0 0 0 0 0
## SI 13 0 0 0 8 0 60 0 0 281 0
## SL 1 42 0 31 0 0 0 0 0 0 426
sum(diag(table(Rtest_labels, RFtestm)))/length(Rtest$pitch_type)
## [1] 0.8484675
sum(Rtest_labels == RFtestm)/length(Rtest$pitch_type)
## [1] 0.8484675
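The two formulas above agree only because Rtest_labels and RFtestm carry the same 11 factor levels, so diag(table(...)) pairs each class with itself; with mismatched level sets the diagonal misaligns silently. A small hypothetical helper (not in the original script) keeps the level-independent version reusable:
# hypothetical helper: accuracy as the share of exact matches,
# robust to differing factor levels
accuracy <- function(truth, pred) mean(as.character(truth) == as.character(pred))
accuracy(Rtest_labels, RFtestm)  # same 0.8484675 as above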
2.2 Relabeling the Coarse-Level Predictions
# collapse the 11 predicted pitch types into three coarse groups:
# A = CH/FS/KN, B = FF/FT/SI/PO, C = FC/SL/CU/EP
group_map <- c(CH = "A", FS = "A", KN = "A",
               FF = "B", FT = "B", SI = "B", PO = "B",
               FC = "C", SL = "C", CU = "C", EP = "C")
RFtestm <- unname(group_map[as.character(RFtestm)])
# row indices of Rtest routed to each coarse group
Aindexes <- which(RFtestm == "A")
Bindexes <- which(RFtestm == "B")
Cindexes <- which(RFtestm == "C")
2.3 Actual Test Data for the Fine-Level Models
ARtest <- Rtest[Aindexes,]
BRtest <- Rtest[Bindexes,]
CRtest <- Rtest[Cindexes,]
ARtest_labels <- ARtest[,1]
BRtest_labels <- BRtest[,1]
CRtest_labels <- CRtest[,1]
2.4 Preprocessing for the Fine-Level Models
X <- PitchSample
X$pitch_type <- as.factor(X$pitch_type)
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
# the same three coarse groups as in 2.2
A <- subset(X, pitch_type %in% c("CH", "FS", "KN"))
B <- subset(X, pitch_type %in% c("SI", "PO", "FT", "FF"))
C <- subset(X, pitch_type %in% c("SL", "FC", "CU", "EP"))
set.seed(1)
indexes = sample(1:nrow(A), size = 0.7*nrow(A))
Atrain = A[indexes,]
Atrain$pitch_type <- factor(Atrain$pitch_type)
Atest = A[-indexes,]
Atest$pitch_type <- factor(Atest$pitch_type)
Atrain_labels <- Atrain[,1]
Atest_labels <- Atest[,1]
set.seed(1)
indexes = sample(1:nrow(B), size = 0.7*nrow(B))
Btrain = B[indexes,]
Btrain$pitch_type <- factor(Btrain$pitch_type)
Btest = B[-indexes,]
Btest$pitch_type <- factor(Btest$pitch_type)
Btrain_labels <- Btrain[,1]
Btest_labels <- Btest[,1]
set.seed(1)
indexes = sample(1:nrow(C), size = 0.7*nrow(C))
Ctrain = C[indexes,]
Ctrain$pitch_type <- factor(Ctrain$pitch_type)
Ctest = C[-indexes,]
Ctest$pitch_type <- factor(Ctest$pitch_type)
Ctrain_labels <- Ctrain[,1]
Ctest_labels <- Ctest[,1]
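The three blocks above repeat one split pattern; a hypothetical helper could express it once, under the same seed and 70/30 ratio:
# hypothetical refactor of the repeated split; split_group is illustrative,
# not part of the original script
split_group <- function(df, seed = 1, frac = 0.7) {
  set.seed(seed)
  idx <- sample(1:nrow(df), size = frac * nrow(df))
  tr <- df[idx, ];  tr$pitch_type <- factor(tr$pitch_type)
  te <- df[-idx, ]; te$pitch_type <- factor(te$pitch_type)
  list(train = tr, test = te)
}
# usage: Asplit <- split_group(A); Atrain <- Asplit$train; Atest <- Asplit$test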
2.5 Building Fine-Level Model A
NBAtrain <- Atrain
NBAtrain$pitch_type <- factor(NBAtrain$pitch_type, levels = c("CH", "FS", "KN"))
NBAtrain_labels <- factor(Atrain_labels, levels = c("CH", "FS", "KN"))
# note: naiveBayes(x, y) treats every column of x as a predictor, so the
# pitch_type column itself is included here (likewise in 2.6, 2.7, and 3.5);
# pass NBAtrain[,-1] instead to train on the scaled measurements alone
NBmA <- naiveBayes(NBAtrain, NBAtrain_labels, laplace = 0.01)
testm <- predict(NBmA, Atest)
table(Atest_labels, testm)
## testm
## Atest_labels CH FS KN
## CH 19869 0 10
## FS 0 3060 1
## KN 0 0 218
sum(diag(table(Atest_labels, testm)))/length(Atest$pitch_type)
## [1] 0.999525
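To see how sharply the model separates the three types, the class posteriors can be inspected as well; predict.naiveBayes returns them with type = "raw":
# posterior probabilities for a few test rows
head(predict(NBmA, Atest, type = "raw"), 3)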
2.6 Building Fine-Level Model B
NBBtrain <- Btrain
NBBtrain$pitch_type <- factor(NBBtrain$pitch_type, levels = c("FF", "FT", "PO", "SI"))
NBBtrain_labels <- factor(Btrain_labels, levels = c("FF", "FT", "PO", "SI"))
NBmB <- naiveBayes(NBBtrain, NBBtrain_labels, laplace = 0.01)
testm <- predict(NBmB, Btest)
table(Btest_labels, testm)
## testm
## Btest_labels FF FT PO SI
## FF 23848 0 16 116
## FT 0 18368 5 4
## PO 1 0 23 0
## SI 0 0 1 17347
# compare the vectors directly rather than via diag(table(...)),
# which assumes identical row/column category sets
sum(as.vector(Btest_labels) == as.vector(testm))/length(Btest$pitch_type)
## [1] 0.9976059
2.7 Building Fine-Level Model C
NBCtrain <- Ctrain
NBCtrain$pitch_type <- factor(NBCtrain$pitch_type, levels = c("CU", "EP", "FC", "SL"))
NBCtrain_labels <- factor(Ctrain_labels, levels = c("CU", "EP", "FC", "SL"))
NBmC <- naiveBayes(NBCtrain, NBCtrain_labels, laplace = 0.01)
testm <- predict(NBmC, Ctest)
table(Ctest_labels, testm)
## testm
## Ctest_labels CU EP FC SL
## CU 22642 68 20 3
## EP 1 52 0 0
## FC 5 0 11662 15
## SL 17 1 4 20834
sum(as.vector(Ctest_labels) == as.vector(testm))/length(Ctest$pitch_type)
## [1] 0.9975779
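Overall accuracy can mask how the rare EP class fares; per-class recall from the same confusion table makes it explicit:
# recall per pitch type: correct predictions divided by true counts
ct <- table(Ctest_labels, testm)
diag(ct) / rowSums(ct)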
2.8 Applying and Combining the Fine-Level Models
# group A
ARtest$pitch_type <- factor(ARtest$pitch_type, levels = c("CH", "FS", "KN"))
if (nrow(ARtest) != 0) {
Atestm <- predict(NBmA, ARtest)
table(Atestm)
Atestm <- as.vector(Atestm)
AA <- cbind(Aindexes, Atestm)
} else {
AA <- NA
}
# group B
BRtest$pitch_type <- factor(BRtest$pitch_type, levels = c("FF", "FT", "PO", "SI"))
if (nrow(BRtest) != 0) {
Btestm <- predict(NBmB, BRtest)
table(Btestm)
Btestm <- as.vector(Btestm)
BB <- cbind(Bindexes, Btestm)
} else {
BB <- NA
}
# group C
CRtest$pitch_type <- factor(CRtest$pitch_type, levels = c("CU", "EP", "FC", "SL"))
if (nrow(CRtest) != 0) {
Ctestm <- predict(NBmC, CRtest)
table(Ctestm)
Ctestm <- as.vector(Ctestm)
CC <- cbind(Cindexes, Ctestm)
} else {
CC <- NA
}
# stack the three groups, drop all-NA rows from any empty group,
# and restore the original Rtest row order
TOT <- data.frame(rbind(AA, BB, CC))
TOT <- na.omit(TOT)
TOT[,1] <- as.numeric(as.vector(TOT[,1]))
sTOT <- TOT[order(TOT[,1]),]
model_result <- as.vector(sTOT[,2])
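The same reassembly can skip the intermediate data frame by scattering each group's predictions into a preallocated vector; a sketch, assuming all three groups are non-empty as they are here:
# hypothetical equivalent of the rbind/order reassembly above
model_result2 <- character(nrow(Rtest))
model_result2[Aindexes] <- Atestm
model_result2[Bindexes] <- Btestm
model_result2[Cindexes] <- Ctestm
# identical(model_result, model_result2) should then be TRUE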
2.9 Results
table(Rtest_labels, model_result)
## model_result
## Rtest_labels CH CU EP FC FF FS FT PO SI SL
## CH 490 0 0 0 0 0 2 1 6 1
## CU 1 495 4 0 0 0 0 0 0 0
## EP 0 0 0 0 0 0 0 0 0 0
## FC 1 0 0 486 12 0 0 0 0 1
## FF 1 0 0 9 490 0 0 0 0 0
## FS 0 1 0 0 0 127 0 0 1 0
## FT 22 0 0 0 0 0 477 0 1 0
## KN 0 0 0 0 0 0 0 0 0 0
## PO 0 0 0 0 0 0 0 0 0 0
## SI 13 0 0 0 0 0 0 0 349 0
## SL 0 0 0 0 0 1 0 0 0 499
sum(Rtest_labels == model_result)/length(Rtest_labels)
## [1] 0.9776568
- The combined two-stage model reaches a classification accuracy of 0.9776568, about 97.77%.
3 Single-Stage Classification
3.1 k-NN
set.seed(1)
kNN3 = knn(train[,-1], Rtest[,-1], train_labels, k = 3)
table(Rtest_labels, kNN3)
sum(diag(table(Rtest_labels, kNN3)))/length(Rtest$pitch_type)
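k = 3 is used without tuning; a quick sweep over a few odd k values would show how sensitive the accuracy is. A sketch (each knn() call is expensive at this data size):
# hypothetical sweep over k
for (k in c(1, 3, 5, 7)) {
  set.seed(1)
  pred <- knn(train[,-1], Rtest[,-1], train_labels, k = k)
  cat("k =", k, "accuracy =", mean(pred == Rtest_labels), "\n")
}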
3.2 Decision Tree
Tm <- rpart(pitch_type ~., data = train)
testm <- predict(Tm, newdata = Rtest, type = "class")
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
3.3 Random Forest
set.seed(1)
RFm <- randomForest(pitch_type ~., data = train, ntree = 100)
set.seed(1)
RFtestm <- predict(RFm, newdata = Rtest, type = "class")
table(Rtest_labels, RFtestm)
sum(diag(table(Rtest_labels, RFtestm)))/length(Rtest$pitch_type)
sum(Rtest_labels == RFtestm)/length(Rtest$pitch_type)
3.4 Multinomial Logistic Regression
LRm <- multinom(pitch_type ~., data = train)
testm <- predict(LRm, newdata = Rtest, type = "class")
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
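multinom caps fitting at 100 iterations by default, which can stop short of convergence on a training set this large; raising maxit is the usual remedy. A sketch (LRm2 is an illustrative name):
# refit with a higher iteration cap; trace = FALSE silences the fitting log
LRm2 <- multinom(pitch_type ~., data = train, maxit = 500, trace = FALSE)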
3.5 Naive Bayes
NBtrain <- train
NBtrain$pitch_type <- factor(NBtrain$pitch_type)#, levels = c("CH", "CU", "FC", "FF", "FS", "FT", "SI", "SL"))
NBtrain_labels <- factor(train_labels)#, levels = c("CH", "CU", "FC", "FF", "FS", "FT", "SI", "SL"))
NBm <- naiveBayes(NBtrain, NBtrain_labels, laplace = 0.01)
testm <- predict(NBm, Rtest)
table(Rtest_labels, testm)
sum(as.vector(Rtest_labels) == as.vector(testm))/length(Rtest$pitch_type)
3.6 Support Vector Machine
SVMm <- svm(pitch_type ~., data = train, kernel = "radial") #linear, polynomial
testm <- predict(SVMm, Rtest)
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
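The commented-out kernels hint at alternatives; e1071's tune.svm can compare hyperparameters by cross-validation, though it is costly at this data size, so the sketch below works on a subsample (hypothetical grid):
# hypothetical tuning sketch on a 5,000-row subsample
set.seed(1)
sub <- train[sample(1:nrow(train), 5000),]
tuned <- tune.svm(pitch_type ~., data = sub,
                  gamma = c(0.01, 0.1), cost = c(1, 10))
summary(tuned)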
3.7 Bagging
set.seed(1)
BGm <- bagging(pitch_type ~., data = train, nbagg = 30)
set.seed(1)
testm <- predict(BGm, Rtest)
table(Rtest_labels, factor(testm,
levels = c("CH", "CU", "EP", "FC",
"FF", "FS", "FT", "KN",
"PO", "SI", "SL")))
sum(diag(table(Rtest_labels, factor(testm,
               levels = c("CH", "CU", "EP", "FC",
                          "FF", "FS", "FT", "KN",
                          "PO", "SI", "SL")))))/length(Rtest$pitch_type)
3.8 XGboost
pkg_name_vec = c("xgboost", "dplyr", "caret")
for (pkg_name in pkg_name_vec) {
if (!requireNamespace(package = pkg_name, quietly = TRUE)) {
install.packages(pkgs = pkg_name, dependencies = TRUE)
}
library(package = pkg_name, character.only = TRUE)
}
dtrain <- as.matrix(train[,2:19])
dtest <- as.matrix(test[,2:19])
# xgboost needs numeric class labels; the factor codes run 1..11, so
# num_class is set one higher, leaving label 0 unused
train_labels <- as.numeric(train$pitch_type)
test_labels <- as.numeric(test$pitch_type)
numberOfClasses <- length(unique(train_labels)) + 1
param <- list("objective" = "multi:softmax",
              "eval_metric" = "mlogloss",
              "num_class" = numberOfClasses)
XBfit <- xgboost(dtrain, label = train_labels,
                 nrounds = 350,
                 params = param)
xgb_pred <- predict(XBfit, dtest)
ntest_labels <- as.numeric(test_labels)
table(xgb_pred, ntest_labels)
# compare directly; diag(table(...)) can misalign when the predicted and
# true label sets differ
sum(xgb_pred == ntest_labels)/length(test_labels)
dRtest <- as.matrix(Rtest[,2:19])
xgb_pred <- predict(XBfit, dRtest)
ntest_labels <- as.numeric(Rtest_labels)
table(ntest_labels, factor(xgb_pred, levels = 1:11))
sum(xgb_pred == ntest_labels)/length(Rtest_labels)
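Which of the 18 scaled features drive the boosted trees can be checked with xgboost's importance utility:
# gain-based feature importance of the fitted booster
imp <- xgb.importance(feature_names = colnames(dtrain), model = XBfit)
head(imp, 10)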
3.9 Results
- The two-stage classification model is more effective than the single-stage models.