1 Classification
1.1 Data Preparation
# pick savant_t_all.csv in the dialog
PitchSample <- read.csv(file.choose(), header = T, stringsAsFactors = F)
X <- PitchSample
X$pitch_type <- as.factor(X$pitch_type)
# keep pitch_type (column 1); z-score the 18 numeric features
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
set.seed(1)
indexes = sample(1:nrow(X), size = 0.7*nrow(X))
train = X[indexes,]
test = X[-indexes,]
train_labels <- train[,1]
test_labels <- test[,1]
table(test_labels)
## test_labels
## CH CU EP FC FF FS FT KN PO SI SL
## 19734 22712 51 11804 23868 3056 18305 218 24 17274 21164
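read.csv(file.choose()) opens an interactive file picker; for a scripted, non-interactive run the same load works with an explicit path. A minimal sketch, assuming the CSV sits in the working directory (the path is a placeholder):
# non-interactive alternative; adjust the placeholder path as needed
PitchSample <- read.csv("savant_t_all.csv", header = TRUE, stringsAsFactors = FALSE)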
# pick savant_p_all.csv in the dialog (verification sample)
PitchSample2 <- read.csv(file.choose(), header = T, stringsAsFactors = F)
X <- PitchSample2
X$pitch_type <- as.factor(X$pitch_type)
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
# --------------------
# adjust `size` here to draw a smaller sample for spot checks
set.seed(1)
indexes = sample(1:nrow(X), size = nrow(X))
Rtest = X[indexes,]
# --------------------
Rtest_labels <- Rtest[,1]
# force all 11 pitch-type levels so classes absent from this file
# still appear as zero rows/columns in later tables
Rtest_labels <- factor(Rtest_labels,
                       levels = c("CH", "CU", "EP", "FC",
                                  "FF", "FS", "FT", "KN",
                                  "PO", "SI", "SL"))
table(Rtest_labels)
## Rtest_labels
## CH CU EP FC FF FS FT KN PO SI SL
## 500 500 0 500 500 129 500 0 0 362 500
1.2 Package Installation
pkg_name_vec = c("e1071", "class", "rpart", "randomForest", "nnet", "ipred")
for (pkg_name in pkg_name_vec) {
if (!requireNamespace(package = pkg_name, quietly = TRUE)) {
install.packages(pkgs = pkg_name, dependencies = TRUE)
}
library(package = pkg_name, character.only = TRUE)
}
## Warning: package 'class' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Warning: package 'nnet' was built under R version 3.5.1
2 Two-Stage Classification
2.1 Building the Coarse-Level Model
set.seed(1)
RFm <- randomForest(pitch_type ~., data = train, ntree = 100)
set.seed(1)
RFtestm <- predict(RFm, newdata = Rtest, type = "class")
table(Rtest_labels, RFtestm)
## RFtestm
## Rtest_labels CH CU EP FC FF FS FT KN PO SI SL
## CH 489 0 0 0 1 1 1 0 0 7 1
## CU 1 468 0 0 0 0 0 0 0 0 31
## EP 0 0 0 0 0 0 0 0 0 0 0
## FC 1 0 0 418 12 0 0 0 0 0 69
## FF 1 0 0 7 452 0 17 0 0 21 2
## FS 56 1 0 0 0 71 1 0 0 0 0
## FT 21 0 0 0 34 1 357 0 0 87 0
## KN 0 0 0 0 0 0 0 0 0 0 0
## PO 0 0 0 0 0 0 0 0 0 0 0
## SI 13 0 0 0 8 0 60 0 0 281 0
## SL 1 42 0 31 0 0 0 0 0 0 426
sum(diag(table(Rtest_labels, RFtestm)))/length(Rtest$pitch_type)
## [1] 0.8484675
sum(Rtest_labels == RFtestm)/length(Rtest$pitch_type)
## [1] 0.8484675
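The two formulas above agree only because Rtest_labels and RFtestm carry the same 11 factor levels, so diag(table(...)) pairs each class with itself; with mismatched level sets the diagonal misaligns silently. A small hypothetical helper (not in the original script) keeps the level-independent version reusable:
# hypothetical helper: accuracy as the share of exact matches,
# robust to differing factor levels
accuracy <- function(truth, pred) mean(as.character(truth) == as.character(pred))
accuracy(Rtest_labels, RFtestm)  # same 0.8484675 as above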
2.2 Relabeling the Coarse-Level Predictions
# collapse the 11 predicted pitch types into three coarse groups:
# A = CH/FS/KN, B = FF/FT/SI/PO, C = FC/SL/CU/EP
group_map <- c(CH = "A", FS = "A", KN = "A",
               FF = "B", FT = "B", SI = "B", PO = "B",
               FC = "C", SL = "C", CU = "C", EP = "C")
RFtestm <- unname(group_map[as.character(RFtestm)])
# row indices of Rtest routed to each coarse group
Aindexes <- which(RFtestm == "A")
Bindexes <- which(RFtestm == "B")
Cindexes <- which(RFtestm == "C")
2.3 Actual Test Data for the Fine-Level Models
ARtest <- Rtest[Aindexes,]
BRtest <- Rtest[Bindexes,]
CRtest <- Rtest[Cindexes,]
ARtest_labels <- ARtest[,1]
BRtest_labels <- BRtest[,1]
CRtest_labels <- CRtest[,1]
2.4 Preprocessing for the Fine-Level Models
X <- PitchSample
X$pitch_type <- as.factor(X$pitch_type)
X <- cbind(X[1], as.data.frame(scale(X[2:19])))
# the same three coarse groups as in 2.2
A <- subset(X, pitch_type %in% c("CH", "FS", "KN"))
B <- subset(X, pitch_type %in% c("SI", "PO", "FT", "FF"))
C <- subset(X, pitch_type %in% c("SL", "FC", "CU", "EP"))
set.seed(1)
indexes = sample(1:nrow(A), size = 0.7*nrow(A))
Atrain = A[indexes,]
Atrain$pitch_type <- factor(Atrain$pitch_type)
Atest = A[-indexes,]
Atest$pitch_type <- factor(Atest$pitch_type)
Atrain_labels <- Atrain[,1]
Atest_labels <- Atest[,1]
set.seed(1)
indexes = sample(1:nrow(B), size = 0.7*nrow(B))
Btrain = B[indexes,]
Btrain$pitch_type <- factor(Btrain$pitch_type)
Btest = B[-indexes,]
Btest$pitch_type <- factor(Btest$pitch_type)
Btrain_labels <- Btrain[,1]
Btest_labels <- Btest[,1]
set.seed(1)
indexes = sample(1:nrow(C), size = 0.7*nrow(C))
Ctrain = C[indexes,]
Ctrain$pitch_type <- factor(Ctrain$pitch_type)
Ctest = C[-indexes,]
Ctest$pitch_type <- factor(Ctest$pitch_type)
Ctrain_labels <- Ctrain[,1]
Ctest_labels <- Ctest[,1]
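The three blocks above repeat one split pattern; a hypothetical helper could express it once, under the same seed and 70/30 ratio:
# hypothetical refactor of the repeated split; split_group is illustrative,
# not part of the original script
split_group <- function(df, seed = 1, frac = 0.7) {
  set.seed(seed)
  idx <- sample(1:nrow(df), size = frac * nrow(df))
  tr <- df[idx, ];  tr$pitch_type <- factor(tr$pitch_type)
  te <- df[-idx, ]; te$pitch_type <- factor(te$pitch_type)
  list(train = tr, test = te)
}
# usage: Asplit <- split_group(A); Atrain <- Asplit$train; Atest <- Asplit$test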
2.5 Building Fine-Level Model A
NBAtrain <- Atrain
NBAtrain$pitch_type <- factor(NBAtrain$pitch_type, levels = c("CH", "FS", "KN"))
NBAtrain_labels <- factor(Atrain_labels, levels = c("CH", "FS", "KN"))
# note: naiveBayes(x, y) treats every column of x as a predictor, so the
# pitch_type column itself is included here (likewise in 2.6, 2.7, and 3.5);
# pass NBAtrain[,-1] instead to train on the scaled measurements alone
NBmA <- naiveBayes(NBAtrain, NBAtrain_labels, laplace = 0.01)
testm <- predict(NBmA, Atest)
table(Atest_labels, testm)
## testm
## Atest_labels CH FS KN
## CH 19869 0 10
## FS 0 3060 1
## KN 0 0 218
sum(diag(table(Atest_labels, testm)))/length(Atest$pitch_type)
## [1] 0.999525
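To see how sharply the model separates the three types, the class posteriors can be inspected as well; predict.naiveBayes returns them with type = "raw":
# posterior probabilities for a few test rows
head(predict(NBmA, Atest, type = "raw"), 3)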
2.6 Building Fine-Level Model B
NBBtrain <- Btrain
NBBtrain$pitch_type <- factor(NBBtrain$pitch_type, levels = c("FF", "FT", "PO", "SI"))
NBBtrain_labels <- factor(Btrain_labels, levels = c("FF", "FT", "PO", "SI"))
NBmB <- naiveBayes(NBBtrain, NBBtrain_labels, laplace = 0.01)
testm <- predict(NBmB, Btest)
table(Btest_labels, testm)
## testm
## Btest_labels FF FT PO SI
## FF 23848 0 16 116
## FT 0 18368 5 4
## PO 1 0 23 0
## SI 0 0 1 17347
# compare the vectors directly rather than via diag(table(...)),
# which assumes identical row/column category sets
sum(as.vector(Btest_labels) == as.vector(testm))/length(Btest$pitch_type)
## [1] 0.9976059
2.7 Building Fine-Level Model C
NBCtrain <- Ctrain
NBCtrain$pitch_type <- factor(NBCtrain$pitch_type, levels = c("CU", "EP", "FC", "SL"))
NBCtrain_labels <- factor(Ctrain_labels, levels = c("CU", "EP", "FC", "SL"))
NBmC <- naiveBayes(NBCtrain, NBCtrain_labels, laplace = 0.01)
testm <- predict(NBmC, Ctest)
table(Ctest_labels, testm)
## testm
## Ctest_labels CU EP FC SL
## CU 22642 68 20 3
## EP 1 52 0 0
## FC 5 0 11662 15
## SL 17 1 4 20834
sum(as.vector(Ctest_labels) == as.vector(testm))/length(Ctest$pitch_type)
## [1] 0.9975779
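Overall accuracy can mask how the rare EP class fares; per-class recall from the same confusion table makes it explicit:
# recall per pitch type: correct predictions divided by true counts
ct <- table(Ctest_labels, testm)
diag(ct) / rowSums(ct)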
2.8 Applying and Combining the Fine-Level Models
# group A
ARtest$pitch_type <- factor(ARtest$pitch_type, levels = c("CH", "FS", "KN"))
if (nrow(ARtest) != 0) {
Atestm <- predict(NBmA, ARtest)
table(Atestm)
Atestm <- as.vector(Atestm)
AA <- cbind(Aindexes, Atestm)
} else {
AA <- NA
}
# group B
BRtest$pitch_type <- factor(BRtest$pitch_type, levels = c("FF", "FT", "PO", "SI"))
if (nrow(BRtest) != 0) {
Btestm <- predict(NBmB, BRtest)
table(Btestm)
Btestm <- as.vector(Btestm)
BB <- cbind(Bindexes, Btestm)
} else {
BB <- NA
}
# group C
CRtest$pitch_type <- factor(CRtest$pitch_type, levels = c("CU", "EP", "FC", "SL"))
if (nrow(CRtest) != 0) {
Ctestm <- predict(NBmC, CRtest)
table(Ctestm)
Ctestm <- as.vector(Ctestm)
CC <- cbind(Cindexes, Ctestm)
} else {
CC <- NA
}
# stack the three groups, drop all-NA rows from any empty group,
# and restore the original Rtest row order
TOT <- data.frame(rbind(AA, BB, CC))
TOT <- na.omit(TOT)
TOT[,1] <- as.numeric(as.vector(TOT[,1]))
sTOT <- TOT[order(TOT[,1]),]
model_result <- as.vector(sTOT[,2])
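The same reassembly can skip the intermediate data frame by scattering each group's predictions into a preallocated vector; a sketch, assuming all three groups are non-empty as they are here:
# hypothetical equivalent of the rbind/order reassembly above
model_result2 <- character(nrow(Rtest))
model_result2[Aindexes] <- Atestm
model_result2[Bindexes] <- Btestm
model_result2[Cindexes] <- Ctestm
# identical(model_result, model_result2) should then be TRUE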
2.9 Results
table(Rtest_labels, model_result)
## model_result
## Rtest_labels CH CU EP FC FF FS FT PO SI SL
## CH 490 0 0 0 0 0 2 1 6 1
## CU 1 495 4 0 0 0 0 0 0 0
## EP 0 0 0 0 0 0 0 0 0 0
## FC 1 0 0 486 12 0 0 0 0 1
## FF 1 0 0 9 490 0 0 0 0 0
## FS 0 1 0 0 0 127 0 0 1 0
## FT 22 0 0 0 0 0 477 0 1 0
## KN 0 0 0 0 0 0 0 0 0 0
## PO 0 0 0 0 0 0 0 0 0 0
## SI 13 0 0 0 0 0 0 0 349 0
## SL 0 0 0 0 0 1 0 0 0 499
sum(Rtest_labels == model_result)/length(Rtest_labels)
## [1] 0.9776568
- The combined two-stage model reaches a classification accuracy of 0.9776568, about 97.77%.
3 Single-Stage Classification
3.1 k-NN
set.seed(1)
kNN3 = knn(train[,-1], Rtest[,-1], train_labels, k = 3)
table(Rtest_labels, kNN3)
sum(diag(table(Rtest_labels, kNN3)))/length(Rtest$pitch_type)
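k = 3 is used without tuning; a quick sweep over a few odd k values would show how sensitive the accuracy is. A sketch (each knn() call is expensive at this data size):
# hypothetical sweep over k
for (k in c(1, 3, 5, 7)) {
  set.seed(1)
  pred <- knn(train[,-1], Rtest[,-1], train_labels, k = k)
  cat("k =", k, "accuracy =", mean(pred == Rtest_labels), "\n")
}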
3.2 Decision Tree
Tm <- rpart(pitch_type ~., data = train)
testm <- predict(Tm, newdata = Rtest, type = "class")
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
3.3 Random Forest
set.seed(1)
RFm <- randomForest(pitch_type ~., data = train, ntree = 100)
set.seed(1)
RFtestm <- predict(RFm, newdata = Rtest, type = "class")
table(Rtest_labels, RFtestm)
sum(diag(table(Rtest_labels, RFtestm)))/length(Rtest$pitch_type)
sum(Rtest_labels == RFtestm)/length(Rtest$pitch_type)
3.4 Multinomial Logistic Regression
LRm <- multinom(pitch_type ~., data = train)
testm <- predict(LRm, newdata = Rtest, type = "class")
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
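multinom caps fitting at 100 iterations by default, which can stop short of convergence on a training set this large; raising maxit is the usual remedy. A sketch (LRm2 is an illustrative name):
# refit with a higher iteration cap; trace = FALSE silences the fitting log
LRm2 <- multinom(pitch_type ~., data = train, maxit = 500, trace = FALSE)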
3.5 Naive Bayes
NBtrain <- train
NBtrain$pitch_type <- factor(NBtrain$pitch_type)#, levels = c("CH", "CU", "FC", "FF", "FS", "FT", "SI", "SL"))
NBtrain_labels <- factor(train_labels)#, levels = c("CH", "CU", "FC", "FF", "FS", "FT", "SI", "SL"))
NBm <- naiveBayes(NBtrain, NBtrain_labels, laplace = 0.01)
testm <- predict(NBm, Rtest)
table(Rtest_labels, testm)
sum(as.vector(Rtest_labels) == as.vector(testm))/length(Rtest$pitch_type)
3.6 Support Vector Machine
SVMm <- svm(pitch_type ~., data = train, kernel = "radial") #linear, polynomial
testm <- predict(SVMm, Rtest)
table(Rtest_labels, testm)
sum(diag(table(Rtest_labels, testm)))/length(Rtest$pitch_type)
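The commented-out kernels hint at alternatives; e1071's tune.svm can compare hyperparameters by cross-validation, though it is costly at this data size, so the sketch below works on a subsample (hypothetical grid):
# hypothetical tuning sketch on a 5,000-row subsample
set.seed(1)
sub <- train[sample(1:nrow(train), 5000),]
tuned <- tune.svm(pitch_type ~., data = sub,
                  gamma = c(0.01, 0.1), cost = c(1, 10))
summary(tuned)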
3.7 Bagging
set.seed(1)
BGm <- bagging(pitch_type ~., data = train, nbagg = 30)
set.seed(1)
testm <- predict(BGm, Rtest)
table(Rtest_labels, factor(testm,
levels = c("CH", "CU", "EP", "FC",
"FF", "FS", "FT", "KN",
"PO", "SI", "SL")))
sum(diag(table(Rtest_labels, factor(testm,
               levels = c("CH", "CU", "EP", "FC",
                          "FF", "FS", "FT", "KN",
                          "PO", "SI", "SL")))))/length(Rtest$pitch_type)
3.8 XGboost
pkg_name_vec = c("xgboost", "dplyr", "caret")
for (pkg_name in pkg_name_vec) {
if (!requireNamespace(package = pkg_name, quietly = TRUE)) {
install.packages(pkgs = pkg_name, dependencies = TRUE)
}
library(package = pkg_name, character.only = TRUE)
}
dtrain <- as.matrix(train[,2:19])
dtest <- as.matrix(test[,2:19])
# xgboost needs numeric class labels; the factor codes run 1..11, so
# num_class is set one higher, leaving label 0 unused
train_labels <- as.numeric(train$pitch_type)
test_labels <- as.numeric(test$pitch_type)
numberOfClasses <- length(unique(train_labels)) + 1
param <- list("objective" = "multi:softmax",
              "eval_metric" = "mlogloss",
              "num_class" = numberOfClasses)
XBfit <- xgboost(dtrain, label = train_labels,
                 nrounds = 350,
                 params = param)
xgb_pred <- predict(XBfit, dtest)
ntest_labels <- as.numeric(test_labels)
table(xgb_pred, ntest_labels)
# compare directly; diag(table(...)) can misalign when the predicted and
# true label sets differ
sum(xgb_pred == ntest_labels)/length(test_labels)
dRtest <- as.matrix(Rtest[,2:19])
xgb_pred <- predict(XBfit, dRtest)
ntest_labels <- as.numeric(Rtest_labels)
table(ntest_labels, factor(xgb_pred, levels = 1:11))
sum(xgb_pred == ntest_labels)/length(Rtest_labels)
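Which of the 18 scaled features drive the boosted trees can be checked with xgboost's importance utility:
# gain-based feature importance of the fitted booster
imp <- xgb.importance(feature_names = colnames(dtrain), model = XBfit)
head(imp, 10)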
3.9 Results
- The two-stage classification model is more effective than the single-stage models.