# 載入所需的套件(Loading package)
library(dplyr)
library(plyr)
library(ggplot2)
library(readr)
library(reshape)
library(ROCR)
library(caret)
library(randomForest)
library(pROC)
# 更改R的預設語系
Sys.setlocale("LC_ALL",'C')      
[1] "C"



資料前處理



(處理後)資料集資訊



載入資料集(Loading dataset):[LOLgamedata.csv]

我們透過組員自身的遊戲經驗,挑出我們認為較顯著之資料欄位進行變數型態轉換

lol <- read.csv("LOLgamedata.csv",header = T)
# View(lol)
# 將欄位進行型態轉換(transfer our data as factor)
lol$winner <- as.factor(lol$winner)
lol$firstBlood <- as.factor(lol$firstBlood)
lol$firstTower <- as.factor(lol$firstTower)
lol$firstInhibitor <- as.factor(lol$firstInhibitor)
lol$firstBaron <- as.factor(lol$firstBaron)
lol$firstDragon <- as.factor(lol$firstDragon)
lol$firstRiftHerald <- as.factor(lol$firstRiftHerald)



新增變數

lol$tower_gap <- (lol$t1_towerKills - lol$t2_towerKills)                              # 勝負隊伍塔差
# lol_train$tower_gap <- (lol_train$t1_towerKills - lol_train$t2_towerKills)
# summary(glm(winner ~ lol_train$tower_gap, data= lol_train, family = "binomial"))
lol$dragon_gap <- (lol$t1_dragonKills - lol$t2_dragonKills)                           # 勝負隊伍殺小龍數量差異
lol$baron_gap <- (lol$t1_baronKills - lol$t2_baronKills)                              # 勝負隊伍殺巴隆數量差異
lol$inhibitorKills_gap <- (lol$t1_inhibitorKills - lol$t2_inhibitorKills)             # 勝負隊伍水晶兵營數量差異



邏輯式迴歸(glm)

# 將資料切割成Training Set(lol_train), Testing Set(lol_test)
set.seed(2018)
train_idx <- sample(1:nrow(lol), size = 0.8 * nrow(lol), replace = F)
lol_train <- lol[train_idx,] 
lol_test <- lol[-train_idx,] 
summary(glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald, data = lol_train, family = "binomial"))

Call:
glm(formula = winner ~ firstBlood + firstTower + firstInhibitor + 
    firstBaron + firstDragon + firstRiftHerald, family = "binomial", 
    data = lol_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9057  -0.3409  -0.1809   0.3495   2.8710  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -0.72064    0.11624  -6.199 5.67e-10 ***
firstBlood2       0.28354    0.03497   8.108 5.14e-16 ***
firstTower2       0.82213    0.03631  22.644  < 2e-16 ***
firstInhibitor1  -2.02436    0.05042 -40.153  < 2e-16 ***
firstInhibitor2   1.91971    0.05078  37.805  < 2e-16 ***
firstBaron1      -0.91436    0.04685 -19.518  < 2e-16 ***
firstBaron2       1.18904    0.04549  26.141  < 2e-16 ***
firstDragon1     -0.26268    0.11836  -2.219   0.0265 *  
firstDragon2      0.54305    0.11849   4.583 4.58e-06 ***
firstRiftHerald1 -0.18285    0.04329  -4.224 2.40e-05 ***
firstRiftHerald2  0.16980    0.04365   3.890   0.0001 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 55288  on 39883  degrees of freedom
Residual deviance: 23077  on 39873  degrees of freedom
AIC: 23099

Number of Fisher Scoring iterations: 6
# 預測(glm prediction)(common sense)
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald, data = lol_train ,family = "binomial")
result <- predict(model1,newdata = lol_test, type = "response")
table(lol_test$winner, result>0.5) %>% {sum(diag(.))/sum(.)}         # ACC = 0.9044324
[1] 0.9044324
# 預測(glm prediction)(加入towel_gap)
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap, data = lol_train ,family = "binomial")
result <- predict(model1,newdata = lol_test, type = "response")
table(lol_test$winner, result>0.5) %>% {sum(diag(.))/sum(.)}         # ACC = 0.9736262
[1] 0.9736262
# 預測(glm prediction)(加入towel_gap,並拿掉不顯著之變數)
model1 <- glm(winner ~ firstTower + firstInhibitor + firstRiftHerald + tower_gap, data = lol_train ,family = "binomial")
result <- predict(model1,newdata = lol_test, type = "response")
table(lol_test$winner, result>0.5) %>% {sum(diag(.))/sum(.)}         # ACC = 0.9731247
[1] 0.9731247
# 預測(glm prediction)(加入towel_gap, dragon_gap, baron_gap, inhibitorKills_gap, gameDuration)
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap + dragon_gap + baron_gap + inhibitorKills_gap + gameDuration, data = lol_train ,family = "binomial")
result <- predict(model1,newdata = lol_test, type = "response")
table(lol_test$winner, result>0.5) %>% {sum(diag(.))/sum(.)}         # ACC = 0.9742278
[1] 0.9742278
summary(model1)

Call:
glm(formula = winner ~ firstBlood + firstTower + firstInhibitor + 
    firstBaron + firstDragon + firstRiftHerald + tower_gap + 
    dragon_gap + baron_gap + inhibitorKills_gap + gameDuration, 
    family = "binomial", data = lol_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.8432  -0.0338  -0.0023   0.0325   4.7229  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.37382    0.27022   1.383 0.166540    
firstBlood2        -0.12787    0.07033  -1.818 0.069042 .  
firstTower2        -0.87222    0.07553 -11.548  < 2e-16 ***
firstInhibitor1     0.51610    0.13912   3.710 0.000207 ***
firstInhibitor2    -0.58941    0.14017  -4.205 2.61e-05 ***
firstBaron1         0.52945    0.13253   3.995 6.47e-05 ***
firstBaron2        -0.08496    0.13143  -0.646 0.518002    
firstDragon1        0.47420    0.26300   1.803 0.071388 .  
firstDragon2        0.59231    0.26366   2.246 0.024673 *  
firstRiftHerald1    0.22080    0.08751   2.523 0.011634 *  
firstRiftHerald2   -0.59453    0.08857  -6.713 1.91e-11 ***
tower_gap          -1.28958    0.02374 -54.313  < 2e-16 ***
dragon_gap          0.04511    0.02163   2.086 0.037020 *  
baron_gap          -0.62911    0.05491 -11.456  < 2e-16 ***
inhibitorKills_gap  0.10260    0.03776   2.717 0.006591 ** 
gameDuration       -0.01023    0.00544  -1.880 0.060134 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 55287.7  on 39883  degrees of freedom
Residual deviance:  5838.7  on 39868  degrees of freedom
AIC: 5870.7

Number of Fisher Scoring iterations: 9



邏輯式迴歸ROC(ROC curve of Logistic Regression)

pred <- prediction(result, lol_test$winner)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, "auc")
# 繪製ROC curve之圖形,並算出AUC
plot(perf, main = "ROC curve(Logistic Regression)", xlab = "Specificity(FPR)", ylab = "Sensitivity(TPR)")
abline(0, 1)
text(0.5, 0.5, as.character(auc@y.values[[1]]))           # AUC = 0.9966



邏輯式迴歸交叉驗證(Cross-Validation(CV) of Logistic Regression)

# Get the k-fold cross-validated confusion matrix for a logistic regression.
#
# f:      model formula; the first variable in it is taken as the response.
# d:      data frame holding every variable used in f.
# k:      number of folds (a single whole number >= 2).
# cutoff: probability threshold (intended range 0-1). A hold-out row is
#         labelled "Yes" when its predicted probability is strictly greater
#         than cutoff, otherwise "No".
#
# Returns a table with the predicted class ("No"/"Yes") in rows and the
# actual response values in columns, pooled over all k hold-out folds.
k_fold_CV_logit <- function(f, d, k, cutoff) {
  stopifnot(
    is.numeric(k), length(k) == 1L, !is.na(k), k >= 2, k == round(k),
    is.numeric(cutoff), length(cutoff) == 1L, !is.na(cutoff)
  )
  num_of_rec <- nrow(d)
  response_var <- all.vars(f)[1]
  # One random permutation of the fold labels, recycled down the rows.
  fold_id <- rep_len(sample.int(k), num_of_rec)
  # Fit and score one fold at a time so only one glm object is alive at once.
  fold_results <- lapply(seq_len(k), function(fold) {
    is_holdout <- fold_id == fold
    fit <- glm(f, d[!is_holdout, ], family = "binomial")
    predicted_prob <- predict(fit, d[is_holdout, ], type = "response")
    data.frame(
      "predictedClass" = ifelse(predicted_prob > cutoff, 1, 0),
      "actualClass" = d[is_holdout, response_var]
    )
  })
  # Pool the per-fold predictions into a single data frame.
  output_df <- do.call(rbind, fold_results)
  output_df$predictedClass <- factor(output_df$predictedClass,
                                     levels = c(0, 1), labels = c("No", "Yes"))
  table(output_df$predictedClass, output_df$actualClass)
}
Map(function(cutoff) k_fold_CV_logit(winner ~ firstBlood+firstTower+firstInhibitor+firstBaron+firstDragon+firstRiftHerald,
                                     lol[train_idx,], 10, cutoff), list(0.9, 0.8, 0.7, 0.6, 0.5, 0.45, 0.4, 0.3, 0.2, 0.1)) # 0.5
[[1]]
     
          1     2
  No  19504  7456
  Yes   618 12306

[[2]]
     
          1     2
  No  19129  4829
  Yes   993 14933

[[3]]
     
          1     2
  No  18822  3285
  Yes  1300 16477

[[4]]
     
          1     2
  No  18674  2637
  Yes  1448 17125

[[5]]
     
          1     2
  No  18233  1978
  Yes  1889 17784

[[6]]
     
          1     2
  No  17860  1664
  Yes  2262 18098

[[7]]
     
          1     2
  No  17692  1587
  Yes  2430 18175

[[8]]
     
          1     2
  No  16964  1335
  Yes  3158 18427

[[9]]
     
          1     2
  No  15230   975
  Yes  4892 18787

[[10]]
     
          1     2
  No  12439   526
  Yes  7683 19236



隨機森林(randomForest)

# 試藉由組員自身遊戲經驗所找出之變數跑randomForest
set.seed(2018)
lol_tree <- randomForest(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald, lol_train, ntree = 500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree

Call:
 randomForest(formula = winner ~ firstBlood + firstTower + firstInhibitor +      firstBaron + firstDragon + firstRiftHerald, data = lol_train,      ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 9.36%
Confusion matrix:
      1     2 class.error
1 18342  1780  0.08846039
2  1954 17808  0.09887663
    # Confusion matrix:
    #       1     2 class.error
    # 1 18345  1777  0.08831130
    # 2  1973 17789  0.09983807
# 透過importance()來找出較重要之變數
importance(lol_tree)
                MeanDecreaseGini
firstBlood               49.4587
firstTower             1481.5744
firstInhibitor         8485.8823
firstBaron             2454.6815
firstDragon             718.0792
firstRiftHerald         210.8131
    #                 MeanDecreaseGini
    # firstTower            1059.24050
    # firstInhibitor        9253.06155
    # firstBaron            2451.38878
    # firstDragon            745.84541
    # firstRiftHerald         91.45326 (最低)
# 移除firstRiftHerald變數,再次進行建模
lol_tree <- randomForest(winner ~ firstTower+firstInhibitor+firstBaron+firstDragon+firstRiftHerald,lol_train, ntree=500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree

Call:
 randomForest(formula = winner ~ firstTower + firstInhibitor +      firstBaron + firstDragon + firstRiftHerald, data = lol_train,      ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 2

        OOB estimate of  error rate: 9.42%
Confusion matrix:
      1     2 class.error
1 18316  1806  0.08975251
2  1953 17809  0.09882603
    # Confusion matrix:
    #       1     2 class.error
    # 1 18322  1800  0.08945433
    # 2  1966 17796  0.09948386
# 預測(randomForest prediction)(加入towel_gap, dragon_gap, baron_gap, inhibitorKills_gap, gameDuration)
lol_tree <- randomForest(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap + dragon_gap + baron_gap + inhibitorKills_gap + gameDuration,lol_train, ntree = 500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree

Call:
 randomForest(formula = winner ~ firstBlood + firstTower + firstInhibitor +      firstBaron + firstDragon + firstRiftHerald + tower_gap +      dragon_gap + baron_gap + inhibitorKills_gap + gameDuration,      data = lol_train, ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 2.4%
Confusion matrix:
      1     2 class.error
1 19649   473  0.02350661
2   484 19278  0.02449145
summary(lol_tree)
                Length Class  Mode     
call                4  -none- call     
type                1  -none- character
predicted       39884  factor numeric  
err.rate         1500  -none- numeric  
confusion           6  -none- numeric  
votes           79768  matrix numeric  
oob.times       39884  -none- numeric  
classes             2  -none- character
importance         11  -none- numeric  
importanceSD        0  -none- NULL     
localImportance     0  -none- NULL     
proximity           0  -none- NULL     
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             14  -none- list     
y               39884  factor numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     



隨機森林ROC(ROC curve of Random Forest)

# 繪製隨機森林的ROC曲線,繪製ROC curve之圖形,並算出AUC
rf.pred <- predict(lol_tree, lol_test, type = "prob")
rf.roc <- prediction(rf.pred[,2], lol_test$winner)
rf.auc <- performance(rf.roc, 'tpr', 'fpr')
# rf.auc
plot(rf.auc)
abline(0, 1)

# text(0.5, 0.5, as.character(rf.auc@y.values[[1]]))  



邏輯式迴歸與隨機森林在ROC曲線與AUC之比較

# 將邏輯式迴歸跟隨機森林所繪製出之ROC曲線進行比較
layout(matrix(c(1,2),1,2,byrow = F))
# 邏輯式迴歸
plot(perf, main = "ROC curve", xlab = "False positive rate", ylab = "True positive rate")
abline(0, 1)
text(0.5, 0.5, as.character(auc@y.values[[1]]))           # AUC = 0.9408
# 隨機森林
plot(rf.auc)
abline(0, 1)



計算平均平方誤差MSE(Mean Square Error)

# mean((result-lol_test$winner)^2)
dim(lol_test)       #  9972    71
[1] 9972   75
dim(lol_train)      # 39884    71
[1] 39884    75
summary(model1)

Call:
glm(formula = winner ~ firstBlood + firstTower + firstInhibitor + 
    firstBaron + firstDragon + firstRiftHerald + tower_gap + 
    dragon_gap + baron_gap + inhibitorKills_gap + gameDuration, 
    family = "binomial", data = lol_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.8432  -0.0338  -0.0023   0.0325   4.7229  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.37382    0.27022   1.383 0.166540    
firstBlood2        -0.12787    0.07033  -1.818 0.069042 .  
firstTower2        -0.87222    0.07553 -11.548  < 2e-16 ***
firstInhibitor1     0.51610    0.13912   3.710 0.000207 ***
firstInhibitor2    -0.58941    0.14017  -4.205 2.61e-05 ***
firstBaron1         0.52945    0.13253   3.995 6.47e-05 ***
firstBaron2        -0.08496    0.13143  -0.646 0.518002    
firstDragon1        0.47420    0.26300   1.803 0.071388 .  
firstDragon2        0.59231    0.26366   2.246 0.024673 *  
firstRiftHerald1    0.22080    0.08751   2.523 0.011634 *  
firstRiftHerald2   -0.59453    0.08857  -6.713 1.91e-11 ***
tower_gap          -1.28958    0.02374 -54.313  < 2e-16 ***
dragon_gap          0.04511    0.02163   2.086 0.037020 *  
baron_gap          -0.62911    0.05491 -11.456  < 2e-16 ***
inhibitorKills_gap  0.10260    0.03776   2.717 0.006591 ** 
gameDuration       -0.01023    0.00544  -1.880 0.060134 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 55287.7  on 39883  degrees of freedom
Residual deviance:  5838.7  on 39868  degrees of freedom
AIC: 5870.7

Number of Fisher Scoring iterations: 9



【敘述統計】 欲瞭解當獲勝方為藍方時,閃現(flash)放在D鍵與F鍵的比例

summary(glm(winner ~ firstDragon, data= lol_train, family = "binomial"))

Call:
glm(formula = winner ~ firstDragon, family = "binomial", data = lol_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4984  -0.8730  -0.8730   0.8873   1.5161  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -0.27006    0.08775  -3.078  0.00209 ** 
firstDragon1 -0.49825    0.08908  -5.593 2.23e-08 ***
firstDragon2  0.99897    0.08906  11.217  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 55288  on 39883  degrees of freedom
Residual deviance: 50130  on 39881  degrees of freedom
AIC: 50136

Number of Fisher Scoring iterations: 4
test1 <- subset(lol_train,lol_train$firstDragon!=0)
summary(glm(test1$winner ~ as.factor(test1$firstDragon),family = "binomial"))

Call:
glm(formula = test1$winner ~ as.factor(test1$firstDragon), family = "binomial")

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4984  -0.8730  -0.8730   0.8873   1.5161  

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -0.76831    0.01535  -50.05   <2e-16 ***
as.factor(test1$firstDragon)2  1.49723    0.02159   69.34   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 54555  on 39354  degrees of freedom
Residual deviance: 49406  on 39353  degrees of freedom
AIC: 49410

Number of Fisher Scoring iterations: 4
summary(glm(lol_train$winner ~ lol_train$t1_champ1_sum1,family = "binomial"))

Call:
glm(formula = lol_train$winner ~ lol_train$t1_champ1_sum1, family = "binomial")

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.202  -1.170  -1.150   1.185   1.247  

Coefficients:
                                 Estimate Std. Error z value Pr(>|z|)
(Intercept)                      -0.06454    0.08717  -0.740    0.459
lol_train$t1_champ1_sum1Cleanse  -0.09710    0.20017  -0.485    0.628
lol_train$t1_champ1_sum1Exhaust   0.01274    0.09496   0.134    0.893
lol_train$t1_champ1_sum1Flash     0.04615    0.08821   0.523    0.601
lol_train$t1_champ1_sum1Ghost     0.12308    0.12037   1.022    0.307
lol_train$t1_champ1_sum1Heal      0.06510    0.09337   0.697    0.486
lol_train$t1_champ1_sum1Ignite    0.04496    0.09459   0.475    0.635
lol_train$t1_champ1_sum1Smite     0.04236    0.09317   0.455    0.649
lol_train$t1_champ1_sum1Teleport  0.06080    0.09310   0.653    0.514

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 55288  on 39883  degrees of freedom
Residual deviance: 55285  on 39875  degrees of freedom
AIC: 55303

Number of Fisher Scoring iterations: 3
a <- lol[,c(5,13,16,19,22,25)]
summary(a)
 winner     t1_champ1_sum1   t1_champ2_sum1   t1_champ3_sum1   t1_champ4_sum1   t1_champ5_sum1 
 1:25211   Flash   :27295   Flash   :27144   Flash   :27117   Flash   :27104   Flash   :27085  
 2:24645   Teleport: 4784   Heal    : 4763   Heal    : 4737   Heal    : 4801   Teleport: 4899  
           Smite   : 4556   Smite   : 4626   Smite   : 4649   Smite   : 4519   Smite   : 4636  
           Heal    : 4439   Teleport: 4426   Teleport: 4529   Teleport: 4515   Heal    : 4352  
           Ignite  : 3700   Exhaust : 3770   Exhaust : 3808   Exhaust : 3863   Ignite  : 3794  
           Exhaust : 3544   Ignite  : 3697   Ignite  : 3656   Ignite  : 3668   Exhaust : 3634  
           (Other) : 1538   (Other) : 1430   (Other) : 1360   (Other) : 1386   (Other) : 1456  
b <- lol[,c(5,38,41,44,47,50)]
summary(b)
 winner     t2_champ1_sum1   t2_champ2_sum1   t2_champ3_sum1   t2_champ4_sum1   t2_champ5_sum1 
 1:25211   Flash   :27312   Flash   :27150   Flash   :27280   Flash   :27166   Flash   :26937  
 2:24645   Teleport: 4790   Heal    : 4704   Heal    : 4687   Heal    : 4757   Teleport: 4880  
           Smite   : 4451   Smite   : 4561   Teleport: 4576   Smite   : 4743   Smite   : 4624  
           Heal    : 4325   Teleport: 4533   Smite   : 4470   Teleport: 4456   Heal    : 4428  
           Ignite  : 3799   Exhaust : 3782   Exhaust : 3813   Exhaust : 3734   Ignite  : 3840  
           Exhaust : 3660   Ignite  : 3693   Ignite  : 3642   Ignite  : 3583   Exhaust : 3759  
           (Other) : 1519   (Other) : 1433   (Other) : 1388   (Other) : 1417   (Other) : 1388  
c <- a[a$winner=="1",]
d <- b[b$winner=="1",]
summary(c)
 winner     t1_champ1_sum1   t1_champ2_sum1   t1_champ3_sum1   t1_champ4_sum1   t1_champ5_sum1 
 1:25211   Flash   :13819   Flash   :13715   Flash   :13708   Flash   :13614   Flash   :13681  
 2:    0   Teleport: 2428   Heal    : 2392   Smite   : 2398   Heal    : 2482   Teleport: 2543  
           Smite   : 2296   Smite   : 2296   Heal    : 2381   Smite   : 2310   Smite   : 2334  
           Heal    : 2230   Teleport: 2215   Teleport: 2241   Teleport: 2251   Heal    : 2135  
           Ignite  : 1870   Exhaust : 1940   Exhaust : 1926   Exhaust : 1984   Ignite  : 1926  
           Exhaust : 1792   Ignite  : 1924   Ignite  : 1866   Ignite  : 1832   Exhaust : 1858  
           (Other) :  776   (Other) :  729   (Other) :  691   (Other) :  738   (Other) :  734  
summary(d)
 winner     t2_champ1_sum1   t2_champ2_sum1   t2_champ3_sum1   t2_champ4_sum1   t2_champ5_sum1 
 1:25211   Flash   :13769   Flash   :13727   Flash   :13857   Flash   :13659   Flash   :13572  
 2:    0   Teleport: 2475   Heal    : 2415   Teleport: 2361   Smite   : 2394   Teleport: 2450  
           Smite   : 2249   Smite   : 2295   Heal    : 2330   Heal    : 2363   Smite   : 2394  
           Heal    : 2241   Teleport: 2240   Smite   : 2181   Teleport: 2337   Heal    : 2269  
           Ignite  : 1855   Exhaust : 1926   Exhaust : 1938   Exhaust : 1954   Ignite  : 1939  
           Exhaust : 1841   Ignite  : 1875   Ignite  : 1856   Ignite  : 1810   Exhaust : 1882  
           (Other) :  781   (Other) :  733   (Other) :  688   (Other) :  694   (Other) :  705  
s1 <- 13819 + 13715 + 13708 + 13614 + 13681
s2 <- 13769 + 13727 + 13857 + 13659 + 13572
s1 / (25211 * 5)                                  # 獲勝方為藍方時,將閃現放在D鍵的比例          # 0.5437071
[1] 0.5437071
s2 / (25211 * 5)                                  # 獲勝方為藍方時,將閃現放在F鍵的比例          # 0.54408
[1] 0.54408
---
title: "1071巨量期末專案"
output: html_notebook
---

<br>

---


```{r}
# Load the required packages.
# plyr is attached BEFORE dplyr: attaching plyr after dplyr masks the dplyr
# verbs that share a name with plyr functions (summarise, mutate, arrange,
# ...), which dplyr itself warns about when it detects that load order.
library(plyr)
library(dplyr)
library(ggplot2)
library(readr)
library(reshape)
library(ROCR)
library(caret)
library(randomForest)
library(pROC)

# Switch the R session locale to "C".
Sys.setlocale("LC_ALL", "C")
```

<br>
<br>
**資料前處理**

+ 將角色名還有各個角色的類型(例如support等)放入資料集當中，但沒有保留原有數字的部分，以避免資料集過於混亂。

+ 將gameDuration(遊戲進行時間的部分)換算成以「分鐘」為單位。

+ 刪除不合理的值，例如首殺為0(代表該場遊戲無人拿到首殺),第一座塔為0(代表該場遊戲無人拿到第一座塔)以及遊戲時間少於15分鐘的資料。

+ 由於本資料集無任何空值，所以沒做空值相關處理。

```{r}
```

<br>
<br>
**(處理後)資料集資訊**

+ 剩下49,856筆(原先有51,490筆) 。

+ 新增10個欄位，總計共71個欄位(t1:藍方、t2:紅方，雙方各隊分別5位玩家，總計共10位玩家，變數說明在此處不贅述)：
    + gameId:遊戲編號
    + creationTime:遊戲創立時間
    + gameDuration:遊戲時間
    + seasonId:第幾季
    + winner:哪一隊贏
    + firstBlood:哪一隊拿到首殺
    + firstTower:哪一隊拿到首塔
    + firstInhibitor:哪一隊拿到首兵營
    + firstBaron:哪一隊拿到首巴龍
    + firstDragon:哪一隊拿到首小龍
    + firstRiftHerald:哪一隊拿到首預示者
    + t1_champ1id:藍方第一個角色名稱
    + t1_champ1id_tags:藍方第一個角色種類
    + t1_champ1_sum1、sum2:藍方第一個角色召喚師技能1、2
    + ...
    + t1_towerKills:藍方拆了幾個塔
    + t1_inhibitorKills:藍方拆了幾個兵營
    + t1_baronKills:藍方拿了幾隻巴龍
    + t1_dragonKills:藍方拿了幾隻小龍
    + t1_riftHeraldKills:藍方拿了幾隻預示者
    + t1_ban1:藍方第一ban
    + ...
    + t2同上

<br>
<br>
**載入資料集(Loading dataset)：[LOLgamedata.csv]**

我們透過組員自身的遊戲經驗，挑出我們認為較顯著之資料欄位進行變數型態轉換
```{r}
# Load the pre-processed match data.
# stringsAsFactors = TRUE is spelled out because text columns (e.g. the
# summoner-spell columns) are summarised by level count later in this
# notebook; that only works when they are factors, which stopped being the
# read.csv() default in R 4.0.
lol <- read.csv("LOLgamedata.csv", header = TRUE, stringsAsFactors = TRUE)

# View(lol)

# Convert the outcome and the "first objective" indicator columns to factors.
factor_cols <- c(
  "winner", "firstBlood", "firstTower", "firstInhibitor",
  "firstBaron", "firstDragon", "firstRiftHerald"
)
lol[factor_cols] <- lapply(lol[factor_cols], as.factor)
```

<br>
<br>
**新增變數**
```{r}
# Objective-count differences between the two teams.
# Every gap is computed as team 1 (t1) minus team 2 (t2).

# Difference in towers destroyed.
lol[["tower_gap"]] <- lol[["t1_towerKills"]] - lol[["t2_towerKills"]]
# lol_train$tower_gap <- (lol_train$t1_towerKills - lol_train$t2_towerKills)
# summary(glm(winner ~ lol_train$tower_gap, data= lol_train, family = "binomial"))

# Difference in dragons killed.
lol[["dragon_gap"]] <- lol[["t1_dragonKills"]] - lol[["t2_dragonKills"]]

# Difference in barons killed.
lol[["baron_gap"]] <- lol[["t1_baronKills"]] - lol[["t2_baronKills"]]

# Difference in inhibitors destroyed.
lol[["inhibitorKills_gap"]] <- lol[["t1_inhibitorKills"]] - lol[["t2_inhibitorKills"]]
```

<br>
<br>
**邏輯式迴歸(glm)**
```{r}
# Split the data into a training set (lol_train, 80%) and a test set
# (lol_test, the remaining rows).
set.seed(2018)
train_idx <- sample(seq_len(nrow(lol)), size = 0.8 * nrow(lol), replace = FALSE)
lol_train <- lol[train_idx, ]
lol_test <- lol[-train_idx, ]

# TRUE where the test-set winner is not the first factor level. With a factor
# response, a binomial glm() treats the first level as "failure", so the
# predicted probabilities below refer to this event.
actual_positive <- lol_test$winner != levels(lol_test$winner)[1]

# Baseline model ("common sense" predictors): fitted once, then summarised
# and used for prediction.
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald, data = lol_train, family = "binomial")
summary(model1)
result <- predict(model1, newdata = lol_test, type = "response")
# Test-set accuracy at a 0.5 cutoff. Comparing element-wise stays correct
# even when every prediction falls on the same side of the cutoff, where a
# confusion table would no longer be 2 x 2.
mean((result > 0.5) == actual_positive, na.rm = TRUE)         # ACC = 0.9044324

# Baseline predictors plus tower_gap.
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap, data = lol_train, family = "binomial")
result <- predict(model1, newdata = lol_test, type = "response")
mean((result > 0.5) == actual_positive, na.rm = TRUE)         # ACC = 0.9736262

# tower_gap with a reduced predictor set: firstBlood, firstBaron and
# firstDragon are left out (described by the authors as not significant).
model1 <- glm(winner ~ firstTower + firstInhibitor + firstRiftHerald + tower_gap, data = lol_train, family = "binomial")
result <- predict(model1, newdata = lol_test, type = "response")
mean((result > 0.5) == actual_positive, na.rm = TRUE)         # ACC = 0.9731247

# Full model: baseline predictors plus tower_gap, dragon_gap, baron_gap,
# inhibitorKills_gap and gameDuration. model1 and result from this fit are
# the ones left in the session after this chunk.
model1 <- glm(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap + dragon_gap + baron_gap + inhibitorKills_gap + gameDuration, data = lol_train, family = "binomial")
result <- predict(model1, newdata = lol_test, type = "response")
mean((result > 0.5) == actual_positive, na.rm = TRUE)         # ACC = 0.9742278

summary(model1)
```

<br>
<br>
**邏輯式迴歸ROC(ROC curve of Logistic Regression)**
```{r}
# ROC analysis of the logistic regression predictions held in `result`
# against the test-set winner.
pred <- prediction(result, lol_test$winner)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, "auc")

# Draw the ROC curve and annotate it with the AUC.
# The x axis is the false positive rate, which equals 1 - specificity
# (not specificity itself), so the axis label says so.
plot(perf, main = "ROC curve(Logistic Regression)", xlab = "1 - Specificity(FPR)", ylab = "Sensitivity(TPR)")
abline(0, 1)
text(0.5, 0.5, as.character(auc@y.values[[1]]))           # AUC = 0.9966
```

<br>
<br>
**邏輯式迴歸交叉驗證(Cross-Validation(CV) of Logistic Regression)**
```{r}
# Get the k-fold cross-validated confusion matrix for a logistic regression.
#
# f:      model formula; the first variable in it is taken as the response.
# d:      data frame holding every variable used in f.
# k:      number of folds (a single whole number >= 2).
# cutoff: probability threshold (intended range 0-1). A hold-out row is
#         labelled "Yes" when its predicted probability is strictly greater
#         than cutoff, otherwise "No".
#
# Returns a table with the predicted class ("No"/"Yes") in rows and the
# actual response values in columns, pooled over all k hold-out folds.
k_fold_CV_logit <- function(f, d, k, cutoff) {
  stopifnot(
    is.numeric(k), length(k) == 1L, !is.na(k), k >= 2, k == round(k),
    is.numeric(cutoff), length(cutoff) == 1L, !is.na(cutoff)
  )
  num_of_rec <- nrow(d)
  response_var <- all.vars(f)[1]
  # One random permutation of the fold labels, recycled down the rows.
  fold_id <- rep_len(sample.int(k), num_of_rec)
  # Fit and score one fold at a time so only one glm object is alive at once.
  fold_results <- lapply(seq_len(k), function(fold) {
    is_holdout <- fold_id == fold
    fit <- glm(f, d[!is_holdout, ], family = "binomial")
    predicted_prob <- predict(fit, d[is_holdout, ], type = "response")
    data.frame(
      "predictedClass" = ifelse(predicted_prob > cutoff, 1, 0),
      "actualClass" = d[is_holdout, response_var]
    )
  })
  # Pool the per-fold predictions into a single data frame.
  output_df <- do.call(rbind, fold_results)
  output_df$predictedClass <- factor(output_df$predictedClass,
                                     levels = c(0, 1), labels = c("No", "Yes"))
  table(output_df$predictedClass, output_df$actualClass)
}

# Cutoffs to evaluate, in descending order.
cv_cutoffs <- c(0.9, 0.8, 0.7, 0.6, 0.5, 0.45, 0.4, 0.3, 0.2, 0.1)

# 10-fold cross-validated confusion matrix of the baseline model on the
# training rows, one per cutoff; the result prints as an unnamed list in
# cutoff order.
# NOTE(review): the original call carried a trailing "# 0.5" note, presumably
# the cutoff that was finally chosen -- confirm.
lapply(cv_cutoffs, function(cutoff) {
  k_fold_CV_logit(
    winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald,
    lol[train_idx, ], 10, cutoff
  )
})
```


<br>
<br>
**隨機森林(randomForest)**

+ 試藉由組員自身遊戲經驗所找出之變數跑randomForest
    
    + 變數：firstBlood, firstTower, firstInhibitor, firstBaron, firstDragon, firstRiftHerald（訓練資料：lol_train）
    
    + ntree = 500

+ 透過importance()來找出較重要之變數

    + 發現firstBlood變數重要度最低，故嘗試移除firstBlood變數，再次進行建模
    
    + 重新建模後，發現透過importance()所挑出的變數所計算出的結果並沒有比較好
    
+ 確定模型建置
```{r}
# Random forest on the six "first objective" predictors chosen from the
# team members' own game experience.
# The seed is set once here; the two later forests in this chunk continue
# from whatever RNG state the previous calls leave behind.
set.seed(2018)
lol_tree <- randomForest(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald, lol_train, ntree = 500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree
    # Recorded output (NOTE(review): appears to come from an earlier run;
    # the exact counts differ between runs):
    # Confusion matrix:
    #       1     2 class.error
    # 1 18345  1777  0.08831130
    # 2  1973 17789  0.09983807

# Use importance() to see which predictors matter most.
importance(lol_tree)
    # Recorded output (NOTE(review): this table has no firstBlood row although
    # the model above includes firstBlood, so it presumably belongs to an
    # earlier five-predictor model -- re-run and refresh):
    #                 MeanDecreaseGini
    # firstTower            1059.24050
    # firstInhibitor        9253.06155
    # firstBaron            2451.38878
    # firstDragon            745.84541
    # firstRiftHerald         91.45326 (lowest)

# Refit with a reduced predictor set. The formula below drops firstBlood and
# keeps firstRiftHerald (the original comment said firstRiftHerald was the
# variable removed, which does not match the formula).
lol_tree <- randomForest(winner ~ firstTower+firstInhibitor+firstBaron+firstDragon+firstRiftHerald,lol_train, ntree=500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree
    # Recorded output (NOTE(review): appears to come from an earlier run):
    # Confusion matrix:
    #       1     2 class.error
    # 1 18322  1800  0.08945433
    # 2  1966 17796  0.09948386

# Final forest: the six baseline predictors plus tower_gap, dragon_gap,
# baron_gap, inhibitorKills_gap and gameDuration. lol_tree and result_tree
# from this fit are the ones left in the session after this chunk.
lol_tree <- randomForest(winner ~ firstBlood + firstTower + firstInhibitor + firstBaron + firstDragon + firstRiftHerald + tower_gap + dragon_gap + baron_gap + inhibitorKills_gap + gameDuration,lol_train, ntree = 500)
result_tree <- predict(lol_tree,newdata = lol_test)
lol_tree
# summary() on the fitted object lists its components (length/class/mode),
# not model statistics.
summary(lol_tree)
```

<br>
<br>
**隨機森林ROC(ROC curve of Random Forest)**
```{r}
# ROC curve and AUC of the random forest on the test set.
rf.pred <- predict(lol_tree, lol_test, type = "prob")
# Column 2 of the class-probability matrix is used as the score.
rf.roc <- prediction(rf.pred[, 2], lol_test$winner)
# NOTE: despite its name, rf.auc holds the ROC curve (TPR against FPR), not
# the AUC. The name is kept because the comparison chunk plots this object.
rf.auc <- performance(rf.roc, "tpr", "fpr")
# rf.auc
plot(rf.auc)
abline(0, 1)
# The AUC has to be requested as its own measure; rf.auc@y.values[[1]] is the
# vector of TPR values and would not give the AUC.
text(0.5, 0.5, as.character(performance(rf.roc, "auc")@y.values[[1]]))
```

<br>
<br>
**邏輯式迴歸與隨機森林在ROC曲線與AUC之比較**
```{r} 
# Side-by-side comparison of the logistic regression and random forest ROC
# curves, each annotated with its AUC.
layout(matrix(c(1, 2), 1, 2, byrow = FALSE))

# Logistic regression (left panel).
plot(perf, main = "ROC curve (Logistic Regression)", xlab = "False positive rate", ylab = "True positive rate")
abline(0, 1)
# `auc` is the object computed in the logistic ROC chunk, where the recorded
# value was 0.9966.
text(0.5, 0.5, as.character(auc@y.values[[1]]))

# Random forest (right panel). rf.auc holds the ROC curve; the AUC itself is
# computed here from rf.roc.
plot(rf.auc, main = "ROC curve (Random Forest)")
abline(0, 1)
text(0.5, 0.5, as.character(performance(rf.roc, "auc")@y.values[[1]]))

# Restore the single-panel layout for any later plots.
layout(1)
```

<br>
<br>
**計算平均平方誤差MSE(Mean Square Error)**
```{r}
# Mean squared error of the predicted probabilities in `result` on the test
# set. winner is a factor, so it is first turned into a 0/1 indicator
# (1 = not the first factor level, the event a binomial glm() models);
# subtracting the factor directly is not meaningful.
actual01 <- as.numeric(lol_test$winner != levels(lol_test$winner)[1])
mean((result - actual01)^2)

dim(lol_test)       #  9972    75
dim(lol_train)      # 39884    75
summary(model1)
```

<br>
<br>
**【敘述統計】**
**欲瞭解當獲勝方為藍方時，閃現(flash)放在D鍵與F鍵的比例**
```{r}
# Exploratory single-predictor models: firstDragon (with and without the
# rows where firstDragon is 0) and the first player's first summoner spell.
summary(glm(winner ~ firstDragon, data= lol_train, family = "binomial"))
test1 <- subset(lol_train,lol_train$firstDragon!=0)
summary(glm(test1$winner ~ as.factor(test1$firstDragon),family = "binomial"))

summary(glm(lol_train$winner ~ lol_train$t1_champ1_sum1,family = "binomial"))

# winner plus the first summoner spell (sum1) of the five t1 players (a) and
# of the five t2 players (b), selected by column position.
a <- lol[, c(5, 13, 16, 19, 22, 25)]
summary(a)
b <- lol[, c(5, 38, 41, 44, 47, 50)]
summary(b)

# Restrict both to the games won by team 1 (blue side). These were named
# `c` and `d`; renamed so that base::c is not shadowed.
blue_win_t1 <- a[a$winner == "1", ]
blue_win_t2 <- b[b$winner == "1", ]
summary(blue_win_t1)
summary(blue_win_t2)

# Second summoner spell (sum2) of the five t1 players in blue-side wins.
# NOTE(review): assumes sum1 is the D-key slot and sum2 the F-key slot, and
# that the columns are named t1_champ<N>_sum2 -- confirm against the data.
# The earlier version took the "F key" count from the t2 sum1 columns, which
# measures the red team's first slot rather than the blue team's second.
sum2_cols <- grep("^t1_champ[1-5]_sum2$", names(lol), value = TRUE)
stopifnot(length(sum2_cols) == 5L)
blue_win_sum2 <- lol[lol$winner == "1", sum2_cols]

# Count Flash occurrences from the data instead of copying the numbers by
# hand from the printed summaries.
count_flash <- function(spell_df) {
  sum(vapply(spell_df, function(spell) sum(spell == "Flash"), numeric(1)))
}
s1 <- count_flash(blue_win_t1[-1])
s2 <- count_flash(blue_win_sum2)
n_slots <- nrow(blue_win_t1) * 5

s1 / n_slots                                      # share of Flash in slot 1 (D key) for blue-side winners  # 0.5437071
s2 / n_slots                                      # share of Flash in slot 2 (F key) for blue-side winners
```

```{r}

```

