資料整理

讀取所需要的變項

DT <-read.csv("D:/104/ML_R/WOW_data.csv", 
                header=TRUE, sep=",")
DT <- DT[c(13,4,9,12,14,15,16)]

載入所需套件

library(plyr)
library(plotly)
library(ggplot2)
library(reshape2)
library(rpart)
library(rpart.plot)

設定高低分組、男女、有無等二分變項

DT$gender <- mapvalues(DT$gender,from = c(0,1),to = c("female", "male"))
DT$eco_3  <- mapvalues(DT$eco_3,from = c(0,1),to = c("No","Yes"))
DT$reading_3 = ifelse(DT$reading_3 >= 499.3, "High", "Low")
DT$math_3 = ifelse(DT$math_3 >= 504.9, "High", "Low")
DT$WA_S_3 = ifelse(DT$WA_S_3 >=3.47, "High", "Low")
DT$CO_S_3 = ifelse(DT$CO_S_3 >=1.86, "High", "Low")
DT$eng_3 = ifelse(DT$eng_3 >=2.8, "High", "Low")
names(DT)<-c("Math","Gender","Eco-dis",
              "Reading","Warth","Conflict","Eng ")
head(DT)

##   Math Gender Eco-dis Reading Warth Conflict Eng 
## 1  Low female      No     Low  High      Low  Low
## 2 High female     Yes     Low  High      Low High
## 3 High female      No    High  High      Low  Low
## 4  Low   male     Yes     Low   Low     High  Low
## 5 High female      No    High   Low      Low High
## 6  Low female     Yes     Low  High      Low High

計算每個變項各水準次數

t(apply(DT,  2, table))

##          [,1] [,2]
## Math       98   95
## Gender     89  104
## Eco-dis    72  121
## Reading   105   88
## Warth     105   88
## Conflict   77  116
## Eng        99   94

計算每個變項各水準百分比

show(DTp <- prop.table(t(apply(DT,  2, table)), 1))

##               [,1]      [,2]
## Math     0.5077720 0.4922280
## Gender   0.4611399 0.5388601
## Eco-dis  0.3730570 0.6269430
## Reading  0.5440415 0.4559585
## Warth    0.5440415 0.4559585
## Conflict 0.3989637 0.6010363
## Eng      0.5129534 0.4870466

DTp<-round(DTp,digits = 2)
DTp

##          [,1] [,2]
## Math     0.51 0.49
## Gender   0.46 0.54
## Eco-dis  0.37 0.63
## Reading  0.54 0.46
## Warth    0.54 0.46
## Conflict 0.40 0.60
## Eng      0.51 0.49

資料重新排列，換成百分比

type<-c("Math","Gender","Eco-dis",
         "Reading","Warth","Conflict","Eng ")
DTp<-as.data.frame(DTp)
DTp1<- cbind(type,DTp$V1,DTp$V2)
DTp1<-as.data.frame(DTp1)
DTp1$V2<-as.numeric(as.character(DTp1$V2))
DTp1$V3<-as.numeric(as.character(DTp1$V3))
DTp1$V2<-DTp1$V2*100
DTp1$V3<-DTp1$V3*100
View(DTp1)

繪圖

p1 <- plot_ly(DTp1, x = ~V2, y = ~type, type = 'bar', orientation = 'h', name = 'Female_No_High',
             marker = list(color = 'rgba(246, 78, 139, 0.6)',
                           line = list(color = 'rgba(246, 78, 139, 1.0)',
                                       width = 3))) %>%
  add_trace(x = ~V3, name = 'Male_Yes_Low',
            marker = list(color = 'rgba(58, 71, 80, 0.6)',
                          line = list(color = 'rgba(58, 71, 80, 1.0)',
                                      width = 3))) %>%
  layout(barmode = 'stack',
         title = "Variables Percentage",
         xaxis = list(title = "Percentage"),
         yaxis = list(title =""))
p1

數學(High)與其它變項之百分比

數學和每個變項作卡方檢定

with(DT, sapply(2:7, function(x){c(names(DT[,2:7])[x-1],
            round(chisq.test(table(DT[,1], DT[,x]))$p.val,6))}))

##      [,1]       [,2]       [,3]      [,4]       [,5]       [,6]      
## [1,] "Gender"   "Eco-dis"  "Reading" "Warth"    "Conflict" "Eng "    
## [2,] "0.625107" "0.001125" "0"       "0.023852" "0.290038" "0.722983"

計算每個變項各類別的Low、High數學成績比率

DT_p <- with(DT, sapply(2:7, function(x){
  prop.table(table(DT[, x], DT[, 1]), 1) } ))

只取數學成績High比率

DT_p <- t( DT_p[1:2, ])
DT_p

##           [,1]      [,2]
## [1,] 0.4831461 0.5288462
## [2,] 0.6666667 0.4132231
## [3,] 0.7047619 0.2727273
## [4,] 0.4285714 0.6022727
## [5,] 0.4545455 0.5431034
## [6,] 0.5252525 0.4893617

放進變項名稱

rownames(DT_p) <- names(DT[, -1])
DT_p

##               [,1]      [,2]
## Gender   0.4831461 0.5288462
## Eco-dis  0.6666667 0.4132231
## Reading  0.7047619 0.2727273
## Warth    0.4285714 0.6022727
## Conflict 0.4545455 0.5431034
## Eng      0.5252525 0.4893617

比率排列

d_p <- DT_p[order(abs(DT_p[, 1] - DT_p[, 2])), ]
head(d_p)

##               [,1]      [,2]
## Eng      0.5252525 0.4893617
## Gender   0.4831461 0.5288462
## Conflict 0.4545455 0.5431034
## Warth    0.4285714 0.6022727
## Eco-dis  0.6666667 0.4132231
## Reading  0.7047619 0.2727273

改變資料排列並命名

d_p <- melt(d_p)
names(d_p) <- c('Variables', 'Type', 'Percentage')
d_p$Percentage<-round(d_p$Percentage*100,digits = 2)
d_p$Type <- mapvalues(d_p$Type,
                       from = c(1,2),
                       to = c('Female_No_High', 'Male_Yes_Low'))
head(d_p)

##   Variables           Type Percentage
## 1      Eng  Female_No_High      52.53
## 2    Gender Female_No_High      48.31
## 3  Conflict Female_No_High      45.45
## 4     Warth Female_No_High      42.86
## 5   Eco-dis Female_No_High      66.67
## 6   Reading Female_No_High      70.48

繪圖

d_p <- ddply(d_p, .(Variables),
                     transform, pos = cumsum(Percentage) - (0.5 * Percentage))

p2 <- ggplot() + geom_bar(aes(y =Percentage, x = Variables, fill = Type), data = d_p,
                            stat="identity") +
                          scale_fill_brewer(palette = "Set3")+
                          coord_flip()+
                          geom_text(data=d_p, aes(x = Variables, y = pos, 
                          label = paste0(Percentage,"%")),size=4)+
                          ggtitle("High Math precentage ")

ggplotly(p2)

決策樹分析

先把資料區分成 train=0.5, test=0.5

set.seed(201612)
n <- dim(DT)[1]
nh <- sample(1:n, floor(n/2))
DT_trn <- DT[nh, ] #訓練資料
DT_tst <- DT[-nh, ] #測試資料

利用訓練資料建立模型

rslt_trn <- rpart(Math ~ ., data = DT_trn)

繪圖

prp(rslt_trn,           # 模型
    faclen=0,           # 呈現的變數不要縮寫
    fallen.leaves=TRUE, # 讓樹枝以垂直方式呈現
    shadow.col="gray",  # 最下面的節點塗上陰影
    extra=7,
    type=4,main="Low Math Decision Tree")

利用測試資料進行分析

pred <- predict(rslt_trn, newdata=DT_tst, type="class")

看一下預測結果

table(real=DT_tst$Math, predict=pred)

##       predict
## real   High Low
##   High   32  18
##   Low     8  39

計算預測正確率

confus.matrix <- table(real=DT_tst$Math, predict=pred)
sum(diag(confus.matrix))/sum(confus.matrix)

## [1] 0.7319588

Decision Tree Analysis

邱浩恩

2016年12月23日

資料整理

數學(High)與其它變項之百分比

決策樹分析

THE END