讀取所需要的變項
DT <-read.csv("D:/104/ML_R/WOW_data.csv",
header=TRUE, sep=",")
DT <- DT[c(13,4,9,12,14,15,16)]
載入所需套件
library(plyr)
library(plotly)
library(ggplot2)
library(reshape2)
library(rpart)
library(rpart.plot)
設定高低分組、男女、有無等二分變項
DT$gender <- mapvalues(DT$gender,from = c(0,1),to = c("female", "male"))
DT$eco_3 <- mapvalues(DT$eco_3,from = c(0,1),to = c("No","Yes"))
DT$reading_3 = ifelse(DT$reading_3 >= 499.3, "High", "Low")
DT$math_3 = ifelse(DT$math_3 >= 504.9, "High", "Low")
DT$WA_S_3 = ifelse(DT$WA_S_3 >=3.47, "High", "Low")
DT$CO_S_3 = ifelse(DT$CO_S_3 >=1.86, "High", "Low")
DT$eng_3 = ifelse(DT$eng_3 >=2.8, "High", "Low")
names(DT)<-c("Math","Gender","Eco-dis",
"Reading","Warth","Conflict","Eng ")
head(DT)
## Math Gender Eco-dis Reading Warth Conflict Eng
## 1 Low female No Low High Low Low
## 2 High female Yes Low High Low High
## 3 High female No High High Low Low
## 4 Low male Yes Low Low High Low
## 5 High female No High Low Low High
## 6 Low female Yes Low High Low High
計算每個變項各水準次數
t(apply(DT, 2, table))
## [,1] [,2]
## Math 98 95
## Gender 89 104
## Eco-dis 72 121
## Reading 105 88
## Warth 105 88
## Conflict 77 116
## Eng 99 94
計算每個變項各水準百分比
show(DTp <- prop.table(t(apply(DT, 2, table)), 1))
## [,1] [,2]
## Math 0.5077720 0.4922280
## Gender 0.4611399 0.5388601
## Eco-dis 0.3730570 0.6269430
## Reading 0.5440415 0.4559585
## Warth 0.5440415 0.4559585
## Conflict 0.3989637 0.6010363
## Eng 0.5129534 0.4870466
DTp<-round(DTp,digits = 2)
DTp
## [,1] [,2]
## Math 0.51 0.49
## Gender 0.46 0.54
## Eco-dis 0.37 0.63
## Reading 0.54 0.46
## Warth 0.54 0.46
## Conflict 0.40 0.60
## Eng 0.51 0.49
資料重新排列,換成百分比
type<-c("Math","Gender","Eco-dis",
"Reading","Warth","Conflict","Eng ")
DTp<-as.data.frame(DTp)
DTp1<- cbind(type,DTp$V1,DTp$V2)
DTp1<-as.data.frame(DTp1)
DTp1$V2<-as.numeric(as.character(DTp1$V2))
DTp1$V3<-as.numeric(as.character(DTp1$V3))
DTp1$V2<-DTp1$V2*100
DTp1$V3<-DTp1$V3*100
View(DTp1)
繪圖
p1 <- plot_ly(DTp1, x = ~V2, y = ~type, type = 'bar', orientation = 'h', name = 'Female_No_High',
marker = list(color = 'rgba(246, 78, 139, 0.6)',
line = list(color = 'rgba(246, 78, 139, 1.0)',
width = 3))) %>%
add_trace(x = ~V3, name = 'Male_Yes_Low',
marker = list(color = 'rgba(58, 71, 80, 0.6)',
line = list(color = 'rgba(58, 71, 80, 1.0)',
width = 3))) %>%
layout(barmode = 'stack',
title = "Variables Percentage",
xaxis = list(title = "Percentage"),
yaxis = list(title =""))
p1
數學和每個變項作卡方檢定
with(DT, sapply(2:7, function(x){c(names(DT[,2:7])[x-1],
round(chisq.test(table(DT[,1], DT[,x]))$p.val,6))}))
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "Gender" "Eco-dis" "Reading" "Warth" "Conflict" "Eng "
## [2,] "0.625107" "0.001125" "0" "0.023852" "0.290038" "0.722983"
計算每個變項各類別的Low、High數學成績比率
DT_p <- with(DT, sapply(2:7, function(x){
prop.table(table(DT[, x], DT[, 1]), 1) } ))
只取數學成績High比率
DT_p <- t( DT_p[1:2, ])
DT_p
## [,1] [,2]
## [1,] 0.4831461 0.5288462
## [2,] 0.6666667 0.4132231
## [3,] 0.7047619 0.2727273
## [4,] 0.4285714 0.6022727
## [5,] 0.4545455 0.5431034
## [6,] 0.5252525 0.4893617
放進變項名稱
rownames(DT_p) <- names(DT[, -1])
DT_p
## [,1] [,2]
## Gender 0.4831461 0.5288462
## Eco-dis 0.6666667 0.4132231
## Reading 0.7047619 0.2727273
## Warth 0.4285714 0.6022727
## Conflict 0.4545455 0.5431034
## Eng 0.5252525 0.4893617
比率排列
d_p <- DT_p[order(abs(DT_p[, 1] - DT_p[, 2])), ]
head(d_p)
## [,1] [,2]
## Eng 0.5252525 0.4893617
## Gender 0.4831461 0.5288462
## Conflict 0.4545455 0.5431034
## Warth 0.4285714 0.6022727
## Eco-dis 0.6666667 0.4132231
## Reading 0.7047619 0.2727273
改變資料排列並命名
d_p <- melt(d_p)
names(d_p) <- c('Variables', 'Type', 'Percentage')
d_p$Percentage<-round(d_p$Percentage*100,digits = 2)
d_p$Type <- mapvalues(d_p$Type,
from = c(1,2),
to = c('Female_No_High', 'Male_Yes_Low'))
head(d_p)
## Variables Type Percentage
## 1 Eng Female_No_High 52.53
## 2 Gender Female_No_High 48.31
## 3 Conflict Female_No_High 45.45
## 4 Warth Female_No_High 42.86
## 5 Eco-dis Female_No_High 66.67
## 6 Reading Female_No_High 70.48
繪圖
d_p <- ddply(d_p, .(Variables),
transform, pos = cumsum(Percentage) - (0.5 * Percentage))
p2 <- ggplot() + geom_bar(aes(y =Percentage, x = Variables, fill = Type), data = d_p,
stat="identity") +
scale_fill_brewer(palette = "Set3")+
coord_flip()+
geom_text(data=d_p, aes(x = Variables, y = pos,
label = paste0(Percentage,"%")),size=4)+
ggtitle("High Math precentage ")
ggplotly(p2)
先把資料區分成 train=0.5, test=0.5
set.seed(201612)
n <- dim(DT)[1]
nh <- sample(1:n, floor(n/2))
DT_trn <- DT[nh, ] #訓練資料
DT_tst <- DT[-nh, ] #測試資料
利用訓練資料建立模型
rslt_trn <- rpart(Math ~ ., data = DT_trn)
繪圖
prp(rslt_trn, # 模型
faclen=0, # 呈現的變數不要縮寫
fallen.leaves=TRUE, # 讓樹枝以垂直方式呈現
shadow.col="gray", # 最下面的節點塗上陰影
extra=7,
type=4,main="Low Math Decision Tree")
利用測試資料進行分析
pred <- predict(rslt_trn, newdata=DT_tst, type="class")
看一下預測結果
table(real=DT_tst$Math, predict=pred)
## predict
## real High Low
## High 32 18
## Low 8 39
計算預測正確率
confus.matrix <- table(real=DT_tst$Math, predict=pred)
sum(diag(confus.matrix))/sum(confus.matrix)
## [1] 0.7319588