data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#install.packages("rpart")
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
summary(fit)
## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
## Petal.Width, data = iris)
## n= 150
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.00 1.16 0.05127703
## 2 0.44 1 0.50 0.68 0.06096994
## 3 0.01 2 0.06 0.08 0.02751969
##
## Variable importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 34 31 21 14
##
## Node number 1: 150 observations, complexity param=0.5
## predicted class=setosa expected loss=0.6666667 P(node) =1
## class counts: 50 50 50
## probabilities: 0.333 0.333 0.333
## left son=2 (50 obs) right son=3 (100 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=50.00000, (0 missing)
## Petal.Width < 0.8 to the left, improve=50.00000, (0 missing)
## Sepal.Length < 5.45 to the left, improve=34.16405, (0 missing)
## Sepal.Width < 3.35 to the right, improve=19.03851, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.00, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.920, adj=0.76, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 2: 50 observations
## predicted class=setosa expected loss=0 P(node) =0.3333333
## class counts: 50 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 100 observations, complexity param=0.44
## predicted class=versicolor expected loss=0.5 P(node) =0.6666667
## class counts: 0 50 50
## probabilities: 0.000 0.500 0.500
## left son=6 (54 obs) right son=7 (46 obs)
## Primary splits:
## Petal.Width < 1.75 to the left, improve=38.969400, (0 missing)
## Petal.Length < 4.75 to the left, improve=37.353540, (0 missing)
## Sepal.Length < 6.15 to the left, improve=10.686870, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.555556, (0 missing)
## Surrogate splits:
## Petal.Length < 4.75 to the left, agree=0.91, adj=0.804, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.73, adj=0.413, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.67, adj=0.283, (0 split)
##
## Node number 6: 54 observations
## predicted class=versicolor expected loss=0.09259259 P(node) =0.36
## class counts: 0 49 5
## probabilities: 0.000 0.907 0.093
##
## Node number 7: 46 observations
## predicted class=virginica expected loss=0.02173913 P(node) =0.3066667
## class counts: 0 1 45
## probabilities: 0.000 0.022 0.978
plot(fit, margin = 0.1)
text(fit)
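# If the base-graphics tree is hard to read, rpart can also describe the fit
# in text (both helpers ship with rpart itself):
printcp(fit)   # complexity-parameter table with cross-validated error
print(fit)     # the splits written out as indented rules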

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col="red")

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
abline(v=2.45, col="red")
abline(h=1.75, col="blue")

newpt = data.frame(Petal.Width = 2.2, Petal.Length = 3.5, Sepal.Length = 2, Sepal.Width = 2)
predict(fit, newpt)
## setosa versicolor virginica
## 1 0 0.02173913 0.9782609
pred <- predict(fit, iris)
pred <- predict(fit, iris, type="class")
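# Side note: for a classification tree, predict() defaults to per-class
# probabilities; type = "class" returns hard labels instead
head(predict(fit, iris))                   # probability matrix
head(predict(fit, iris, type = "class"))   # factor of predicted species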
t <- c(1,1,0,1,0)
p <- c(0,1,0,1,1)
table(t,p)
## p
## t 0 1
## 0 1 1
## 1 1 2
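# The diagonal of such a table holds the agreements, so accuracy is the
# diagonal sum over all cases (a quick sketch on the toy vectors above)
sum(diag(table(t, p))) / length(t)   # (1 + 2) / 5 = 0.6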
cm <- table(pred, iris$Species)
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
#.Random.seed
sample.int(2, 10, replace=TRUE)
## [1] 2 2 2 1 2 1 2 2 1 2
idx = sample.int(2, nrow(iris), replace=TRUE)
set.seed(123)
idx = sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
## 1 2
## 106 44
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
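# An alternative, stratified split (a sketch; caret was loaded above):
# createDataPartition samples within each class, so both subsets keep the
# 1/3-per-species balance
in.train <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
strat.train <- iris[in.train, ]
strat.test <- iris[-in.train, ]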
fit2 <- rpart(Species ~ ., data = trainset)
plot(fit2, margin = 0.1)

pred <- predict(fit2, testset[, -5], type = "class")
cm <- table(pred, testset[, 5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977
#install.packages("C50")
library(C50)
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
churnTrain = churnTrain[, !names(churnTrain) %in% c("state", "area_code", "account_length")]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset = churnTrain[ind == 1,]
testset = churnTrain[ind == 2,]
library(rpart)
churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp, all = TRUE, use.n = TRUE)

min(churn.rp$cptable[,"xerror"])
## [1] 0.4707602
pos = which.min(churn.rp$cptable[,"xerror"])
churn.cp = churn.rp$cptable[pos,"CP"]
prune.tree = prune(churn.rp, cp = churn.cp)
plot(prune.tree, margin = 0.1)
text(prune.tree, all = TRUE, use.n = TRUE)
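# Sanity check (a sketch): score the pruned tree on the held-out set to see
# whether pruning cost any accuracy compared with the full tree below
pred.prune <- predict(prune.tree, testset, type = "class")
confusionMatrix(table(pred.prune, testset$churn))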

predictions <- predict(churn.rp, testset, type="class")
table(testset$churn, predictions)
## predictions
## yes no
## yes 100 41
## no 18 859
library(caret)
confusionMatrix(table(predictions, testset$churn))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 100 18
## no 41 859
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7393
## Mcnemar's Test P-Value : 0.004181
##
## Sensitivity : 0.70922
## Specificity : 0.97948
## Pos Pred Value : 0.84746
## Neg Pred Value : 0.95444
## Prevalence : 0.13851
## Detection Rate : 0.09823
## Detection Prevalence : 0.11591
## Balanced Accuracy : 0.84435
##
## 'Positive' Class : yes
##
pred <- predict(churn.rp, testset)
testset$churn
## [1] no no no no no yes no no no no yes no no no no no no
## [18] no no no yes no no no no no yes no yes no no no no no
## [35] no no yes no no no no no no no no no no no yes no no
## [52] no no no no no no no no no no no no no no no no yes
## [69] no no no no no no no no no yes no yes no no no no no
## [86] no no no no no no no no no no no no no no no yes yes
## [103] no no yes no no no no no no no no no no no no no yes
## [120] yes no no no no yes no no no no no no yes no no no no
## [137] yes yes no no no yes no no no no no no no no no no no
## [154] no no no no no no no no no no no no no no no yes no
## [171] yes no no no no no no no no no no no no no yes no no
## [188] no no no no no yes no no no no no no no no no no yes
## [205] no no no no no no no no no no no no no no no no no
## [222] no no yes no no no no no no no no no no no no no no
## [239] no no no no no no no yes no no no no no no no no yes
## [256] no no no no no yes no no no no no no no no no no no
## [273] no yes yes no no yes no no no no yes no no no no yes no
## [290] no no no no no yes no yes no no yes no no no no no no
## [307] yes no no no no no no no no yes no no no no no no no
## [324] no no no no no no no no yes no no no no no yes no no
## [341] no no no no no no no no no no no no no yes no no yes
## [358] no no no no no no no no no no no no no no no no no
## [375] no yes no no no no no no no no no no no no yes yes no
## [392] no no no no no no no no no no no no yes no no yes no
## [409] no no yes no yes no no no no no yes no yes no no no no
## [426] yes no no no no no no no no no yes no no no no no no
## [443] no no no no no no no no no no no no no no yes no no
## [460] no no no no no no no no no yes no no no no no no no
## [477] yes yes yes yes no no no no no no no no no no no no no
## [494] no no no no no no no yes no no no yes no no no no no
## [511] no no no yes no no no no no no no no no no yes no yes
## [528] yes no no yes no yes no no no no no no no no no no no
## [545] no no yes yes no no no no yes no no no no no no no no
## [562] no no no no no yes no yes no yes yes no yes no yes no no
## [579] no no yes no no yes no no no no no no no no no no no
## [596] no yes no no no no yes no no no no yes no no no no no
## [613] no no no no no no no no no no yes no yes no no no no
## [630] no no no no no yes no no no yes no yes no no no no no
## [647] no no no yes no no no no no no no no no no no no yes
## [664] no yes no no no no no no no no no no no yes no no yes
## [681] no no yes no no no no no no yes no no no no no no no
## [698] no no no no no no no no no no no no no no no yes yes
## [715] no no no no yes no no yes no no no no yes no no yes no
## [732] yes no no no no no no no yes yes no no no no no no no
## [749] no no yes no no no no no no no no no yes no no no no
## [766] no no no no no no no no no no yes no no no no no no
## [783] no no no no no no no yes no no yes no no no no yes no
## [800] no yes no no no no no no no no no yes no no no no no
## [817] no no no no no no no no no no yes no yes no no no yes
## [834] yes no yes no no no no no no no yes no no no no no no
## [851] no no yes no no no no no no yes yes yes no no yes no yes
## [868] no no no no no no no no no no no yes no no no no no
## [885] yes no yes no no no no no no no no no no no no no no
## [902] no no no no yes yes no no no yes no no yes no yes no no
## [919] no no no no no no no no no no no no no no no no no
## [936] no no no no yes no no no no no yes no yes no no no no
## [953] no no no no no no no no no no no no no no no no no
## [970] no yes no no no no no yes no no no yes no no no no no
## [987] no no no no yes no no no no no no no no no no no no
## [1004] no no no yes no no yes no no no no yes no no no
## Levels: yes no
xary = c()
yary = c()
for (i in seq(0, 1, 0.1)) {
  # predict "yes" (churn) whenever the churn probability exceeds the cutoff i;
  # building the factor with explicit levels keeps both classes present even
  # when one is empty at the extreme cutoffs
  f <- factor(ifelse(pred[, 1] > i, "yes", "no"), levels = c("yes", "no"))
  tb <- table(f, testset$churn)
  cm <- confusionMatrix(tb)
  y = cm$byClass[1]       # sensitivity (true positive rate)
  x = 1 - cm$byClass[2]   # 1 - specificity (false positive rate)
  xary = c(xary, x)
  yary = c(yary, y)
}
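# Connecting the collected operating points traces a coarse ROC curve; the
# ROCR plot below does the same with finer cutoffs
plot(xary, yary, type = "b", xlab = "1 - Specificity (FPR)",
     ylab = "Sensitivity (TPR)", main = "Manual ROC points")
abline(0, 1, lty = 2)   # reference line for a random classifier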
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, as.factor(testset[, ncol(testset)]))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize = TRUE, main = paste("AUC:", perf.rocr@y.values))
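# The numeric AUC sits in the y.values slot of the performance object
# (a one-element list):
unlist(perf.rocr@y.values)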

download.file('https://github.com/ywchiu/rtibame/raw/master/appledaily2.RData', destfile="appledaily2.RData")
load("appledaily2.RData")
apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂'),]
nrow(apple.subset)
## [1] 234
table(apple.subset$category)
##
## 娛樂 財經
## 113 121
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
mixseg = worker()
apple.seg = lapply(apple.subset$content, function(e) segment(code = e, jiebar = mixseg))
apple.seg[1]
## [[1]]
## [1] "台灣浩鼎" "生技" "股份" "有限公司" "4174"
## [6] "今" "15" "日" "中午" "在"
## [11] "法務部" "公司" "搜索" "後" "再度"
## [16] "發布" "聲明" "該" "公司" "自信"
## [21] "一切" "行事" "合法" "絕對" "積極"
## [26] "配合" "調查" "協助" "司法" "儘速"
## [31] "釐清" "真相" "但" "也" "籲請"
## [36] "司法" "調查" "單位" "秉持" "偵查"
## [41] "不" "公開" "原則" "勿將" "公司"
## [46] "基於" "信任" "政府" "和" "司法"
## [51] "在" "搜索" "中所" "交付" "的"
## [56] "商業" "機密" "隨意" "對外" "公開"
## [61] "讓" "公司" "數十年" "研發" "心血"
## [66] "付諸流水" "將" "嚴重" "影響" "及"
## [71] "股東權益" "並使" "國內" "生技" "產業"
## [76] "重挫" "浩" "鼎" "指出" "檢調"
## [81] "搜索" "後" "攜" "走" "大量"
## [86] "資料" "包括" "公司" "許多" "智財"
## [91] "技術" "和" "研究" "數據" "令"
## [96] "員工" "非常" "耽憂" "唯恐" "在"
## [101] "研究" "發表" "前" "數據" "外流"
## [106] "將" "嚴重" "影響" "公司" "未來"
## [111] "發展" "由於" "特定" "媒體" "已"
## [116] "連續" "三周" "獨家" "報導" "浩鼎"
## [121] "相關" "訊息" "報導" "雖不實" "但"
## [126] "部分" "資料" "均" "來自" "主管"
## [131] "單位" "要求" "公司" "陳" "交"
## [136] "之" "內部資料" "這些" "資料" "從未"
## [141] "對外" "提供" "今天" "檢調" "一抵"
## [146] "搜索" "現場" "該" "媒體" "相關"
## [151] "部門" "即" "來電" "查證" "在"
## [156] "偵查" "不" "公開" "下" "特定"
## [161] "媒體" "在" "第一" "時間" "即"
## [166] "知曉" "令浩鼎" "為" "之" "驚訝"
## [171] "與" "耽憂" "本" "公司" "在"
## [176] "此" "嚴正聲明" "本" "公司" "已"
## [181] "獲邀" "今年" "六月" "將" "前往"
## [186] "美國" "臨床" "腫瘤" "醫學會" "年會"
## [191] "口頭" "發表" "OBI" "822" "臨床"
## [196] "試驗" "計畫" "解盲" "報告" "論文"
## [201] "依據" "學術" "規範" "在" "發表"
## [206] "前" "相關" "數據" "皆" "不得"
## [211] "揭露" "浩" "鼎" "也" "籲請"
## [216] "媒體" "尊重" "司法" "若因" "數據"
## [221] "外流" "而" "導致" "公司" "及"
## [226] "投資人" "權益" "損失" "公司" "將"
## [231] "依據" "營業" "秘密" "法" "及"
## [236] "相關" "法規" "對洩" "密及" "傳播者"
## [241] "追訴" "及" "求償" "浩" "鼎"
## [246] "也" "呼籲" "投資人" "不用" "驚慌"
## [251] "公司" "營運" "正常" "經得起" "考驗"
## [256] "公司" "也" "在" "事前" "即"
## [261] "告知" "員工" "積極" "配合" "調查"
## [266] "惟有" "儘速" "將" "真相" "查明"
## [271] "才能" "還" "公司" "清白" "20160415"
## [276] "台灣浩鼎生" "技股" "有限公司" "4174" "今天"
## [281] "發表聲明" "公司" "自信" "無" "任何"
## [286] "不法" "情事" "願意" "主動" "配合"
## [291] "司法" "調查" "也" "懇請" "司法"
## [296] "單位" "秉持" "公平正義" "原則" "保障"
## [301] "合法" "百姓" "和" "業者" "權益"
## [306] "早日" "釐清" "案情" "還給" "浩鼎"
## [311] "公道" "江" "俞" "庭" "台北"
## [316] "報導"
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# tokenizer that re-segments each document with the jiebaR worker
jieba_tokenizer = function(d) {
  unlist(segment(d[[1]], mixseg))
}
# tokenizer that splits a pre-segmented string on whitespace
space_tokenizer = function(x) {
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
doc = VCorpus(VectorSource(apple.seg))
doc = unlist(tm_map(doc, jieba_tokenizer), recursive = FALSE)
doc = lapply(doc, function(d) paste(d, collapse = ' '))
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
dim(dtm)
## [1] 234 10887
ft <- findFreqTerms(dtm, 5)
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer, dictionary = ft)
new.dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
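# Quick check (a sketch): how much the frequency filter shrank the vocabulary
length(ft)     # terms occurring at least 5 times corpus-wide
dim(new.dtm)   # same 234 documents, typically far fewer columns than the full dtm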
# collapse term counts to binary "No"/"Yes" presence factors, the input
# naive Bayes expects for categorical features
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
class(dtm.count)
## [1] "matrix"
m <- as.data.frame(dtm.count)
class(m)
## [1] "data.frame"
nrow(m)
## [1] 234
idx <- sample.int(2, nrow(m), replace = TRUE, prob = c(0.7, 0.3))
trainset <- m[idx == 1, ]
testset <- m[idx == 2, ]
traintag <- apple.subset[idx == 1, "category"]
testtag <- apple.subset[idx == 2, "category"]
#install.packages("e1071")
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
tb <- table(pred, testtag)
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## testtag
## pred 娛樂 財經
## 娛樂 35 2
## 財經 0 29
##
## Accuracy : 0.9697
## 95% CI : (0.8948, 0.9963)
## No Information Rate : 0.5303
## P-Value [Acc > NIR] : 1.147e-15
##
## Kappa : 0.9389
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9355
## Pos Pred Value : 0.9459
## Neg Pred Value : 1.0000
## Prevalence : 0.5303
## Detection Rate : 0.5303
## Detection Prevalence : 0.5606
## Balanced Accuracy : 0.9677
##
## 'Positive' Class : 娛樂
##
which(testtag != pred)
## [1] 50 61
apple.subset[idx==2,"title"][which(testtag != pred)]
## [1] "【壹週刊】從貴婦到攤販 看見台灣媳婦的堅毅"
## [2] "HTC 10影片18秒處 網熱議M7藍再現?"
apple.subset[idx==2,"content"][which(testtag != pred)]
## [1] "本內容由台灣壹週刊提供<U+00A0>台灣媳婦林素蓮,原本是旅居巴基斯坦及杜拜的貴婦,2009年她帶著長子林恩勝回台,躲避巴基斯坦的內戰烽火....<U+00A0>「看完整內容請點我」「看更多壹週刊內容請點我」<U+00A0><U+00A0><U+00A0>"
## [2] "HTC 10官方影片First Impressions 宏達電新機HTC 10話題不斷,眼尖的網友從官方影片看出,可能將推出天空藍?批踢踢熱議很期待,超愛M7藍,10上藍色應該也很美。<U+00A0>批踢踢網友分享HTC 10官方影片「HTC 10:first Impressions」,18秒的地方,機身標籤貼紙出現「天空藍」的字樣,網友甚至截圖下來,看得更清楚,「有沒有可能是眾所期待的藍色機身呢?」<U+00A0>如果是M7的藍,真的超好看,現在已是絕響,「這藍色不錯,買定了」。(馬婉珍/綜合報導)<U+00A0> 批踢踢網友截圖影片第18秒處,可見「天空藍」字樣。翻攝自HTC上傳YouTube"
testtag[54]
## [1] "娛樂"
# Predicting news categories
library(jiebaR)
library(tm)
library(e1071)
apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂', '政治', '體育', '國際'),]
mixseg = worker()
apple.seg = lapply(apple.subset$content, function(e) segment(code = e, jiebar = mixseg))
jieba_tokenizer = function(d) {
  unlist(segment(d[[1]], mixseg))
}
space_tokenizer = function(x) {
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
doc = VCorpus(VectorSource(apple.seg))
doc = unlist(tm_map(doc, jieba_tokenizer), recursive = FALSE)
doc = lapply(doc, function(d) paste(d, collapse = ' '))
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
ft <- findFreqTerms(dtm, 5)
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer, dictionary = ft)
new.dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace = TRUE, prob = c(0.7, 0.3))
trainset <- m[idx == 1, ]
testset <- m[idx == 2, ]
traintag <- apple.subset[idx == 1, "category"]
testtag <- apple.subset[idx == 2, "category"]
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
tb <- table(pred, testtag)
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## testtag
## pred 政治 娛樂 財經 國際 體育
## 政治 47 1 1 1 0
## 娛樂 0 32 1 0 1
## 財經 1 0 30 2 0
## 國際 3 2 6 84 1
## 體育 0 0 0 0 25
##
## Overall Statistics
##
## Accuracy : 0.916
## 95% CI : (0.8732, 0.9479)
## No Information Rate : 0.3655
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8883
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 政治 Class: 娛樂 Class: 財經 Class: 國際
## Sensitivity 0.9216 0.9143 0.7895 0.9655
## Specificity 0.9840 0.9901 0.9850 0.9205
## Pos Pred Value 0.9400 0.9412 0.9091 0.8750
## Neg Pred Value 0.9787 0.9853 0.9610 0.9789
## Prevalence 0.2143 0.1471 0.1597 0.3655
## Detection Rate 0.1975 0.1345 0.1261 0.3529
## Detection Prevalence 0.2101 0.1429 0.1387 0.4034
## Balanced Accuracy 0.9528 0.9522 0.8872 0.9430
## Class: 體育
## Sensitivity 0.9259
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9906
## Prevalence 0.1134
## Detection Rate 0.1050
## Detection Prevalence 0.1050
## Balanced Accuracy 0.9630
library(e1071)
data(iris)
fit = svm(Species ~ ., data = iris)
pred = predict(fit, iris)
table(pred, iris$Species)
##
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
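# Resubstitution accuracy from the confusion table (a sketch; this is measured
# on the training data, so it is optimistic)
svm.tb <- table(pred, iris$Species)
sum(diag(svm.tb)) / sum(svm.tb)   # (50 + 48 + 48) / 150 = 0.9733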
df = data.frame(x = c(1,2,3), y = c(2,3,4))
plot(df$x, df$y)

str(df)
## 'data.frame': 3 obs. of 2 variables:
## $ x: num 1 2 3
## $ y: num 2 3 4
df$x = as.factor(df$x)
str(df)
## 'data.frame': 3 obs. of 2 variables:
## $ x: Factor w/ 3 levels "1","2","3": 1 2 3
## $ y: num 2 3 4
plot(df$x, df$y)
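# Note: plot() dispatches on the class of x. The numeric x above gave a
# scatterplot; after as.factor(), the same call draws per-level boxplots of y.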

m = matrix(1:9, nrow=3)
apply(m, MARGIN = 2, sum)
## [1] 6 15 24
apply(m, MARGIN = 1, sum)
## [1] 12 15 18
m = matrix(c(1,2,0,0,0,2,0,1,0), nrow=3)
m
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 2 0 1
## [3,] 0 2 0
apply(m, MARGIN = 2, convert_counts)
## [,1] [,2] [,3]
## [1,] "Yes" "No" "No"
## [2,] "Yes" "No" "Yes"
## [3,] "No" "Yes" "No"