20160528Demo

data(iris)
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

#install.packages("rpart")
library(rpart)

## Warning: package 'rpart' was built under R version 3.2.5

fit <- rpart(Species ~Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data=iris)
summary(fit)

## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, data = iris)
##   n= 150 
## 
##     CP nsplit rel error xerror       xstd
## 1 0.50      0      1.00   1.16 0.05127703
## 2 0.44      1      0.50   0.68 0.06096994
## 3 0.01      2      0.06   0.08 0.02751969
## 
## Variable importance
##  Petal.Width Petal.Length Sepal.Length  Sepal.Width 
##           34           31           21           14 
## 
## Node number 1: 150 observations,    complexity param=0.5
##   predicted class=setosa      expected loss=0.6666667  P(node) =1
##     class counts:    50    50    50
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (50 obs) right son=3 (100 obs)
##   Primary splits:
##       Petal.Length < 2.45 to the left,  improve=50.00000, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=50.00000, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=34.16405, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=19.03851, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.00, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.920, adj=0.76, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
## 
## Node number 2: 50 observations
##   predicted class=setosa      expected loss=0  P(node) =0.3333333
##     class counts:    50     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 100 observations,    complexity param=0.44
##   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    50    50
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (54 obs) right son=7 (46 obs)
##   Primary splits:
##       Petal.Width  < 1.75 to the left,  improve=38.969400, (0 missing)
##       Petal.Length < 4.75 to the left,  improve=37.353540, (0 missing)
##       Sepal.Length < 6.15 to the left,  improve=10.686870, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 3.555556, (0 missing)
##   Surrogate splits:
##       Petal.Length < 4.75 to the left,  agree=0.91, adj=0.804, (0 split)
##       Sepal.Length < 6.15 to the left,  agree=0.73, adj=0.413, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.67, adj=0.283, (0 split)
## 
## Node number 6: 54 observations
##   predicted class=versicolor  expected loss=0.09259259  P(node) =0.36
##     class counts:     0    49     5
##    probabilities: 0.000 0.907 0.093 
## 
## Node number 7: 46 observations
##   predicted class=virginica   expected loss=0.02173913  P(node) =0.3066667
##     class counts:     0     1    45
##    probabilities: 0.000 0.022 0.978

plot(fit, margin = 0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col="red")

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
abline(v=2.45, col="red")
abline(h=1.75, col="blue")

newpt = data.frame(Petal.Width = 2.2, Petal.Length = 3.5, Sepal.Length = 2, Sepal.Width = 2)
predict(fit, newpt)

##   setosa versicolor virginica
## 1      0 0.02173913 0.9782609

pred <- predict(fit, iris)
pred <- predict(fit, iris, type="class")

t <- c(1,1,0,1,0)
p <- c(0,1,0,1,1)
table(t,p)

##    p
## t   0 1
##   0 1 1
##   1 1 2

cm <- table(pred, iris$Species)
#install.packages("caret")
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##             
## pred         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

set.seed(123)
sample.int(42,6)

## [1] 13 33 17 35 36  2

#.Random.seed


sample.int(2, 10, replace=TRUE)

##  [1] 2 2 2 1 2 1 2 2 1 2

idx = sample.int(2, nrow(iris), replace=TRUE)
set.seed(123)
idx = sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)

## idx
##   1   2 
## 106  44

trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
fit2 <- rpart(Species ~., data=trainset)
plot(fit2, margin = 0.1)

pred <- predict(fit2, testset[,-5], type= "class")
cm <- table(pred, testset[,5])
confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##             
## pred         setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977

#install.packages("C50")
library(C50)
data(churn)
str(churnTrain)

## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

data(churn)
str(churnTrain)

## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset = churnTrain[ind == 1,]
testset = churnTrain[ind == 2,]

library(rpart)
churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

min(churn.rp$cptable[,"xerror"])

## [1] 0.4707602

pos = which.min(churn.rp$cptable[,"xerror"])

churn.cp = churn.rp$cptable[pos,"CP"]

prune.tree = prune(churn.rp, cp= churn.cp)
plot(prune.tree, margin= 0.1)
text(prune.tree, all=TRUE , use.n=TRUE)

predictions <- predict(churn.rp, testset, type="class")
table(testset$churn, predictions)

##      predictions
##       yes  no
##   yes 100  41
##   no   18 859

library(caret)
confusionMatrix(table(predictions, testset$churn))

## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes 100  18
##         no   41 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.70922         
##             Specificity : 0.97948         
##          Pos Pred Value : 0.84746         
##          Neg Pred Value : 0.95444         
##              Prevalence : 0.13851         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.11591         
##       Balanced Accuracy : 0.84435         
##                                           
##        'Positive' Class : yes             
##

pred <- predict(churn.rp, testset)
testset$churn

##    [1] no  no  no  no  no  yes no  no  no  no  yes no  no  no  no  no  no 
##   [18] no  no  no  yes no  no  no  no  no  yes no  yes no  no  no  no  no 
##   [35] no  no  yes no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##   [52] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes
##   [69] no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  no  no 
##   [86] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes
##  [103] no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  yes
##  [120] yes no  no  no  no  yes no  no  no  no  no  no  yes no  no  no  no 
##  [137] yes yes no  no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [154] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes no 
##  [171] yes no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##  [188] no  no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  yes
##  [205] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [222] no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [239] no  no  no  no  no  no  no  yes no  no  no  no  no  no  no  no  yes
##  [256] no  no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [273] no  yes yes no  no  yes no  no  no  no  yes no  no  no  no  yes no 
##  [290] no  no  no  no  no  yes no  yes no  no  yes no  no  no  no  no  no 
##  [307] yes no  no  no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [324] no  no  no  no  no  no  no  no  yes no  no  no  no  no  yes no  no 
##  [341] no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes
##  [358] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [375] no  yes no  no  no  no  no  no  no  no  no  no  no  no  yes yes no 
##  [392] no  no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes no 
##  [409] no  no  yes no  yes no  no  no  no  no  yes no  yes no  no  no  no 
##  [426] yes no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [443] no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes no  no 
##  [460] no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [477] yes yes yes yes no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [494] no  no  no  no  no  no  no  yes no  no  no  yes no  no  no  no  no 
##  [511] no  no  no  yes no  no  no  no  no  no  no  no  no  no  yes no  yes
##  [528] yes no  no  yes no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [545] no  no  yes yes no  no  no  no  yes no  no  no  no  no  no  no  no 
##  [562] no  no  no  no  no  yes no  yes no  yes yes no  yes no  yes no  no 
##  [579] no  no  yes no  no  yes no  no  no  no  no  no  no  no  no  no  no 
##  [596] no  yes no  no  no  no  yes no  no  no  no  yes no  no  no  no  no 
##  [613] no  no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  no 
##  [630] no  no  no  no  no  yes no  no  no  yes no  yes no  no  no  no  no 
##  [647] no  no  no  yes no  no  no  no  no  no  no  no  no  no  no  no  yes
##  [664] no  yes no  no  no  no  no  no  no  no  no  no  no  yes no  no  yes
##  [681] no  no  yes no  no  no  no  no  no  yes no  no  no  no  no  no  no 
##  [698] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  yes yes
##  [715] no  no  no  no  yes no  no  yes no  no  no  no  yes no  no  yes no 
##  [732] yes no  no  no  no  no  no  no  yes yes no  no  no  no  no  no  no 
##  [749] no  no  yes no  no  no  no  no  no  no  no  no  yes no  no  no  no 
##  [766] no  no  no  no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [783] no  no  no  no  no  no  no  yes no  no  yes no  no  no  no  yes no 
##  [800] no  yes no  no  no  no  no  no  no  no  no  yes no  no  no  no  no 
##  [817] no  no  no  no  no  no  no  no  no  no  yes no  yes no  no  no  yes
##  [834] yes no  yes no  no  no  no  no  no  no  yes no  no  no  no  no  no 
##  [851] no  no  yes no  no  no  no  no  no  yes yes yes no  no  yes no  yes
##  [868] no  no  no  no  no  no  no  no  no  no  no  yes no  no  no  no  no 
##  [885] yes no  yes no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [902] no  no  no  no  yes yes no  no  no  yes no  no  yes no  yes no  no 
##  [919] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [936] no  no  no  no  yes no  no  no  no  no  yes no  yes no  no  no  no 
##  [953] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [970] no  yes no  no  no  no  no  yes no  no  no  yes no  no  no  no  no 
##  [987] no  no  no  no  yes no  no  no  no  no  no  no  no  no  no  no  no 
## [1004] no  no  no  yes no  no  yes no  no  no  no  yes no  no  no 
## Levels: yes no

xary = c()
yary = c()
for(i in seq(0,1,0.1)){
  f <- as.factor(ifelse(pred[,1] > i, 0, 1))
  levels(f) = c("yes", "no")
  tb <- table(f, testset$churn )
  cm <- confusionMatrix(tb)
  y = cm$byClass[1]
  x = 1- cm$byClass[2]
  xary = c(xary, x)
  yary = c(yary, y)
}

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.2.5

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.2.5

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

download.file('https://github.com/ywchiu/rtibame/raw/master/appledaily2.RData', destfile="appledaily2.RData")
load("appledaily2.RData")
apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂'),]
nrow(apple.subset)

## [1] 234

table(apple.subset$category)

## 
## 娛樂 財經 
##  113  121

library(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.2.5

## Loading required package: jiebaRD

## Warning: package 'jiebaRD' was built under R version 3.2.5

mixseg = worker()
apple.seg =lapply(apple.subset$content, function(e)segment(code=e, jiebar=mixseg))
apple.seg[1]

## [[1]]
##   [1] "台灣浩鼎"   "生技"       "股份"       "有限公司"   "4174"      
##   [6] "今"         "15"         "日"         "中午"       "在"        
##  [11] "法務部"     "公司"       "搜索"       "後"         "再度"      
##  [16] "發布"       "聲明"       "該"         "公司"       "自信"      
##  [21] "一切"       "行事"       "合法"       "絕對"       "積極"      
##  [26] "配合"       "調查"       "協助"       "司法"       "儘速"      
##  [31] "釐清"       "真相"       "但"         "也"         "籲請"      
##  [36] "司法"       "調查"       "單位"       "秉持"       "偵查"      
##  [41] "不"         "公開"       "原則"       "勿將"       "公司"      
##  [46] "基於"       "信任"       "政府"       "和"         "司法"      
##  [51] "在"         "搜索"       "中所"       "交付"       "的"        
##  [56] "商業"       "機密"       "隨意"       "對外"       "公開"      
##  [61] "讓"         "公司"       "數十年"     "研發"       "心血"      
##  [66] "付諸流水"   "將"         "嚴重"       "影響"       "及"        
##  [71] "股東權益"   "並使"       "國內"       "生技"       "產業"      
##  [76] "重挫"       "浩"         "鼎"         "指出"       "檢調"      
##  [81] "搜索"       "後"         "攜"         "走"         "大量"      
##  [86] "資料"       "包括"       "公司"       "許多"       "智財"      
##  [91] "技術"       "和"         "研究"       "數據"       "令"        
##  [96] "員工"       "非常"       "耽憂"       "唯恐"       "在"        
## [101] "研究"       "發表"       "前"         "數據"       "外流"      
## [106] "將"         "嚴重"       "影響"       "公司"       "未來"      
## [111] "發展"       "由於"       "特定"       "媒體"       "已"        
## [116] "連續"       "三周"       "獨家"       "報導"       "浩鼎"      
## [121] "相關"       "訊息"       "報導"       "雖不實"     "但"        
## [126] "部分"       "資料"       "均"         "來自"       "主管"      
## [131] "單位"       "要求"       "公司"       "陳"         "交"        
## [136] "之"         "內部資料"   "這些"       "資料"       "從未"      
## [141] "對外"       "提供"       "今天"       "檢調"       "一抵"      
## [146] "搜索"       "現場"       "該"         "媒體"       "相關"      
## [151] "部門"       "即"         "來電"       "查證"       "在"        
## [156] "偵查"       "不"         "公開"       "下"         "特定"      
## [161] "媒體"       "在"         "第一"       "時間"       "即"        
## [166] "知曉"       "令浩鼎"     "為"         "之"         "驚訝"      
## [171] "與"         "耽憂"       "本"         "公司"       "在"        
## [176] "此"         "嚴正聲明"   "本"         "公司"       "已"        
## [181] "獲邀"       "今年"       "六月"       "將"         "前往"      
## [186] "美國"       "臨床"       "腫瘤"       "醫學會"     "年會"      
## [191] "口頭"       "發表"       "OBI"        "822"        "臨床"      
## [196] "試驗"       "計畫"       "解盲"       "報告"       "論文"      
## [201] "依據"       "學術"       "規範"       "在"         "發表"      
## [206] "前"         "相關"       "數據"       "皆"         "不得"      
## [211] "揭露"       "浩"         "鼎"         "也"         "籲請"      
## [216] "媒體"       "尊重"       "司法"       "若因"       "數據"      
## [221] "外流"       "而"         "導致"       "公司"       "及"        
## [226] "投資人"     "權益"       "損失"       "公司"       "將"        
## [231] "依據"       "營業"       "秘密"       "法"         "及"        
## [236] "相關"       "法規"       "對洩"       "密及"       "傳播者"    
## [241] "追訴"       "及"         "求償"       "浩"         "鼎"        
## [246] "也"         "呼籲"       "投資人"     "不用"       "驚慌"      
## [251] "公司"       "營運"       "正常"       "經得起"     "考驗"      
## [256] "公司"       "也"         "在"         "事前"       "即"        
## [261] "告知"       "員工"       "積極"       "配合"       "調查"      
## [266] "惟有"       "儘速"       "將"         "真相"       "查明"      
## [271] "才能"       "還"         "公司"       "清白"       "20160415"  
## [276] "台灣浩鼎生" "技股"       "有限公司"   "4174"       "今天"      
## [281] "發表聲明"   "公司"       "自信"       "無"         "任何"      
## [286] "不法"       "情事"       "願意"       "主動"       "配合"      
## [291] "司法"       "調查"       "也"         "懇請"       "司法"      
## [296] "單位"       "秉持"       "公平正義"   "原則"       "保障"      
## [301] "合法"       "百姓"       "和"         "業者"       "權益"      
## [306] "早日"       "釐清"       "案情"       "還給"       "浩鼎"      
## [311] "公道"       "江"         "俞"         "庭"         "台北"      
## [316] "報導"

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

jieba_tokenizer=function(d){
  unlist(segment(d[[1]],mixseg))
}
space_tokenizer=function(x){
  unlist(strsplit(as.character(x[[1]]),'[[:space:]]+'))
}
doc=VCorpus(VectorSource(apple.seg))
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(dtm)

## [1]   234 10887

ft <- findFreqTerms(dtm, 5)
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer,dictionary =ft)
new.dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)

convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}

dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)

class(dtm.count)

## [1] "matrix"

m<-as.data.frame(dtm.count)
class(m)

## [1] "data.frame"

nrow(m)

## [1] 234

idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]

#install.packages("e1071")
library(e1071)

## Warning: package 'e1071' was built under R version 3.2.5

model <- naiveBayes(trainset , as.factor(traintag) )
pred <- predict(model, testset)
tb <- table(pred, testtag)
confusionMatrix(tb)

## Confusion Matrix and Statistics
## 
##       testtag
## pred   娛樂 財經
##   娛樂   35    2
##   財經    0   29
##                                           
##                Accuracy : 0.9697          
##                  95% CI : (0.8948, 0.9963)
##     No Information Rate : 0.5303          
##     P-Value [Acc > NIR] : 1.147e-15       
##                                           
##                   Kappa : 0.9389          
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9355          
##          Pos Pred Value : 0.9459          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.5303          
##          Detection Rate : 0.5303          
##    Detection Prevalence : 0.5606          
##       Balanced Accuracy : 0.9677          
##                                           
##        'Positive' Class : 娛樂            
##

which(testtag != pred)

## [1] 50 61

apple.subset[idx==2,"title"][which(testtag != pred)]

## [1] "【壹週刊】從貴婦到攤販　看見台灣媳婦的堅毅"
## [2] "HTC 10影片18秒處　網熱議M7藍再現？"

apple.subset[idx==2,"content"][which(testtag != pred)]

## [1] "本內容由台灣壹週刊提供<U+00A0>台灣媳婦林素蓮，原本是旅居巴基斯坦及杜拜的貴婦，2009年她帶著長子林恩勝回台，躲避巴基斯坦的內戰烽火....<U+00A0>「看完整內容請點我」「看更多壹週刊內容請點我」<U+00A0><U+00A0><U+00A0>"                                                                                                                                                                                                                                                                             
## [2] "HTC 10官方影片First Impressions 宏達電新機HTC 10話題不斷，眼尖的網友從官方影片看出，可能將推出天空藍？批踢踢熱議很期待，超愛M7藍，10上藍色應該也很美。<U+00A0>批踢踢網友分享HTC 10官方影片「HTC 10：first Impressions」，18秒的地方，機身標籤貼紙出現「天空藍」的字樣，網友甚至截圖下來，看得更清楚，「有沒有可能是眾所期待的藍色機身呢？」<U+00A0>如果是M7的藍，真的超好看，現在已是絕響，「這藍色不錯，買定了」。（馬婉珍／綜合報導）<U+00A0> 批踢踢網友截圖影片第18秒處，可見「天空藍」字樣。翻攝自HTC上傳YouTube"

testtag[54]

## [1] "娛樂"

預測新聞類別

library(jiebaR)
library(tm)
library(e1071)
apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂', '政治', '體育', '國際'),]
mixseg = worker()
apple.seg =lapply(apple.subset$content, function(e)segment(code=e, jiebar=mixseg))

jieba_tokenizer=function(d){
  unlist(segment(d[[1]],mixseg))
}
space_tokenizer=function(x){
  unlist(strsplit(as.character(x[[1]]),'[[:space:]]+'))
}
doc=VCorpus(VectorSource(apple.seg))
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)

ft <- findFreqTerms(dtm, 5)
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer,dictionary =ft)
new.dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)


convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}

dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)

m<-as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]

model <- naiveBayes(trainset , as.factor(traintag) )
pred <- predict(model, testset)
tb <- table(pred, testtag)
confusionMatrix(tb)

## Confusion Matrix and Statistics
## 
##       testtag
## pred   政治 娛樂 財經 國際 體育
##   政治   47    1    1    1    0
##   娛樂    0   32    1    0    1
##   財經    1    0   30    2    0
##   國際    3    2    6   84    1
##   體育    0    0    0    0   25
## 
## Overall Statistics
##                                           
##                Accuracy : 0.916           
##                  95% CI : (0.8732, 0.9479)
##     No Information Rate : 0.3655          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8883          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 政治 Class: 娛樂 Class: 財經 Class: 國際
## Sensitivity               0.9216      0.9143      0.7895      0.9655
## Specificity               0.9840      0.9901      0.9850      0.9205
## Pos Pred Value            0.9400      0.9412      0.9091      0.8750
## Neg Pred Value            0.9787      0.9853      0.9610      0.9789
## Prevalence                0.2143      0.1471      0.1597      0.3655
## Detection Rate            0.1975      0.1345      0.1261      0.3529
## Detection Prevalence      0.2101      0.1429      0.1387      0.4034
## Balanced Accuracy         0.9528      0.9522      0.8872      0.9430
##                      Class: 體育
## Sensitivity               0.9259
## Specificity               1.0000
## Pos Pred Value            1.0000
## Neg Pred Value            0.9906
## Prevalence                0.1134
## Detection Rate            0.1050
## Detection Prevalence      0.1050
## Balanced Accuracy         0.9630

library(e1071)
data(iris)
fit = svm(Species ~., data=iris)
pred = predict(fit, iris)
table(pred, iris$Species)

##             
## pred         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

df = data.frame(x = c(1,2,3), y = c(2,3,4))
plot(df$x, df$y)

str(df)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: num  1 2 3
##  $ y: num  2 3 4

df$x = as.factor(df$x)
str(df)

## 'data.frame':    3 obs. of  2 variables:
##  $ x: Factor w/ 3 levels "1","2","3": 1 2 3
##  $ y: num  2 3 4

plot(df$x, df$y)

m = matrix(1:9, nrow=3)
apply(m, MARGIN = 2, sum)

## [1]  6 15 24

apply(m, MARGIN = 1, sum)

## [1] 12 15 18

m = matrix(c(1,2,0,0,0,2,0,1,0), nrow=3)
m

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    2    0    1
## [3,]    0    2    0

apply(m, MARGIN = 2, convert_counts)

##      [,1]  [,2]  [,3] 
## [1,] "Yes" "No"  "No" 
## [2,] "Yes" "No"  "Yes"
## [3,] "No"  "Yes" "No"

20160528Demo

David Chiu

2016年5月28日

預測新聞類別