mecab -Odump osaka-u_ja.txt > dump_osaka-u_ja.txt
mecabDump <-read.table("dump_osaka-u_ja.txt", header=FALSE)
head(mecabDump)
## V1 V2
## 1 0 BOS
## 2 9 この
## 3 21 たび
## 4 27 、
## 5 44 大阪
## 6 65 大学
## V3
## 1 BOS/EOS,*,*,*,*,*,*,*,*
## 2 連体詞,*,*,*,*,*,コノ,此の,この,コノ,この,コノ,和,*,*,*,*
## 3 名詞,普通名詞,助数詞可能,*,*,*,タビ,度,たび,タビ,たび,タビ,和,*,*,*,*
## 4 補助記号,読点,*,*,*,*,,、,、,,、,,記号,*,*,*,*
## 5 名詞,固有名詞,地名,一般,*,*,オオサカ,オオサカ,大阪,オーサカ,大阪,オーサカ,固,*,*,*,*
## 6 名詞,普通名詞,一般,*,*,*,ダイガク,大学,大学,ダイガク,大学,ダイガク,漢,*,*,*,*
## V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 0 0 0 0 0 0 2 1 0 0 0 0
## 2 0 6 5979 5979 1 6 0 1 0 0 0 2417
## 3 6 12 5151 5151 1 6 0 1 0 0 0 7609
## 4 12 15 5974 5974 1 3 0 1 0 0 0 5620
## 5 15 21 4792 4792 1 2 0 1 0 0 0 11949
## 6 21 27 5146 5146 1 2 0 1 0 0 0 14305
pos<-c()
for(i in mecabDump[,3]){
tmp<-unlist(strsplit(as.character(i),","))[1]
pos<- append(pos,tmp)
}
pos[1:10]
## [1] "BOS/EOS" "連体詞" "名詞" "補助記号" "名詞" "名詞"
## [7] "接頭辞" "名詞" "名詞" "名詞"
mtx<-data.frame(cbind(mecabDump[2], pos))
dim(mtx)
## [1] 714 2
colnames(mtx) <- c("term","POS")
head(mtx)
## term POS
## 1 BOS BOS/EOS
## 2 この 連体詞
## 3 たび 名詞
## 4 、 補助記号
## 5 大阪 名詞
## 6 大学 名詞
mtx <- mtx[mtx!="BOS" & mtx!="EOS" ,]
tmMtx<-aggregate(x=mtx[c("term")], by=list(mtx$term,mtx$POS), length)
colnames(tmMtx) <- c("単語","POS", "頻度")
tmMtx<-tmMtx[order(tmMtx$頻度, decreasing = TRUE),]
head(tmMtx)
## 単語 POS 頻度
## 260 、 補助記号 37
## 29 の 助詞 36
## 35 を 助詞 27
## 24 て 助詞 25
## 28 に 助詞 23
## 261 。 補助記号 20
posMtx<-aggregate(x=mtx[c("POS")], by=list(mtx$POS), length)
colnames(posMtx) <- c("POS", "頻度")
posMtx<-posMtx[order(posMtx$頻度, decreasing = TRUE),]
posMtx
## POS 頻度
## 6 名詞 210
## 4 助詞 169
## 5 動詞 91
## 13 補助記号 82
## 3 助動詞 50
## 9 接尾辞 21
## 8 形状詞 16
## 11 接頭辞 10
## 15 連体詞 9
## 10 接続詞 7
## 1 代名詞 6
## 2 副詞 6
## 12 空白 5
## 7 形容詞 4
## 14 記号 2
library(wordcloud)
## Loading required package: RColorBrewer
library(shiny)
source("getFreqDir.R")
tf <- getFreqDir("univ")
head(tf)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## to 11 17 26 11 17 11 15 35
## and 7 18 38 15 26 31 22 37
## the 7 25 36 16 31 32 52 39
## university 6 5 16 18 21 15 9 22
## hiroshima 5 0 0 0 0 0 0 0
## of 5 18 35 10 26 30 38 34
wordcloud(rownames(tf),tf$waseda)
comparison.cloud(tf[,1:2])
comparison.cloud(tf[,c(4,6)],max.words=40)
colnames(tf)
comparison.cloud(tf[,c(F,F,T,F,F,T,T,F)],max.words=40)
head(brewer.pal.info[])
## maxcolors category colorblind
## BrBG 11 div TRUE
## PiYG 11 div TRUE
## PRGn 11 div TRUE
## PuOr 11 div TRUE
## RdBu 11 div TRUE
## RdGy 11 div FALSE
rownames(brewer.pal.info[])
## [1] "BrBG" "PiYG" "PRGn" "PuOr" "RdBu" "RdGy"
## [7] "RdYlBu" "RdYlGn" "Spectral" "Accent" "Dark2" "Paired"
## [13] "Pastel1" "Pastel2" "Set1" "Set2" "Set3" "Blues"
## [19] "BuGn" "BuPu" "GnBu" "Greens" "Greys" "Oranges"
## [25] "OrRd" "PuBu" "PuBuGn" "PuRd" "Purples" "RdPu"
## [31] "Reds" "YlGn" "YlGnBu" "YlOrBr" "YlOrRd"
binfo<-brewer.pal.info[]
palets <-rownames(binfo[binfo$maxcolors>10,])
tmpPalet = palets[1]
(maxNum=binfo[rownames(binfo)==tmpPalet,][1])
## maxcolors
## BrBG 11
wordcloud(rownames(tf),tf$waseda,
colors=brewer.pal(as.integer(maxNum),tmpPalet))
tmpPalet = palets[5]
wordcloud(rownames(tf),tf$waseda,
colors=brewer.pal(as.integer(maxNum),tmpPalet),min.freq=5)
runApp("app_freqBar_Lec07")
library(DT)
##
## Attaching package: 'DT'
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
datatable(tf, rownames = TRUE)
alt text
\[Corr(x,y)= \frac{\sum (x_{i}-\overline{x}) (y_{i}-\overline{y})}{\sqrt{\sum (x_{i}-\overline{x})^2\sum (y_{i}-\overline{y})^2}} \]
tf <- getFreqDir("testData")
res <-cor(tf)
round(res,2)
## test1 test2 test3 test4
## test1 1.00 -0.29 -0.38 -0.35
## test2 -0.29 1.00 0.80 0.96
## test3 -0.38 0.80 1.00 0.94
## test4 -0.35 0.96 0.94 1.00
行と列を転置する
t(tf)
## c e b a d f g h
## test1 13 7 4 3 0 0 0 0
## test2 2 1 4 2 0 11 7 0
## test3 3 1 0 2 1 9 7 4
## test4 5 2 4 4 1 20 14 4
round(cor(t(tf)),2)
## c e b a d f g h
## c 1.00 0.99 0.37 0.40 -0.40 -0.66 -0.65 -0.40
## e 0.99 1.00 0.41 0.33 -0.50 -0.71 -0.71 -0.50
## b 0.37 0.41 1.00 0.52 -0.58 0.08 0.00 -0.58
## a 0.40 0.33 0.52 1.00 0.30 0.42 0.43 0.30
## d -0.40 -0.50 -0.58 0.30 1.00 0.63 0.71 1.00
## f -0.66 -0.71 0.08 0.42 0.63 1.00 1.00 0.63
## g -0.65 -0.71 0.00 0.43 0.71 1.00 1.00 0.71
## h -0.40 -0.50 -0.58 0.30 1.00 0.63 0.71 1.00
tf <- getFreqDir("univ")
res <-cor(tf)
install.packages("proxy")
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
行と列を転置する
corr <- simil(t(tf))
round(corr, 2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo
## kufs 0.63
## kyoto 0.71 0.80
## osaka1 0.67 0.62 0.75
## osaka2 0.67 0.71 0.82 0.84
## osaka3 0.65 0.76 0.87 0.80 0.89
## tokyo 0.60 0.74 0.81 0.71 0.80 0.84
## waseda 0.71 0.79 0.86 0.75 0.80 0.81 0.76
corr <- simil(t(tf), diag=T)
round(corr, 2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## hiroshima NA
## kufs 0.63 NA
## kyoto 0.71 0.80 NA
## osaka1 0.67 0.62 0.75 NA
## osaka2 0.67 0.71 0.82 0.84 NA
## osaka3 0.65 0.76 0.87 0.80 0.89 NA
## tokyo 0.60 0.74 0.81 0.71 0.80 0.84 NA
## waseda 0.71 0.79 0.86 0.75 0.80 0.81 0.76 NA
\[Cos(x,y)= \frac{\sum x_{i} y_{i}}{\sqrt{\sum x_{i}^2\sum y_{i}^2}} \]
simil(t(tf), method="cosine")
## hiroshima kufs kyoto osaka1 osaka2 osaka3
## kufs 0.6526589
## kyoto 0.7279603 0.8104367
## osaka1 0.6841531 0.6458906 0.7676934
## osaka2 0.6885497 0.7264594 0.8327895 0.8434805
## osaka3 0.6645501 0.7703932 0.8757348 0.8060690 0.8955915
## tokyo 0.6187031 0.7488038 0.8125733 0.7176287 0.8041446 0.8419948
## waseda 0.7278519 0.8020611 0.8653391 0.7658753 0.8107266 0.8200624
## tokyo
## kufs
## kyoto
## osaka1
## osaka2
## osaka3
## tokyo
## waseda 0.7712384
tf <- getFreqDir("univ")
res <-simil(t(tf), method="cosine")
round(res,2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo
## kufs 0.65
## kyoto 0.73 0.81
## osaka1 0.68 0.65 0.77
## osaka2 0.69 0.73 0.83 0.84
## osaka3 0.66 0.77 0.88 0.81 0.90
## tokyo 0.62 0.75 0.81 0.72 0.80 0.84
## waseda 0.73 0.80 0.87 0.77 0.81 0.82 0.77
res2<-simil(t(tf), method="cosine", diag=T)
res2<-as.matrix(res2)
res2[is.na(res2)] <- 1
round(res2,2)
## hiroshima kufs kyoto osaka1 osaka2 osaka3 tokyo waseda
## hiroshima 1.00 0.65 0.73 0.68 0.69 0.66 0.62 0.73
## kufs 0.65 1.00 0.81 0.65 0.73 0.77 0.75 0.80
## kyoto 0.73 0.81 1.00 0.77 0.83 0.88 0.81 0.87
## osaka1 0.68 0.65 0.77 1.00 0.84 0.81 0.72 0.77
## osaka2 0.69 0.73 0.83 0.84 1.00 0.90 0.80 0.81
## osaka3 0.66 0.77 0.88 0.81 0.90 1.00 0.84 0.82
## tokyo 0.62 0.75 0.81 0.72 0.80 0.84 1.00 0.77
## waseda 0.73 0.80 0.87 0.77 0.81 0.82 0.77 1.00
tf <- getFreqDir("univ")
hc <- hclust(dist(t(tf)))
plot(hc)
rect.hclust(hc, k=3, border="red")
tf <- getFreqDir("univ")
hc <- hclust(dist(t(tf), method = "canberra"), method = "ward.D2")
plot(hc)
rect.hclust(hc, k=3, border="red")
ui画面には以下のinput機能を搭載
1. 距離行列計算法の選択:"euclidean", "canberra", "manhattan"
2. クラスター間距離計算法の選択:"average", "complete", "ward.D2"
3. 樹形図内長方形の数:2-5