Phân bố số lượng chữ cái trong từ tiếng Anh

# Use the minimal ggplot2 theme for every plot drawn from here on.
theme_set(theme_minimal())

# Bar chart of the number of rows in `eng` for each value of `n_w`
# (presumably the number of letters per word); axis titles are suppressed.
eng %>%
  count(n_w) %>%
  ggplot(aes(x = n_w, y = n)) +
  geom_col() +
  labs(x = NULL, y = NULL)

# Count how often each vowel occurs across all entries of `eng$V1`.
# The patterns are lowercase literals, so only lowercase vowels are matched.
vowels <- c("a", "e", "i", "o", "u")

# One total per vowel. vapply() guarantees an integer vector of the same
# length as `vowels` and keeps the per-vowel scratch values out of the
# global environment.
num_vowels <- vapply(
  vowels,
  function(vowel) sum(str_count(eng$V1, vowel)),
  integer(1),
  USE.NAMES = FALSE
)

# `N` is the total count, `nguyen_am` the vowel it belongs to.
df1 <- data.frame(N = num_vowels, nguyen_am = vowels)

Tần suất xuất hiện các nguyên âm:

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Tần suất xuất hiện của các chữ cái trong tiếng Anh:

# Split every entry of `eng$V1` into single characters and pool them.
u <- strsplit(eng$V1, "")
k <- unlist(u)

# One row per character occurrence; reuses `k` instead of splitting the
# whole vector a second time.
let <- data.frame(l = k)

# Percentage share of each character (rounded to 2 decimals), drawn as
# horizontal bars ordered by share.
let %>%
  count(l) %>%
  mutate(per = round(100 * n / nrow(let), 2)) %>%
  ggplot(aes(reorder(l, per), per)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_ipsum(grid = "X") +
  scale_y_continuous(breaks = seq(1, 12, by = 1))

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

# Load các gói:

# Packages needed for the text-mining and word-cloud steps.
pakg <- c("tm", "SnowballC", "wordcloud", "RColorBrewer")

# Attach each package and stop immediately if one is missing. A bare
# require() only returns FALSE, which would let the script carry on until
# some later call fails with a less helpful message.
lapply(pakg, function(pkg) {
  if (!require(pkg, character.only = TRUE)) {
    stop("Package '", pkg, "' is not installed.", call. = FALSE)
  }
  TRUE
})

## Loading required package: tm

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Loading required package: SnowballC

## Loading required package: wordcloud

## Loading required package: RColorBrewer

## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
## 
## [[4]]
## [1] TRUE

library(pdftools)

http://data.library.virginia.edu/reading-pdf-files-into-r-for-text-mining/

# Extract the text of the PDF. NOTE(review): pdf_text() presumably yields one
# string per page (the matrix built later reports 946 documents) - confirm.
mk <- pdf_text("D:/pdftotext/Basic_Econometrics_5th_Edition_by_Damoda.pdf")

# Wrap the extracted strings in a tm corpus, one document per string.
corp <- mk %>%
  VectorSource() %>%
  Corpus()

Hoặc bỏ các từ như cengage, learning vì Cengage Learning là tên của nhà in. Sẽ không hợp lí nếu tính chúng vào phân tích:

# Drop the publisher's name from the text. removeWords() matches
# case-sensitively, so the text is lower-cased first; otherwise
# "Cengage Learning" as printed would survive and still be counted.
corp <- corp %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, c("cengage", "learning"))

# Build the term-document matrix; the control list sets the text clean-up.
# NOTE(review): tm may apply these options in the order listed - confirm
# before reordering them.
tdm <- TermDocumentMatrix(corp, control = list(
  # Strip punctuation such as full stops and commas:
  removePunctuation = TRUE,
  # Drop stop words (English ones, per the original note):
  stopwords = TRUE,
  # Convert everything to lower case:
  tolower = TRUE,
  # Reduce words to their stems (the output lists e.g. "regress", "estim"):
  stemming = TRUE,
  # Remove digits:
  removeNumbers = TRUE,
  # NOTE(review): presumably keeps only terms found in at least 3 documents
  # (tm `bounds$global`) - confirm against the tm documentation.
  bounds = list(global = c(3, Inf))))

# Print the matrix summary.
tdm

## <<TermDocumentMatrix (terms: 3183, documents: 946)>>
## Non-/sparse entries: 98395/2912723
## Sparsity           : 97%
## Maximal term length: 18
## Weighting          : term frequency (tf)

Kiểm tra tdm:

# Show a sample of the first 10 terms (rows) of the matrix.
inspect(tdm[1:10,])

## <<TermDocumentMatrix (terms: 10, documents: 946)>>
## Non-/sparse entries: 331/9129
## Sparsity           : 97%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## Sample             :
##                  Docs
## Terms             166 415 497 567 589 602 777 811 829 834
##   \001the           0   0   0   0   0   0   0   0   0   0
##   \003ln            0   0   0   0   0   0   0   0   0   0
##   \003yt            0   0   0   0   0   0   4   0   0   0
##   â               5  10   0   6   2   0   0   0   0   0
##   â               2   0   0   0   7   0   0   0   8   7
##   â               0   0   4   0   0   7   2   6   0   0
##   â               0   0   1   0   0   0   0   0   0   0
##   âa              0   0   0   0   0   0   0   0   0   0
##   âan             0   0   0   0   0   0   0   0   0   0
##   âbestâ        0   0   1   0   0   0   0   0   0   0

# Total count of every term across all documents, most frequent first.
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)

# Keep `word` as character on every R version: before R 4.0 data.frame()
# turned strings into factors by default, which breaks string comparisons
# and forces a conversion later on.
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

# Check the structure of the result.
str(d)

## 'data.frame':    3183 obs. of  2 variables:
##  $ word: Factor w/ 3183 levels "\001the","\003ln",..: 1837 2369 987 3039 3034 2843 384 684 3024 515 ...
##  $ freq: num  4037 2794 2621 2557 2131 ...

# Make sure `word` is a plain character vector rather than a factor.
d[["word"]] <- as.character(d[["word"]])

# Table of the 500 most frequent terms in Gujarati's book with their counts.
d %>%
  slice(1:500) %>%
  knitr::kable()

word	freq
model	4037
regress	2794
estim	2621
variabl	2557
valu	2131
test	1752
can	1594
data	1528
use	1212
coeffici	1150
error	1147
one	1147
statist	1141
follow	1140
time	1102
two	1092
chapter	1081
equat	1063
will	1060
tabl	1045
obtain	1019
given	969
exampl	960
sampl	906
see	890
mean	849
may	832
term	832
linear	825
distribut	811
note	775
function	769
econometr	741
result	739
squar	735
rate	700
varianc	682
hypothesi	674
seri	664
observ	662
incom	653
assumpt	646
figur	626
method	626
differ	622
discuss	596
signific	583
correl	548
normal	546
chang	545
probabl	536
first	533
lag	527
standard	521
also	518
var	496
econom	491
consid	483
depend	476
analysi	469
number	467
part	467
show	467
zero	461
therefor	455
section	451
case	450
now	446
residu	446
problem	444
sinc	439
price	433
assum	429
unit	426
regressor	424
matrix	422
expenditur	418
tâˆ	416
paramet	402
true	402
shown	401
expect	385
period	385
thus	383
consumpt	379
percent	377
measur	375
find	374
increas	374
log	369
autocorrel	367
level	362
random	355
give	351
heteroscedast	351
new	350
known	346
appendix	337
relat	336
general	334
least	333
form	329
specif	321
interv	318
state	318
year	314
comput	313
effect	311
averag	309
product	301
null	297
order	295
exercis	293
includ	293
say	290
intercept	288
simultan	285
let	283
dummi	276
per	276
reject	276
run	276
condit	275
demand	275
three	275
take	274
â€	273
explanatori	273
although	271
classic	270
appli	265
relationship	264
present	263
call	261
point	261
larg	258
popul	257
interest	255
individu	254
equal	252
slope	252
stochast	252
constant	251
practic	251
properti	251
base	248
sum	248
york	247
high	242
suppos	238
preced	236
singl	234
suggest	232
confid	230
interpret	228
theori	227
total	227
multicollinear	225
money	224
determin	223
suppli	222
sever	219
size	219
illustr	218
disturb	217
continu	214
dollar	214
approach	212
like	211
posit	207
compar	206
topic	206
multipl	205
bias	204
critic	204
natur	204
procedur	204
reason	204
wage	203
import	202
real	201
refer	201
origin	200
express	197
gdp	197
partial	196
well	196
transform	195
fit	194
studi	194
usual	194
stationari	193
small	192
adjust	191
reader	191
process	188
short	188
respect	187
sourc	186
fix	185
index	185
know	185
cross	184
ratio	184
output	182
howev	181
previous	181
predict	177
unbias	177
consist	176
question	175
univers	175
various	175
whether	174
vol	173
actual	171
make	171
forecast	169
labor	168
ytâˆ	166
fact	165
happen	164
line	164
restrict	164
negat	162
provid	162
defin	161
independ	161
panel	161
type	161
quarter	158
structur	158
autoregress	155
involv	155
possibl	153
tion	153
cours	152
logit	152
educ	151
good	151
trend	150
applic	149
return	149
rss	149
growth	147
correct	146
inform	146
step	146
cost	145
stock	145
watson	145
want	144
collinear	143
introduc	143
person	143
second	143
empir	142
explain	141
press	140
repres	140
covari	139
likelihood	139
regressand	139
degre	138
elast	138
even	138
four	138
plot	138
reduc	138
appropri	137
altern	136
identifi	136
often	136
henc	135
research	134
name	133
way	133
nonlinear	132
set	132
less	131
must	131
save	131
anoth	130
develop	130
ÏƒÌ	130
famili	129
need	128
consum	127
deviat	127
greater	127
similar	127
much	126
simpl	126
addit	125
answer	125
cov	125
situat	124
capit	123
conclus	123
denot	123
basi	122
choos	122
curv	122
john	122
recal	121
indic	120
Ïƒi	118
clear	117
criterion	117
logarithm	117
chi	116
just	116
limit	116
sale	116
wherea	115
direct	114
exact	114
causal	113
endogen	113
root	113
unemploy	113
purpos	112
techniqu	112
book	111
formula	111
relax	111
scale	111
age	110
concept	110
hold	110
long	110
column	108
deriv	108
durbinâ€	108
hypothes	108
capita	107
infer	107
qualit	106
rule	106
billion	105
found	105
prob	105
among	104
numer	104
probit	104
white	104
write	104
respons	103
serial	103
exogen	102
get	102
account	101
becom	101
current	101
introduct	101
lie	101
seem	101
vector	101
word	101
categori	100
hour	100
invest	100
multipli	100
experi	99
mathemat	99
wealth	99
approxim	98
examin	98
vari	98
âˆš	97
decid	97
notic	97
simpli	97
weight	97
asymptot	96
basic	96
correspond	96
hand	96
joint	96
mani	96
work	96
collect	95
look	95
notat	95
made	94
system	94
â€™s	93
cit	93
easili	93
subject	93
son	92
market	91
matter	91
non	91
higher	90
employ	89
gls	89
review	89
variat	89
abl	88
definit	88
detail	88
ident	88
rank	88
seen	88
element	87
hous	87
iâˆ	87
school	87
divid	86
shall	86
contain	85
idea	85
inflat	85
journal	85
pool	85
due	84
packag	84
group	83
identif	83
ing	83
latter	83
margin	83
otherwis	83
particular	83
sens	83
alreadi	82
factor	82
keep	82
low	82
nonstationari	82
granger	81
pdf	81
pgnp	81
priori	81
quit	81
rather	81
region	81
remain	81
side	81
countri	80
gnp	80
influenc	80
power	80
wiley	80
femal	79
theoret	79
under	79
without	79
âˆž	78
mind	78
percentag	78
quantiti	78
sign	78
presenc	77
verifi	77
â€™â€™	76
differenti	76
prf	76
theorem	76
earlier	75
month	75
pattern	75
salari	75
summari	75
uncorrel	75
associ	74
better	74
blue	74
extens	74
iter	74
maximum	74
proof	74
report	74
stage	74
worker	74
xâ€™s	74
airlin	73
combin	73
con	73
ess	73
necessari	73
accept	72
adapt	72
establish	72
hill	72
larger	72
written	72
common	71
conclud	71
consequ	71
densiti	71
drop	71
economi	71
homoscedast	71
might	71
minimum	71
put	71
absolut	70
cointegr	70

# Share of all term occurrences covered by the 500 most frequent terms
# (about 76.7% in the rendered run). head() avoids an NA result when `d`
# has fewer than 500 rows.
sum(head(d$freq, 500)) / sum(d$freq)

## [1] 0.7672796

# Cumulative share of the 1000 most frequent terms (about 89% in the
# rendered run). Compared with the top-500 share, the next 500 terms add
# only about 12.3%. head() avoids an NA result when `d` has fewer than
# 1000 rows.
sum(head(d$freq, 1000)) / sum(d$freq)

## [1] 0.8903234

—————————————————–

Dưới đây chúng ta lọc ra hai bộ dữ liệu riêng biệt.

Bộ thứ nhất là những từ thuộc bộ từ điển tiếng Anh.

Bộ còn lại không thuộc.

—————————————————–

# Set 1: terms that also appear in the English word list `eng$V1`
# (mostly everyday words).
gu_eng <- filter(d, word %in% eng$V1)

# Set 2: every remaining term. The original note calls these mostly
# statistics/econometrics jargon; since the terms are stemmed, many are
# simply stems such as "estim" or "variabl" that are not dictionary words.
gu_non_eng <- dplyr::setdiff(d, gu_eng)

# Number of rows (terms) in the full frequency table.
nrow(d)

## [1] 3183

# Number of terms that were found in the English word list.
nrow(gu_eng)

## [1] 1472

# Number of terms that were not found in the English word list.
nrow(gu_non_eng)

## [1] 1711

# Fraction of all terms that were not found in the English word list.
nrow(gu_non_eng) / nrow(d)

## [1] 0.5375432

# Table of the non-dictionary terms that occur at least 100 times.
gu_non_eng %>%
  filter(freq >= 100) %>%
  knitr::kable()

word	freq
estim	2621
variabl	2557
valu	2131
coeffici	1150
equat	1063
tabl	1045
exampl	960
sampl	906
distribut	811
econometr	741
squar	735
varianc	682
seri	664
observ	662
incom	653
assumpt	646
figur	626
signific	583
probabl	536
var	496
econom	491
consid	483
analysi	469
residu	446
sinc	439
assum	429
expenditur	418
tâˆ	416
paramet	402
consumpt	379
measur	375
increas	374
autocorrel	367
heteroscedast	351
relat	336
specif	321
interv	318
comput	313
averag	309
exercis	293
includ	293
simultan	285
dummi	276
condit	275
â€	273
explanatori	273
appli	265
larg	258
popul	257
individu	254
stochast	252
practic	251
properti	251
suppos	238
preced	236
singl	234
confid	230
theori	227
multicollinear	225
determin	223
suppli	222
illustr	218
continu	214
compar	206
multipl	205
natur	204
procedur	204
gdp	197
studi	194
stationari	193
sourc	186
howev	181
unbias	177
univers	175
ytâˆ	166
negat	162
provid	162
defin	161
independ	161
structur	158
autoregress	155
involv	155
possibl	153
tion	153
cours	152
logit	152
educ	151
applic	149
rss	149
introduc	143
empir	142
repres	140
covari	139
regressand	139
degre	138
elast	138
reduc	138
appropri	137
altern	136
identifi	136
henc	135
anoth	130
ÏƒÌ	130
famili	129
consum	127
deviat	127
simpl	126
addit	125
cov	125
situat	124
capit	123
conclus	123
denot	123
basi	122
choos	122
curv	122
recal	121
indic	120
Ïƒi	118
wherea	115
endogen	113
unemploy	113
purpos	112
techniqu	112
deriv	108
durbinâ€	108
hypothes	108
qualit	106
prob	105
numer	104
probit	104
respons	103
exogen	102
becom	101
introduct	101
categori	100
multipli	100

# Table of the first 200 rows of the non-dictionary terms.
gu_non_eng %>%
  head(200) %>%
  knitr::kable()

word	freq
estim	2621
variabl	2557
valu	2131
coeffici	1150
equat	1063
tabl	1045
exampl	960
sampl	906
distribut	811
econometr	741
squar	735
varianc	682
seri	664
observ	662
incom	653
assumpt	646
figur	626
signific	583
probabl	536
var	496
econom	491
consid	483
analysi	469
residu	446
sinc	439
assum	429
expenditur	418
tâˆ	416
paramet	402
consumpt	379
measur	375
increas	374
autocorrel	367
heteroscedast	351
relat	336
specif	321
interv	318
comput	313
averag	309
exercis	293
includ	293
simultan	285
dummi	276
condit	275
â€	273
explanatori	273
appli	265
larg	258
popul	257
individu	254
stochast	252
practic	251
properti	251
suppos	238
preced	236
singl	234
confid	230
theori	227
multicollinear	225
determin	223
suppli	222
illustr	218
continu	214
compar	206
multipl	205
natur	204
procedur	204
gdp	197
studi	194
stationari	193
sourc	186
howev	181
unbias	177
univers	175
ytâˆ	166
negat	162
provid	162
defin	161
independ	161
structur	158
autoregress	155
involv	155
possibl	153
tion	153
cours	152
logit	152
educ	151
applic	149
rss	149
introduc	143
empir	142
repres	140
covari	139
regressand	139
degre	138
elast	138
reduc	138
appropri	137
altern	136
identifi	136
henc	135
anoth	130
ÏƒÌ	130
famili	129
consum	127
deviat	127
simpl	126
addit	125
cov	125
situat	124
capit	123
conclus	123
denot	123
basi	122
choos	122
curv	122
recal	121
indic	120
Ïƒi	118
wherea	115
endogen	113
unemploy	113
purpos	112
techniqu	112
deriv	108
durbinâ€	108
hypothes	108
qualit	106
prob	105
numer	104
probit	104
respons	103
exogen	102
becom	101
introduct	101
categori	100
multipli	100
experi	99
mathemat	99
approxim	98
examin	98
vari	98
âˆš	97
decid	97
notic	97
simpli	97
asymptot	96
mani	96
notat	95
â€™s	93
easili	93
gls	89
variat	89
abl	88
definit	88
ident	88
hous	87
iâˆ	87
divid	86
inflat	85
packag	84
identif	83
ing	83
otherwis	83
sens	83
alreadi	82
nonstationari	82
pdf	81
pgnp	81
countri	80
gnp	80
influenc	80
wiley	80
femal	79
theoret	79
âˆž	78
percentag	78
quantiti	78
presenc	77
verifi	77
â€™â€™	76
differenti	76
prf	76
salari	75
summari	75
uncorrel	75
associ	74
extens	74
iter	74
xâ€™s	74
airlin	73
combin	73
necessari	73
conclud	71
consequ	71
densiti	71
economi	71
homoscedast	71
absolut	70
cointegr	70
#-------------------------------------
# Vẽ đám mây từ
#-------------------------------------

# Draw one word cloud from a data frame with `word` and `freq` columns.
#
# terms:    data frame holding the terms (`word`) and their counts (`freq`).
# min_freq: terms with a lower count are not drawn.
# seed:     RNG seed, so the layout is reproducible between runs.
draw_wordcloud <- function(terms, min_freq, seed) {
  set.seed(seed)
  wordcloud(
    words = terms$word,
    freq = terms$freq,
    min.freq = min_freq,
    # Draw at most 200 words:
    max.words = 200,
    # FALSE: place words in decreasing order of frequency, not at random:
    random.order = FALSE,
    # Share of words drawn rotated by 90 degrees:
    rot.per = 0.35,
    # Bold typeface (base graphics `font = 2`); this is not a size setting:
    font = 2,
    # Colour palette for the words:
    colors = brewer.pal(8, "Dark2")
  )
}

# Black background for both clouds; the previous graphics settings are
# restored afterwards so later plots are not affected.
old_par <- par(bg = "black")

# Set 1 (terms found in the English word list), counts of at least 100:
draw_wordcloud(gu_eng, min_freq = 100, seed = 15888)

# Set 2 (terms not found in the word list), counts of at least 10:
draw_wordcloud(gu_non_eng, min_freq = 10, seed = 1409)

par(old_par)

pdf to text

Nguyễn Ngọc Bình

September 20, 2017

Phân bố số lượng chữ cái trong từ tiếng Anh

Tần suất xuất hiện các nguyên âm:

Tần suất xuất hiện của các chữ cái trong tiếng Anh:

http://data.library.virginia.edu/reading-pdf-files-into-r-for-text-mining/

Hoặc bỏ các từ như cengage, learning vì Cengage Learning là tên của nhà in. Sẽ không hợp lí nếu tính chúng vào phân tích:

kiểm tra tdm

—————————————————–

Dưới đây chúng ta lọc ra hai bộ dữ liệu riêng biệt.

Bộ thứ nhất là những từ thuộc bộ từ điển tiếng Anh.

Bộ còn lại không thuộc.

—————————————————–