Load libraries.
library(readtext)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
Create a corpus of the students’ essays.
# Build the corpus from the Word documents in the working directory.
# `pattern` is a regular expression, so it is anchored on the ".docx"
# extension instead of matching any file name that merely contains "docx".
x <- list.files(pattern = "\\.docx$") %>%
  readtext(ignore_missing_files = TRUE) %>%
  corpus()
Common Words
# Top 50 words across all essays: stopwords removed, seen at least twice,
# longer than one character; most frequent first, ties broken alphabetically.
# The corpus is tokenised explicitly and stopwords are dropped with
# dfm_remove(), because calling dfm() on a corpus and using its `remove`
# argument are deprecated since quanteda 3.
x %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  dfm() %>%
  dfm_remove(pattern = stopwords("en")) %>%
  textstat_frequency() %>%
  filter(frequency >= 2, nchar(feature) > 1) %>%
  arrange(desc(frequency), feature) %>%
  # select() replaces summarise(), which is deprecated for multi-row results.
  select(word = feature, frequency) %>%
  head(50)
word frequency
1 works 82
2 work 37
3 famous 33
4 wrote 28
5 ishiguro 23
6 series 22
7 lewis 20
8 can 19
9 one 19
10 also 18
11 novel 18
12 made 16
13 story 16
14 books 15
15 england 15
16 many 15
17 kazuo 14
18 published 14
19 three 14
20 age 13
21 harry 13
22 later 13
23 stories 13
24 written 13
25 book 12
26 fantasy 12
27 go 12
28 japan 12
29 literature 12
30 narnia 12
31 potter 12
32 references 12
33 four 11
34 ishiguro's 11
35 novels 11
36 people 11
37 world 11
38 chronicles 10
39 first 10
40 prince 10
41 character 9
42 different 9
43 faber 9
44 group 9
45 henry 9
46 life 9
47 never 9
48 nonfiction 9
49 play 9
50 plays 9
Classification Word: Classified
# Keyword-in-context lines for "classified".
# kwic() is run on tokens because passing a corpus directly is deprecated
# since quanteda 3; select() replaces summarise(), which is deprecated for
# multi-row results.
x %>%
  tokens() %>%
  kwic(pattern = "classified") %>%
  as.data.frame() %>%
  select(pre, keyword, post)
pre keyword post
1 . His work can be classified into 3 sections . First
2 genres and it can be classified into three : fantasy ,
3 genres and it can be classified into three : fantasy ,
4 , his works can be classified into three groups by the
5 " but it is not classified as a ShakeSpearean tragedy .
6 Shakespeare's works , which are classified into comedies , tragedies ,
7 When Ishiguro's works are classified by chronological order , the
Classification Word: Divided
# Keyword-in-context lines for "divided".
# kwic() is run on tokens because passing a corpus directly is deprecated
# since quanteda 3; select() replaces summarise(), which is deprecated for
# multi-row results.
x %>%
  tokens() %>%
  kwic(pattern = "divided") %>%
  as.data.frame() %>%
  select(pre, keyword, post)
pre keyword post
1 work A Shakespeare's works are divided into four categories : comedy
2 , his work can be divided as before heyday works .
3 Kazuo Ishiguro's works can be divided into three categories by age
In-text citations in APA format
# Every "( Author , YYYY )" token sequence in the essays.
# phrase() splits the pattern on spaces so that each regex is matched against
# one consecutive token. Punctuation must be kept as tokens for the
# parentheses and comma to match, hence the default tokens() call.
# kwic() on a corpus is deprecated since quanteda 3, and select() replaces
# summarise(), which is deprecated for multi-row results.
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\( . , \\d{4} \\)"), valuetype = "regex") %>%
  as.data.frame() %>%
  select(intext_citation = keyword)
intext_citation
1 ( Lewis , 2005 )
2 ( Ishiguro , 2006 )
3 ( Ishiguro , 2010 )
4 ( Shakespeare , 1983 )
5 ( Lewis , 2001 )
6 ( Lewis , 2001 )
7 ( Lewis , 1942 )
8 ( Lewis , 1940 )
9 ( Lewis , 1960 )
10 ( Lewis , 1970 )
11 ( Lewis , 2005 )
12 ( Ishiguro , 2006 )
13 ( Ishiguro , 2010 )
14 ( Ishiguro , 1995 )
15 ( Ishiguro , 2006 )
16 ( Wilde , 1997 )
17 ( Lewis , 2001 )
18 ( Lewis , 1942 )
19 ( Lewis , 1940 )
20 ( Lewis , 1960 )
21 ( Lewis , 1970 )
22 ( Ishiguro , 1995 )
In-text citations placed after the sentence-final period (INCORRECT)
# Citations immediately preceded by a "." token, i.e. ". ( Author , YYYY )".
# phrase() splits the pattern on spaces so that each regex is matched against
# one consecutive token; punctuation is kept by the default tokens() call.
# kwic() on a corpus is deprecated since quanteda 3, and select() replaces
# summarise(), which is deprecated for multi-row results.
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\. \\( . , \\d{4} \\)"), valuetype = "regex") %>%
  as.data.frame() %>%
  select(intext_citation = keyword)
intext_citation
1 . ( Lewis , 2005 )
2 . ( Shakespeare , 1983 )
3 . ( Lewis , 2001 )
4 . ( Lewis , 2001 )
5 . ( Lewis , 2005 )
6 . ( Ishiguro , 2010 )
7 . ( Ishiguro , 1995 )
8 . ( Ishiguro , 2006 )
9 . ( Lewis , 2001 )
In-text citations placed before the sentence-final period (CORRECT)
# Citations immediately followed by a "." token, i.e. "( Author , YYYY ) .".
# phrase() splits the pattern on spaces so that each regex is matched against
# one consecutive token; punctuation is kept by the default tokens() call.
# kwic() on a corpus is deprecated since quanteda 3, and select() replaces
# summarise(), which is deprecated for multi-row results.
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\( . , \\d{4} \\) \\."), valuetype = "regex") %>%
  as.data.frame() %>%
  select(intext_citation = keyword)
intext_citation
1 ( Ishiguro , 2006 ) .
2 ( Ishiguro , 2010 ) .
3 ( Shakespeare , 1983 ) .
4 ( Lewis , 1970 ) .
5 ( Ishiguro , 2006 ) .
6 ( Ishiguro , 2010 ) .
7 ( Ishiguro , 1995 ) .
8 ( Ishiguro , 2006 ) .
9 ( Wilde , 1997 ) .
10 ( Lewis , 1970 ) .
11 ( Ishiguro , 1995 ) .
Rare words
# Words that occur exactly once in the corpus (stopwords removed, longer than
# one character), listed alphabetically.
x %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    # Spelled out in full; the original `remove_number` relied on partial
    # argument matching.
    remove_numbers = TRUE,
    remove_url = TRUE
  ) %>%
  dfm() %>%
  # dfm_remove() replaces the deprecated `remove` argument of dfm().
  dfm_remove(pattern = stopwords("en")) %>%
  textstat_frequency() %>%
  filter(frequency == 1, nchar(feature) > 1) %>%
  # select() replaces summarise(), which is deprecated for multi-row results.
  select(word = feature) %>%
  arrange(word) %>%
  # NOTE(review): the two slice() calls drop rows by hard-coded position (the
  # first three rows, then row 368 of what remains) - presumably hand-picked
  # noise entries. They will drop different words if the corpus changes;
  # confirm the intent and consider filtering by value instead.
  slice(4:n()) %>%
  slice(-368)
word
1 absolute
2 accidentally
3 accusations
4 achieve
5 acting
6 active
7 actually
8 additionally
9 admirable
10 admittedly
11 adopted
12 affectionate
13 aim
14 almost
15 already
16 annotators
17 appreciate
18 area
19 aristocracy
20 arrest
21 arrested
22 arrive
23 arthur
24 author's
25 authors
26 autobiographies
27 awarded
28 azkaban
29 bankruptcy
30 bbc
31 bearer
32 beast
33 beatings
34 behavior
35 besides
36 bible
37 big
38 billion
39 birth
40 bisexual
41 black
42 bold
43 booker
44 box-office
45 broadcasted
46 brothers
47 brushstrokes
48 care
49 career
50 caused
51 ceremony
52 chamber
53 characters
54 charge
55 chikumashobo
56 circumstances
57 civil
58 collected
59 collections
60 come
61 comparison
62 completely
63 complicated
64 concealment
65 conditions
66 considering
67 consist
68 consisting
69 consists
70 contains
71 contemporary
72 content
73 continuous
74 contrast
75 courage
76 crime
77 crimes
78 criticism
79 d'arc
80 dealing
81 deathly
82 decade
83 decided
84 decorations
85 defeated
86 degree
87 demonic
88 denouement
89 describes
90 destined
91 detective
92 devotion
93 die
94 disadvantage
95 distinctly
96 document
97 documents
98 dollars
99 dorian
100 drama
101 dramas
102 dramatic
103 earnest
104 easy
105 edition
106 edward
107 effective
108 emotion
109 emotionally
110 emotions
111 empathize
112 encounter
113 enough
114 entire
115 essential
116 etc
117 eternal
118 everything
119 evolved
120 except
121 exception
122 experience
123 explained
124 exposure
125 extremely
126 fables
127 fairytale
128 fall
129 false
130 familiar
131 families
132 fan
133 farcical
134 figured
135 find
136 fire
137 five
138 focus
139 following
140 french
141 full
142 full-length
143 furthermore
144 future
145 gave
146 gazette
147 generation
148 generations
149 gentlemen
150 goblet
151 graduation
152 gray
153 gray's
154 grindelwald
155 gross
156 gutenberg
157 gutenberg.org
158 hair
159 hakusuisya
160 half-blood
161 hallows
162 hamlet
163 hand
164 handed
165 happened
166 hard
167 harsh
168 heart
169 height
170 help
171 helped
172 hero's
173 heyday
174 high
175 highest
176 holy
177 home
178 hundred
179 husband
180 ideal
181 identical
182 identities
183 imagine
184 imitating
185 imply
186 imprisoned
187 include
188 increase
189 indecency
190 infidelity
191 influence
192 influenced
193 inn
194 interested
195 interviews
196 introduced
197 ireland
198 irish
199 ironically
200 j.k.rowling
201 jeanne
202 juliet
203 kafka
204 keeps
205 kind
206 labour's
207 language
208 lear
209 learn
210 led
211 left
212 lifetime
213 lines
214 literary
215 little
216 live
217 lord
218 love's
219 macbeth
220 madness
221 mainly
222 majority
223 marry
224 masters
225 may
226 men
227 method
228 middle
229 miserable
230 mishaps
231 mistaken
232 mistress
233 modern
234 morally
235 much
236 music
237 natural
238 naturally
239 near-seduction
240 neither
241 nice
242 nightfall
243 noble
244 nocturnes
245 occurrences
246 oct
247 origin
248 othello
249 overall
250 part3
251 parts
252 passed
253 perfection
254 performed
255 phoenix
256 picture
257 placed
258 places
259 plan
260 planning
261 platonic
262 playwright
263 plot
264 poems
265 poet
266 poetic
267 point
268 politics
269 pomegranates
270 possession
271 potential
272 potter's
273 praise
274 present
275 prisoner
276 prizes
277 problems
278 produced
279 profundis
280 project
281 proof
282 pursues
283 pursuit
284 quite
285 ravenna
286 reached
287 real
288 reason
289 reasons
290 received
291 recognized
292 refer
293 refernce
294 reflected
295 regarded
296 regret
297 relationship
298 release
299 released
300 represents
301 reshowing
302 respectively
303 revenue
304 review
305 rewarded
306 risking
307 romantic
308 romeo
309 roses
310 savile's
311 secrets
312 sections
313 seems
314 seen
315 selfless
316 sensation
317 sense
318 sensibility
319 sentenced
320 sequel
321 serve
322 sets
323 setting
324 several
325 sexual
326 shake
327 shortlisted
328 shows
329 shrew
330 since
331 skin
332 sonetto-syuu
333 song
334 speare
335 special
336 spectors
337 stage
338 stone
339 strike
340 strike's
341 strong
342 student.one
343 subdivided
344 succeed
345 success
346 summary
347 sung
348 surreal
349 syracusans
350 taken
351 taming
352 theft
353 themes
354 thing
355 tough
356 trial
357 trifle
358 true
359 turbulent
360 turmoil
361 turned
362 turns
363 twin
364 u.k.and
365 understand
366 unforgivable
367 unknown
368 used
369 uses
370 valued
371 verona
372 village
373 vividly
374 wars
375 ways
376 wealthy
377 whose
378 william
379 willpower
380 windermere's
381 without
382 works.a
383 wrongful
384 year
Word Count
# Density plot of essay length, measured in tokens per document.
# The count column gets an explicit name so the plot no longer depends on an
# auto-generated "." column and the magrittr dot inside aes().
# ntoken() is applied to tokens; calling it directly on a corpus is
# deprecated in recent quanteda releases.
df <- data.frame(n_tokens = ntoken(tokens(x)))
ggplot(df, aes(x = n_tokens)) +
  geom_density() +
  labs(x = "Tokens per essay") +
  # The density scale is not meaningful to readers here, so hide y-axis marks.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
