Load libraries.
library(readtext)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
Build a corpus of the students’ book reports.
# Read every file found in the working directory with readtext() and wrap
# the resulting texts in a quanteda corpus.
# NOTE(review): list.files() returns *all* files in the directory, so this
# presumably expects the folder to contain only the reports -- confirm.
x <- corpus(readtext(list.files()))
Check the format of the in-text citations with the students.
# List every in-text citation written as "( Author , 2008 )".
# kwic() is given a tokens object because its corpus method is deprecated,
# and the matches are turned into a plain data frame before the keyword
# column is picked, which avoids a multi-row summarise().
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\( . , \\d{4} \\)"), valuetype = "regex") %>%
  as.data.frame() %>%
  select(intext_citation = keyword)
intext_citation
1 ( Shakespear , 1993 )
2 ( Spyri , 2008 )
3 ( Bladon , 2011 )
4 ( Burnett , 2008 )
5 ( Milne , 2002 )
6 ( Bladon , 2011 )
7 ( Shakespeare , 2007 )
8 ( Baum , 2007 )
9 ( Baum , 2007 )
10 ( Dickens , 2009 )
11 ( Bladon , 2012 )
12 ( Leroux , 2005 )
13 ( Spyri , 2008 )
14 ( Colbourn , 2008 )
15 ( Doyle , 2002 )
16 ( Stoker , 2002 )
17 ( Baum , 2007 )
18 ( Bladon , 2011 )
In-text citations outside the sentence (incorrect).
# Find citations that come after the sentence-final period,
# i.e. ". ( Author , 2008 )".
# kwic() is given a tokens object because its corpus method is deprecated,
# and the matches are turned into a plain data frame before the keyword
# column is picked, which avoids a multi-row summarise().
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\. \\( . , \\d{4} \\)"), valuetype = "regex") %>%
  as.data.frame() %>%
  select(intext_citation = keyword)
intext_citation
1 . ( Burnett , 2008 )
2 . ( Baum , 2007 )
3 . ( Bladon , 2012 )
In-text citations within the sentence (correct).
# Find citations that sit inside the sentence, directly before the period,
# i.e. "( Author , 2008 ) .".
# kwic() is given a tokens object because its corpus method is deprecated,
# and the matches are turned into a plain data frame before the keyword
# column is picked, which avoids a multi-row summarise().
x %>%
  tokens() %>%
  kwic(pattern = phrase("\\( . , \\d{4} \\) \\."), valuetype = "regex") %>%
  as.data.frame() %>%
  select("in-text citation" = keyword)
in-text citation
1 ( Shakespear , 1993 ) .
2 ( Spyri , 2008 ) .
3 ( Bladon , 2011 ) .
4 ( Burnett , 2008 ) .
5 ( Milne , 2002 ) .
6 ( Bladon , 2011 ) .
7 ( Shakespeare , 2007 ) .
8 ( Baum , 2007 ) .
9 ( Baum , 2007 ) .
10 ( Dickens , 2009 ) .
11 ( Bladon , 2012 ) .
12 ( Leroux , 2005 ) .
13 ( Spyri , 2008 ) .
14 ( Colbourn , 2008 ) .
15 ( Doyle , 2002 ) .
16 ( Stoker , 2002 ) .
17 ( Baum , 2007 ) .
18 ( Bladon , 2011 ) .
Check the APA formatting of the references with the class.
# Show the text that follows the phrase "in APA format" in each report
# (up to 40 tokens), with whitespace normalised, as a factor so that
# duplicate references collapse into shared levels.
# kwic() is given a tokens object because its corpus method is deprecated;
# the post-context column is pulled from the plain data frame instead of
# going through a multi-row summarise() and unlist().
x %>%
  tokens() %>%
  kwic(pattern = phrase("in APA format"), window = 40) %>%
  as.data.frame() %>%
  pull(post) %>%
  str_squish() %>%
  as.factor()
[1] ( Shakespear , 1993 ) .
[2] Spyri , J . ( 2008 ) . Heidi . Tokyo , Japan : Macmillan Language House .
[3] Bladon , R . ( 2011 ) . Gandhi . Tokyo , Japan : Macmillan Readers .
[4] Burnett , F . H . ( 2008 ) . The secret garden . Tokyo , Japan : Macmillan Language House .
[5] Milne , J . ( 2008 ) . The black cat . Tokyo , Japan : Macmillan Language House .
[6] Bladon , R . ( 2011 ) . Gandhi . Tokyo , Japan : Macmillan Language House .
[7] Shakespeare , W . ( 2007 ) . A midsummer night's dream . Tokyo , Japan : Macmillan Language House . https://elib.maruzen.co.jp/elib/html/BookDetail/Id/3000006397?6
[8] Baum , F . ( 2007 ) . The wizard of Oz . Tokyo , Japon : Macmillan Language House .
[9] Baum , L . F . ( 2007 ) . The wizard of Oz . Tokyo , Japan : Macmillan Language House .
[10]
[11] Dickens , C . ( 2009 ) . A Christmas Carol . Oxford , England : Oxford Bookworms .
[12] Bladon , R . ( 2012 ) . The story of the Olympics -an unofficial history- . Tokyo , Japan . Macmillan Language House . Retrieved June 15 , 2021 , from https://elib.maruzen.co.jp/elib/html/BookDetail/Id/3000006413?3
[13] Leroux , G . ( 2005 ) . The phantom of the opera . Tokyo , Japan : Macmillan Language House .
[14] Spyri , J . ( 2008 ) . Heidi . Tokyo , Japan : Macmillan Language House .
[15] Colbourn , S . ( 2008 ) . King Arthur and the knights of the round table . Tokyo , Japan : Macmillan Language House .
[16] Doyle , A . C . ( 2002 ) . The speckled band and other stories . Tokyo , Japan : Macmillan Language House .
[17] Stoker , B . ( 2002 ) . Dracula . Tokyo , Japan : Macmillan Language House .
[18] Baum , F . ( 2007 ) . The wizard of OZ . Tokyo , Japan : Macmillan Language House .
[19] Bladon , R . ( 2011 ) . Gandhi . Japan : Macmillan Language House .
18 Levels: ... Stoker , B . ( 2002 ) . Dracula . Tokyo , Japan : Macmillan Language House .
Extract summaries and opinions.
# Summary: the text between the end of the first "citation" and the start
# of the first "Opinion" in each report.
summary_start <- str_locate(x, "citation")[, "end"]
summary_end <- str_locate(x, "Opinion")[, "start"]
summary <- str_squish(
  str_sub(x, start = summary_start + 1, end = summary_end - 1)
)

# Opinion: the text between the end of the first "Opinion" and the start
# of the first "Reference" in each report.
opinion_start <- str_locate(x, "Opinion")[, "end"]
opinion_end <- str_locate(x, "Reference")[, "start"]
opinion <- str_squish(
  str_sub(x, start = opinion_start + 1, end = opinion_end - 1)
)
Common words in summaries
# Words used five or more times across all summaries, most frequent first
# (ties in alphabetical order).
# The texts are tokenised and stop words removed *before* dfm(): passing
# tokens options and `remove =` to dfm() on raw text is deprecated.
# The frequency table is converted to a tibble for the dplyr steps and back
# to a plain data frame at the end so that every row is printed.
summary %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm() %>%
  textstat_frequency() %>%
  as_tibble() %>%
  filter(frequency >= 5) %>%
  arrange(desc(frequency), feature) %>%
  select(word = feature, frequency) %>%
  as.data.frame()
word frequency
1 dorothy 22
2 one 21
3 went 20
4 witch 20
5 holmes 17
6 back 15
7 day 15
8 gandhi 15
9 go 15
10 house 15
11 heidi 14
12 man 13
13 story 13
14 however 12
15 started 12
16 also 11
17 died 11
18 home 11
19 india 11
20 mr 11
21 oz 11
22 romeo 11
23 wanted 11
24 london 10
25 met 10
26 wizard 10
27 city 9
28 demetrius 9
29 good 9
30 kansas 9
31 room 9
32 saw 9
33 black 8
34 british 8
35 christine 8
36 criminal 8
37 got 8
38 heard 8
39 juliet 8
40 made 8
41 said 8
42 scarecrow 8
43 came 7
44 castle 7
45 clara 7
46 cubitt 7
47 decided 7
48 emerald 7
49 father 7
50 garden 7
51 ghost 7
52 killed 7
53 lived 7
54 people 7
55 uncle 7
56 woke 7
57 born 6
58 cat 6
59 christmas 6
60 eric 6
61 going 6
62 later 6
63 love 6
64 new 6
65 scrooge 6
66 south 6
67 stop 6
68 three 6
69 time 6
70 told 6
71 tried 6
72 walk 6
73 war 6
74 work 6
75 angry 5
76 aunt 5
77 away 5
78 beautiful 5
79 became 5
80 come 5
81 country 5
82 east 5
83 felt 5
84 first 5
85 friend 5
86 get 5
87 helena 5
88 knight 5
89 law 5
90 left 5
91 life 5
92 lion 5
93 living 5
94 loved 5
95 lysander 5
96 meet 5
97 mountain 5
98 name 5
99 olympics 5
100 opera 5
101 person 5
102 phantom 5
103 secret 5
104 shoes 5
105 strange 5
106 tin 5
107 two 5
108 way 5
109 world 5
110 years 5
Common words in opinions
# Words used five or more times across all opinions, most frequent first
# (ties in alphabetical order).
# The texts are tokenised and stop words removed *before* dfm(): passing
# tokens options and `remove =` to dfm() on raw text is deprecated.
# The frequency table is converted to a tibble for the dplyr steps and back
# to a plain data frame at the end so that every row is printed.
opinion %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(stopwords("en")) %>%
  dfm() %>%
  textstat_frequency() %>%
  as_tibble() %>%
  filter(frequency >= 5) %>%
  arrange(desc(frequency), feature) %>%
  select(word = feature, frequency) %>%
  as.data.frame()
word frequency
1 story 30
2 like 18
3 book 16
4 think 16
5 thought 15
6 read 13
7 people 12
8 reading 10
9 important 9
10 japanese 8
11 want 8
12 also 7
13 however 7
14 japan 7
15 life 7
16 man 7
17 many 7
18 good 6
19 heidi 6
20 love 6
21 make 6
22 surprised 6
23 can 5
24 dracula 5
25 felt 5
26 history 5
27 just 5
28 learned 5
29 lost 5
30 mary 5
31 one 5
32 wanted 5
33 wizard 5
Rare words (used two to four times across all reports)
# Alphabetical list of words that occur two to four times in the whole
# corpus, ignoring single-character features.
# The corpus is tokenised and stop words removed *before* dfm(): passing
# tokens options and `remove =` to dfm() on a corpus is deprecated.
# The frequency table is converted to a tibble for the dplyr steps and back
# to a plain data frame at the end so that every row is printed.
x %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_url = TRUE,
    remove_numbers = TRUE
  ) %>%
  tokens_remove(stopwords("en")) %>%
  dfm() %>%
  textstat_frequency() %>%
  as_tibble() %>%
  filter(frequency >= 2, frequency <= 4) %>%
  filter(nchar(feature) > 1) %>%
  select(word = feature) %>%
  arrange(word) %>%
  as.data.frame()
word
1 able
2 advertisement
3 age
4 allowed
5 always
6 amazing
7 anger
8 animals
9 another
10 antiques
11 anyone
12 anything
13 apartment
14 aphrodisiac
15 appear
16 appearance
17 appeared
18 around
19 arrested
20 arrived
21 assistant
22 athens
23 attacked
24 bad
25 badly
26 bank
27 banned
28 become
29 bed
30 began
31 believe
32 best
33 blown
34 brain
35 brave
36 breathless
37 broke
38 broom
39 buried
40 burnett
41 busy
42 called
43 campaign
44 campaigned
45 canceled
46 capulet
47 capulet's
48 care
49 carefully
50 case
51 caste
52 caught
53 century
54 chain
55 chance
56 change
57 changed
58 changing
59 clues
60 colbourn
61 conan
62 confidence
63 consulted
64 continued
65 countries
66 course
67 cratchits
68 craven
69 cry
70 crying
71 curse
72 cyclone
73 danger
74 daughter
75 days
76 dead
77 deeds
78 delighted
79 depiction
80 desire
81 detective
82 determined
83 detie
84 dickens
85 dickon
86 difference
87 difficult
88 disappeared
89 disobedience
90 doctor
91 dog
92 door
93 dorothy's
94 doyle
95 drawings
96 dream
97 due
98 easily
99 eastern
100 easy
101 elsie
102 enemy
103 england
104 enjoyed
105 entered
106 eric's
107 escape
108 especially
109 europe
110 eve
111 ever
112 every
113 everything
114 examine
115 example
116 excalibur
117 exciting
118 experience
119 face
120 fact
121 fairies
122 fairly
123 fall
124 families
125 family
126 famous
127 far
128 fear
129 feel
130 feelings
131 find
132 followed
133 forest
134 frankfurt
135 frightened
136 front
137 funeral
138 future
139 gave
140 gewain
141 glad
142 glinda
143 gold
144 government
145 graduating
146 grand
147 grandfather
148 great
149 greed
150 green
151 grew
152 guessed
153 happening
154 happy
155 hated
156 headed
157 held
158 helen
159 helped
160 hermia
161 hermia's
162 high
163 hilton
164 hindi
165 hindu
166 hindus
167 hit
168 hitler
169 honest
170 hope
171 horror
172 imagined
173 importance
174 impressed
175 indian
176 indians
177 inspector
178 interested
179 invited
180 ioc
181 job
182 joined
183 jonathan
184 julia
185 kill
186 knew
187 knights
188 know
189 lady
190 lake
191 land
192 late
193 lawrence
194 lawyer
195 leave
196 leaving
197 leroux
198 letter
199 liked
200 live
201 lives
202 looked
203 lot
204 managers
205 marley
206 marriage
207 married
208 marry
209 martin
210 masquerade
211 material
212 may
213 meeting
214 men
215 might
216 milne
217 mistake
218 money
219 montague
220 moreover
221 morning
222 move
223 moved
224 much
225 murder
226 musical
227 muslim
228 muslims
229 must
230 mysterious
231 nations
232 nature
233 nephew
234 next
235 night
236 non
237 nonviolence
238 norfolk
239 noticed
240 now
241 oberon
242 offered
243 often
244 opened
245 order
246 ordered
247 orders
248 overcame
249 oxford
250 painted
251 pakistan
252 paper
253 parents
254 paris
255 participating
256 party
257 passed
258 past
259 peaceful
260 pearson's
261 people's
262 performance
263 peter
264 peterson
265 places
266 planning
267 pleased
268 poor
269 power
270 prayer
271 problems
272 promised
273 proud
274 provision
275 puck
276 pursued
277 put
278 quickly
279 racial
280 raoul
281 raul
282 realized
283 really
284 received
285 red
286 red-headed
287 refused
288 relationship
289 religions
290 remember
291 request
292 resisted
293 returned
294 round
295 ruby
296 rule
297 run
298 sad
299 salahadin
300 salt
301 scary
302 scene
303 scenes
304 second
305 seeing
306 seeking
307 seemed
308 seen
309 selfish
310 sent
311 series
312 serious
313 servants
314 seven
315 several
316 shakespeare
317 sherlock
318 shocked
319 shop
320 shot
321 showed
322 sick
323 similar
324 since
325 sing
326 sister
327 situation
328 sleep
329 small
330 snake
331 son
332 sound
333 spaulding
334 spirits
335 sports
336 spring
337 spyri
338 stage
339 stand
340 state
341 states
342 stepfather's
343 stingy
344 stoker
345 stolen
346 stopped
347 stories
348 strong
349 student
350 studied
351 subordinates
352 substitute
353 succeeded
354 suddenly
355 supremacy
356 sure
357 surprisingly
358 sword
359 system
360 table
361 take
362 taken
363 talent
364 task
365 taught
366 tell
367 tells
368 terrible
369 thanks
370 therefore
371 though
372 tired
373 together
374 took
375 toto
376 train
377 true
378 ugly
379 uk
380 unfortunately
381 united
382 university
383 unreasonable
384 unusual
385 use
386 various
387 verona
388 violin
389 visit
390 visited
391 voice
392 vowed
393 walked
394 wants
395 wars
396 watson
397 wedding
398 well
399 west
400 whether
401 white
402 wicked
403 wife
404 wilson
405 wish
406 without
407 woman
408 won
409 wonderful
410 word
411 words
412 written
413 ww2
414 young
Punctuation point: You need to put a space after a period.
# Find tokens in which a period is immediately followed by a non-space
# character (e.g. "death.The"), i.e. a missing space after the period.
# kwic() is given a tokens object because its corpus method is deprecated,
# and the matches are turned into a plain data frame before the keyword
# column is picked, which avoids a multi-row summarise().
x %>%
  tokens() %>%
  kwic(pattern = "\\.\\S", window = 10, valuetype = "regex") %>%
  as.data.frame() %>%
  select(keyword)
keyword
1 death.The
2 other.However
3 father.Moreover
4 Athens.So
5 them.Moreover
6 too.So
7 there.One
8 Helena.So
9 Helena.However
10 Helena.After
11 too.Then
12 violently.To
13 dangerous.He
14 magically.Meanwhile
15 Lysander.Afterward
16 died.In
17 brave.Finally
18 happy.I
19 on.Its
20 story.So
21 https://elib.maruzen.co.jp/elib/html/BookDetail/Id/3000006397?6
22 other.Not
23 https://elib.maruzen.co.jp/elib/html/BookDetail/Id/3000006413?3
24 Mr.Sesemann
25 Dracula.There
Summary word count
# Density plot of the number of tokens per summary.
# The texts are tokenised explicitly: calling ntoken() on raw character
# input relies on an implicit, deprecated tokenisation step.
df <- data.frame(summary)
df$words <- ntoken(tokens(summary))
ggplot(df, aes(x = words)) +
  geom_density() +
  # The density scale itself is not of interest, so hide the y-axis labels.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

Opinion word count
# Density plot of the number of tokens per opinion.
# The texts are tokenised explicitly: calling ntoken() on raw character
# input relies on an implicit, deprecated tokenisation step.
df <- data.frame(opinion)
df$words <- ntoken(tokens(opinion))
ggplot(df, aes(x = words)) +
  geom_density() +
  # The density scale itself is not of interest, so hide the y-axis labels.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
