library(dplyr)
library(tidytext)
# Toy corpus: one word per row, with `id` giving its position in the sentence.
# tibble() is used because data_frame() is deprecated since tibble 1.1.0.
text <- tibble(
  id = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
  word = c(
    "holiday", "makes", "me", "happy", "but", "this", "song", "is", "sad"
  )
)
text
## # A tibble: 9 x 2
## id word
## <dbl> <chr>
## 1 1 holiday
## 2 2 makes
## 3 3 me
## 4 4 happy
## 5 5 but
## 6 6 this
## 7 7 song
## 8 8 is
## 9 9 sad
# Toy sentiment lexicon: one word per row with its polarity label.
# tibble() is used because data_frame() is deprecated since tibble 1.1.0.
lexicon <- tibble(
  word = c("happy", "sad", "holiday", "funeral"),
  sentiment = c("positive", "negative", "positive", "negative")
)
lexicon
## # A tibble: 4 x 2
## word sentiment
## <chr> <chr>
## 1 happy positive
## 2 sad negative
## 3 holiday positive
## 4 funeral negative
# Keep only the words that appear in the lexicon and attach their sentiment.
# The key is stated explicitly so the join does not depend on whichever
# column names the two tables happen to share.
inner_join(text, lexicon, by = "word")
## # A tibble: 3 x 3
## id word sentiment
## <dbl> <chr> <chr>
## 1 1 holiday positive
## 2 4 happy positive
## 3 9 sad negative
So, what sentiment analysis needs is a sentiment lexicon in a tidy data format. The textdata
package provides each lexicon in that format, with one emotion word (unigram) per row.
library(textdata)
# Print the Bing lexicon; the output below shows a two-column tibble
# (`word`, `sentiment`) with one word per row.
lexicon_bing()
## # A tibble: 6,787 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,777 more rows
# Tally how many Bing lexicon entries carry each sentiment label.
count(lexicon_bing(), sentiment)
## # A tibble: 2 x 2
## sentiment n
## * <chr> <int>
## 1 negative 4782
## 2 positive 2005
# Print the NRC lexicon; the output below shows a two-column tibble
# (`word`, `sentiment`) in which a word can appear on several rows,
# once per label attached to it.
lexicon_nrc()
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
# Tally how many NRC lexicon entries carry each sentiment/emotion label.
count(lexicon_nrc(), sentiment)
## # A tibble: 10 x 2
## sentiment n
## * <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231
# Print the NRC emotion intensity lexicon; the output below shows three
# columns: `term`, a numeric `score`, and `AffectDimension`.
lexicon_nrc_eil()
## # A tibble: 5,814 x 3
## term score AffectDimension
## <chr> <dbl> <chr>
## 1 outraged 0.964 anger
## 2 brutality 0.959 anger
## 3 hatred 0.953 anger
## 4 hateful 0.94 anger
## 5 terrorize 0.939 anger
## 6 infuriated 0.938 anger
## 7 violently 0.938 anger
## 8 furious 0.929 anger
## 9 enraged 0.927 anger
## 10 furiously 0.927 anger
## # ... with 5,804 more rows
# Tally how many intensity-lexicon terms fall under each affect dimension.
count(lexicon_nrc_eil(), AffectDimension)
## # A tibble: 4 x 2
## AffectDimension n
## * <chr> <int>
## 1 anger 1483
## 2 fear 1765
## 3 joy 1268
## 4 sadness 1298
`floor_date()` (lubridate) allows us to do this by rounding each timestamp down to a chosen unit; using "hour" works well for tracing the hourly change in tweets.
### Collected tweets including "#covid19" or "#covid-19" or "#coronavirus" on April 23rd, 2020.
# Restore the saved objects from the .RData file into the workspace, then
# print `covid_tweets` -- presumably the object this file provides
# (TODO confirm the file's contents).
load("covid_tweets_423.RData")
covid_tweets
## # A tibble: 18,224 x 9
## user_id status_id created_at screen_name text lang country lat
## <chr> <chr> <dttm> <chr> <chr> <chr> <chr> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ "@ev~ en United~ 36.0
## 2 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ "Ple~ en United~ 36.9
## 3 2155830~ 12533658~ 2020-04-23 16:51:09 KOROGLU_BA~ "@Ay~ tr Azerba~ 40.2
## 4 7445974~ 12533657~ 2020-04-23 16:51:05 FoodFocusSA "Pre~ en South ~ -26.1
## 5 1558777~ 12533657~ 2020-04-23 16:51:01 opcionsecu~ "#AT~ es Ecuador -1.67
## 6 9989605~ 12533657~ 2020-04-23 16:51:01 amystones4 "Tha~ en United~ 53.7
## 7 1027687~ 12533657~ 2020-04-23 16:51:00 COTACYT "Men~ es Mexico 23.7
## 8 2473827~ 12533657~ 2020-04-23 16:50:54 bkracing123 "The~ en United~ 53.9
## 9 17566234 12533657~ 2020-04-23 16:50:51 AnnStrahm "Thi~ en United~ 37.5
## 10 2267079~ 12533656~ 2020-04-23 16:50:42 JLeonRojas "INF~ es Chile -35.5
## # ... with 18,214 more rows, and 1 more variable: lng <dbl>
library(lubridate)
# Bucket every tweet into the clock hour it was created in and count the
# tweets per bucket; count() computes the `hour` column on the fly.
covid_tweets_hours <- count(
  covid_tweets,
  hour = floor_date(created_at, unit = "hour")
)
covid_tweets_hours
## # A tibble: 20 x 2
## hour n
## * <dttm> <int>
## 1 2020-04-22 21:00:00 986
## 2 2020-04-22 22:00:00 986
## 3 2020-04-22 23:00:00 847
## 4 2020-04-23 00:00:00 846
## 5 2020-04-23 01:00:00 836
## 6 2020-04-23 02:00:00 955
## 7 2020-04-23 03:00:00 928
## 8 2020-04-23 04:00:00 692
## 9 2020-04-23 05:00:00 682
## 10 2020-04-23 06:00:00 657
## 11 2020-04-23 07:00:00 716
## 12 2020-04-23 08:00:00 651
## 13 2020-04-23 09:00:00 662
## 14 2020-04-23 10:00:00 721
## 15 2020-04-23 11:00:00 863
## 16 2020-04-23 12:00:00 1091
## 17 2020-04-23 13:00:00 1155
## 18 2020-04-23 14:00:00 1191
## 19 2020-04-23 15:00:00 1430
## 20 2020-04-23 16:00:00 1329
library(ggplot2)
# Line chart of the hourly tweet counts computed above.
ggplot(data = covid_tweets_hours, mapping = aes(x = hour, y = n)) +
  geom_line() +
  labs(
    title = "Tracing the rhythm of hashtagging about COVID-19 on Twitter",
    subtitle = "Tweets (N=18,224) were aggregated in 1-hour intervals. Retweets were excluded.",
    x = NULL,
    y = "Hourly Sum"
  ) +
  theme_bw()
# Number of tweets per language code, most frequent language first.
covid_tweets_lang <- count(covid_tweets, lang, sort = TRUE)
covid_tweets_lang
## # A tibble: 55 x 2
## lang n
## <chr> <int>
## 1 en 10115
## 2 es 2666
## 3 in 1485
## 4 pt 932
## 5 und 842
## 6 fr 527
## 7 hi 289
## 8 it 227
## 9 de 141
## 10 ja 138
## # ... with 45 more rows
#install.packages("ISOcodes")
library(ISOcodes)
# Keep the two-letter code (renamed to `lang`) and the language name.
dplyr::select(ISO_639_2, lang = Alpha_2, Name)
## lang
## 1 aa
## 2 ab
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## 7 <NA>
## 8 <NA>
## 9 af
## 10 <NA>
## 11 ak
## 12 <NA>
## 13 sq
## 14 <NA>
## 15 <NA>
## 16 <NA>
## 17 am
## 18 <NA>
## 19 <NA>
## 20 <NA>
## 21 ar
## 22 <NA>
## 23 an
## 24 hy
## 25 <NA>
## 26 <NA>
## 27 <NA>
## 28 <NA>
## 29 as
## 30 <NA>
## 31 <NA>
## 32 <NA>
## 33 av
## 34 ae
## 35 <NA>
## 36 ay
## 37 az
## 38 <NA>
## 39 <NA>
## 40 ba
## 41 <NA>
## 42 bm
## 43 <NA>
## 44 eu
## 45 <NA>
## 46 <NA>
## 47 <NA>
## 48 be
## 49 <NA>
## 50 bn
## 51 <NA>
## 52 <NA>
## 53 bh
## 54 <NA>
## 55 <NA>
## 56 bi
## 57 <NA>
## 58 <NA>
## 59 bs
## 60 <NA>
## 61 br
## 62 <NA>
## 63 <NA>
## 64 <NA>
## 65 bg
## 66 my
## 67 <NA>
## 68 <NA>
## 69 <NA>
## 70 <NA>
## 71 ca
## 72 <NA>
## 73 <NA>
## 74 <NA>
## 75 ch
## 76 <NA>
## 77 ce
## 78 <NA>
## 79 zh
## 80 <NA>
## 81 <NA>
## 82 <NA>
## 83 <NA>
## 84 <NA>
## 85 <NA>
## 86 cu
## 87 cv
## 88 <NA>
## 89 <NA>
## 90 <NA>
## 91 <NA>
## 92 kw
## 93 co
## 94 <NA>
## 95 <NA>
## 96 <NA>
## 97 cr
## 98 <NA>
## 99 <NA>
## 100 <NA>
## 101 <NA>
## 102 cs
## 103 <NA>
## 104 da
## 105 <NA>
## 106 <NA>
## 107 <NA>
## 108 <NA>
## 109 <NA>
## 110 <NA>
## 111 dv
## 112 <NA>
## 113 <NA>
## 114 <NA>
## 115 <NA>
## 116 <NA>
## 117 nl
## 118 <NA>
## 119 dz
## 120 <NA>
## 121 <NA>
## 122 <NA>
## 123 <NA>
## 124 en
## 125 <NA>
## 126 eo
## 127 et
## 128 ee
## 129 <NA>
## 130 <NA>
## 131 fo
## 132 <NA>
## 133 fj
## 134 <NA>
## 135 fi
## 136 <NA>
## 137 <NA>
## 138 fr
## 139 <NA>
## 140 <NA>
## 141 <NA>
## 142 <NA>
## 143 fy
## 144 ff
## 145 <NA>
## 146 <NA>
## 147 <NA>
## 148 <NA>
## 149 <NA>
## 150 ka
## 151 de
## 152 <NA>
## 153 <NA>
## 154 gd
## 155 ga
## 156 gl
## 157 gv
## 158 <NA>
## 159 <NA>
## 160 <NA>
## 161 <NA>
## 162 <NA>
## 163 <NA>
## 164 <NA>
## 165 el
## 166 gn
## 167 <NA>
## 168 gu
## 169 <NA>
## 170 <NA>
## 171 ht
## 172 ha
## 173 <NA>
## 174 he
## 175 hz
## 176 <NA>
## 177 <NA>
## 178 hi
## 179 <NA>
## 180 <NA>
## 181 ho
## 182 hr
## 183 <NA>
## 184 hu
## 185 <NA>
## 186 <NA>
## 187 ig
## 188 is
## 189 io
## 190 ii
## 191 <NA>
## 192 iu
## 193 ie
## 194 <NA>
## 195 ia
## 196 <NA>
## 197 id
## 198 <NA>
## 199 <NA>
## 200 ik
## 201 <NA>
## 202 <NA>
## 203 it
## 204 jv
## 205 <NA>
## 206 ja
## 207 <NA>
## 208 <NA>
## 209 <NA>
## 210 <NA>
## 211 <NA>
## 212 kl
## 213 <NA>
## 214 kn
## 215 <NA>
## 216 ks
## 217 kr
## 218 <NA>
## 219 kk
## 220 <NA>
## 221 <NA>
## 222 <NA>
## 223 km
## 224 <NA>
## 225 ki
## 226 rw
## 227 ky
## 228 <NA>
## 229 <NA>
## 230 kv
## 231 kg
## 232 ko
## 233 <NA>
## 234 <NA>
## 235 <NA>
## 236 <NA>
## 237 <NA>
## 238 <NA>
## 239 kj
## 240 <NA>
## 241 ku
## 242 <NA>
## 243 <NA>
## 244 <NA>
## 245 <NA>
## 246 lo
## 247 la
## 248 lv
## 249 <NA>
## 250 li
## 251 ln
## 252 lt
## 253 <NA>
## 254 <NA>
## 255 lb
## 256 <NA>
## 257 lu
## 258 lg
## 259 <NA>
## 260 <NA>
## 261 <NA>
## 262 <NA>
## 263 mk
## 264 <NA>
## 265 <NA>
## 266 mh
## 267 <NA>
## 268 <NA>
## 269 ml
## 270 <NA>
## 271 mi
## 272 <NA>
## 273 mr
## 274 <NA>
## 275 ms
## 276 <NA>
## 277 <NA>
## 278 <NA>
## 279 <NA>
## 280 <NA>
## 281 <NA>
## 282 <NA>
## 283 <NA>
## 284 mg
## 285 mt
## 286 <NA>
## 287 <NA>
## 288 <NA>
## 289 <NA>
## 290 mn
## 291 <NA>
## 292 <NA>
## 293 <NA>
## 294 <NA>
## 295 <NA>
## 296 <NA>
## 297 <NA>
## 298 <NA>
## 299 <NA>
## 300 <NA>
## 301 <NA>
## 302 na
## 303 nv
## 304 nr
## 305 nd
## 306 ng
## 307 <NA>
## 308 ne
## 309 <NA>
## 310 <NA>
## 311 <NA>
## 312 <NA>
## 313 nn
## 314 nb
## 315 <NA>
## 316 <NA>
## 317 no
## 318 <NA>
## 319 <NA>
## 320 <NA>
## 321 <NA>
## 322 ny
## 323 <NA>
## 324 <NA>
## 325 <NA>
## 326 <NA>
## 327 oc
## 328 oj
## 329 or
## 330 om
## 331 <NA>
## 332 os
## 333 <NA>
## 334 <NA>
## 335 <NA>
## 336 <NA>
## 337 <NA>
## 338 <NA>
## 339 pa
## 340 <NA>
## 341 <NA>
## 342 <NA>
## 343 fa
## 344 <NA>
## 345 <NA>
## 346 pi
## 347 pl
## 348 <NA>
## 349 pt
## 350 <NA>
## 351 <NA>
## 352 ps
## 354 qu
## 355 <NA>
## 356 <NA>
## 357 <NA>
## 358 <NA>
## 359 rm
## 360 <NA>
## 361 ro
## 362 rn
## 363 <NA>
## 364 ru
## 365 <NA>
## 366 sg
## 367 <NA>
## 368 <NA>
## 369 <NA>
## 370 <NA>
## 371 sa
## 372 <NA>
## 373 <NA>
## 374 <NA>
## 375 <NA>
## 376 <NA>
## 377 <NA>
## 378 <NA>
## 379 <NA>
## 380 <NA>
## 381 <NA>
## 382 si
## 383 <NA>
## 384 <NA>
## 385 <NA>
## 386 sk
## 387 sl
## 388 <NA>
## 389 se
## 390 <NA>
## 391 <NA>
## 392 <NA>
## 393 sm
## 394 <NA>
## 395 sn
## 396 sd
## 397 <NA>
## 398 <NA>
## 399 so
## 400 <NA>
## 401 st
## 402 es
## 403 sc
## 404 <NA>
## 405 sr
## 406 <NA>
## 407 <NA>
## 408 ss
## 409 <NA>
## 410 su
## 411 <NA>
## 412 <NA>
## 413 sw
## 414 sv
## 415 <NA>
## 416 <NA>
## 417 ty
## 418 <NA>
## 419 ta
## 420 tt
## 421 te
## 422 <NA>
## 423 <NA>
## 424 <NA>
## 425 tg
## 426 tl
## 427 th
## 428 bo
## 429 <NA>
## 430 ti
## 431 <NA>
## 432 <NA>
## 433 <NA>
## 434 <NA>
## 435 <NA>
## 436 <NA>
## 437 to
## 438 <NA>
## 439 <NA>
## 440 tn
## 441 ts
## 442 tk
## 443 <NA>
## 444 <NA>
## 445 tr
## 446 <NA>
## 447 <NA>
## 448 tw
## 449 <NA>
## 450 <NA>
## 451 <NA>
## 452 ug
## 453 uk
## 454 <NA>
## 455 <NA>
## 456 ur
## 457 uz
## 458 <NA>
## 459 ve
## 460 vi
## 461 vo
## 462 <NA>
## 463 <NA>
## 464 <NA>
## 465 <NA>
## 466 <NA>
## 467 cy
## 468 <NA>
## 469 wa
## 470 wo
## 471 <NA>
## 472 xh
## 473 <NA>
## 474 <NA>
## 475 yi
## 476 yo
## 477 <NA>
## 478 <NA>
## 479 <NA>
## 480 <NA>
## 481 <NA>
## 482 za
## 483 <NA>
## 484 zu
## 485 <NA>
## 486 <NA>
## 487 <NA>
## Name
## 1 Afar
## 2 Abkhazian
## 3 Achinese
## 4 Acoli
## 5 Adangme
## 6 Adyghe; Adygei
## 7 Afro-Asiatic languages
## 8 Afrihili
## 9 Afrikaans
## 10 Ainu
## 11 Akan
## 12 Akkadian
## 13 Albanian
## 14 Aleut
## 15 Algonquian languages
## 16 Southern Altai
## 17 Amharic
## 18 English, Old (ca.450-1100)
## 19 Angika
## 20 Apache languages
## 21 Arabic
## 22 Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)
## 23 Aragonese
## 24 Armenian
## 25 Mapudungun; Mapuche
## 26 Arapaho
## 27 Artificial languages
## 28 Arawak
## 29 Assamese
## 30 Asturian; Bable; Leonese; Asturleonese
## 31 Athapascan languages
## 32 Australian languages
## 33 Avaric
## 34 Avestan
## 35 Awadhi
## 36 Aymara
## 37 Azerbaijani
## 38 Banda languages
## 39 Bamileke languages
## 40 Bashkir
## 41 Baluchi
## 42 Bambara
## 43 Balinese
## 44 Basque
## 45 Basa
## 46 Baltic languages
## 47 Beja; Bedawiyet
## 48 Belarusian
## 49 Bemba
## 50 Bengali
## 51 Berber languages
## 52 Bhojpuri
## 53 Bihari languages
## 54 Bikol
## 55 Bini; Edo
## 56 Bislama
## 57 Siksika
## 58 Bantu languages
## 59 Bosnian
## 60 Braj
## 61 Breton
## 62 Batak languages
## 63 Buriat
## 64 Buginese
## 65 Bulgarian
## 66 Burmese
## 67 Blin; Bilin
## 68 Caddo
## 69 Central American Indian languages
## 70 Galibi Carib
## 71 Catalan; Valencian
## 72 Caucasian languages
## 73 Cebuano
## 74 Celtic languages
## 75 Chamorro
## 76 Chibcha
## 77 Chechen
## 78 Chagatai
## 79 Chinese
## 80 Chuukese
## 81 Mari
## 82 Chinook jargon
## 83 Choctaw
## 84 Chipewyan; Dene Suline
## 85 Cherokee
## 86 Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic
## 87 Chuvash
## 88 Cheyenne
## 89 Chamic languages
## 90 Montenegrin
## 91 Coptic
## 92 Cornish
## 93 Corsican
## 94 Creoles and pidgins, English based
## 95 Creoles and pidgins, French-based
## 96 Creoles and pidgins, Portuguese-based
## 97 Cree
## 98 Crimean Tatar; Crimean Turkish
## 99 Creoles and pidgins
## 100 Kashubian
## 101 Cushitic languages
## 102 Czech
## 103 Dakota
## 104 Danish
## 105 Dargwa
## 106 Land Dayak languages
## 107 Delaware
## 108 Slave (Athapascan)
## 109 Dogrib
## 110 Dinka
## 111 Divehi; Dhivehi; Maldivian
## 112 Dogri
## 113 Dravidian languages
## 114 Lower Sorbian
## 115 Duala
## 116 Dutch, Middle (ca.1050-1350)
## 117 Dutch; Flemish
## 118 Dyula
## 119 Dzongkha
## 120 Efik
## 121 Egyptian (Ancient)
## 122 Ekajuk
## 123 Elamite
## 124 English
## 125 English, Middle (1100-1500)
## 126 Esperanto
## 127 Estonian
## 128 Ewe
## 129 Ewondo
## 130 Fang
## 131 Faroese
## 132 Fanti
## 133 Fijian
## 134 Filipino; Pilipino
## 135 Finnish
## 136 Finno-Ugrian languages
## 137 Fon
## 138 French
## 139 French, Middle (ca.1400-1600)
## 140 French, Old (842-ca.1400)
## 141 Northern Frisian
## 142 Eastern Frisian
## 143 Western Frisian
## 144 Fulah
## 145 Friulian
## 146 Ga
## 147 Gayo
## 148 Gbaya
## 149 Germanic languages
## 150 Georgian
## 151 German
## 152 Geez
## 153 Gilbertese
## 154 Gaelic; Scottish Gaelic
## 155 Irish
## 156 Galician
## 157 Manx
## 158 German, Middle High (ca.1050-1500)
## 159 German, Old High (ca.750-1050)
## 160 Gondi
## 161 Gorontalo
## 162 Gothic
## 163 Grebo
## 164 Greek, Ancient (to 1453)
## 165 Greek, Modern (1453-)
## 166 Guarani
## 167 Swiss German; Alemannic; Alsatian
## 168 Gujarati
## 169 Gwich'in
## 170 Haida
## 171 Haitian; Haitian Creole
## 172 Hausa
## 173 Hawaiian
## 174 Hebrew
## 175 Herero
## 176 Hiligaynon
## 177 Himachali languages; Western Pahari languages
## 178 Hindi
## 179 Hittite
## 180 Hmong; Mong
## 181 Hiri Motu
## 182 Croatian
## 183 Upper Sorbian
## 184 Hungarian
## 185 Hupa
## 186 Iban
## 187 Igbo
## 188 Icelandic
## 189 Ido
## 190 Sichuan Yi; Nuosu
## 191 Ijo languages
## 192 Inuktitut
## 193 Interlingue; Occidental
## 194 Iloko
## 195 Interlingua (International Auxiliary Language Association)
## 196 Indic languages
## 197 Indonesian
## 198 Indo-European languages
## 199 Ingush
## 200 Inupiaq
## 201 Iranian languages
## 202 Iroquoian languages
## 203 Italian
## 204 Javanese
## 205 Lojban
## 206 Japanese
## 207 Judeo-Persian
## 208 Judeo-Arabic
## 209 Kara-Kalpak
## 210 Kabyle
## 211 Kachin; Jingpho
## 212 Kalaallisut; Greenlandic
## 213 Kamba
## 214 Kannada
## 215 Karen languages
## 216 Kashmiri
## 217 Kanuri
## 218 Kawi
## 219 Kazakh
## 220 Kabardian
## 221 Khasi
## 222 Khoisan languages
## 223 Central Khmer
## 224 Khotanese; Sakan
## 225 Kikuyu; Gikuyu
## 226 Kinyarwanda
## 227 Kirghiz; Kyrgyz
## 228 Kimbundu
## 229 Konkani
## 230 Komi
## 231 Kongo
## 232 Korean
## 233 Kosraean
## 234 Kpelle
## 235 Karachay-Balkar
## 236 Karelian
## 237 Kru languages
## 238 Kurukh
## 239 Kuanyama; Kwanyama
## 240 Kumyk
## 241 Kurdish
## 242 Kutenai
## 243 Ladino
## 244 Lahnda
## 245 Lamba
## 246 Lao
## 247 Latin
## 248 Latvian
## 249 Lezghian
## 250 Limburgan; Limburger; Limburgish
## 251 Lingala
## 252 Lithuanian
## 253 Mongo
## 254 Lozi
## 255 Luxembourgish; Letzeburgesch
## 256 Luba-Lulua
## 257 Luba-Katanga
## 258 Ganda
## 259 Luiseno
## 260 Lunda
## 261 Luo (Kenya and Tanzania)
## 262 Lushai
## 263 Macedonian
## 264 Madurese
## 265 Magahi
## 266 Marshallese
## 267 Maithili
## 268 Makasar
## 269 Malayalam
## 270 Mandingo
## 271 Maori
## 272 Austronesian languages
## 273 Marathi
## 274 Masai
## 275 Malay
## 276 Moksha
## 277 Mandar
## 278 Mende
## 279 Irish, Middle (900-1200)
## 280 Mi'kmaq; Micmac
## 281 Minangkabau
## 282 Uncoded languages
## 283 Mon-Khmer languages
## 284 Malagasy
## 285 Maltese
## 286 Manchu
## 287 Manipuri
## 288 Manobo languages
## 289 Mohawk
## 290 Mongolian
## 291 Mossi
## 292 Multiple languages
## 293 Munda languages
## 294 Creek
## 295 Mirandese
## 296 Marwari
## 297 Mayan languages
## 298 Erzya
## 299 Nahuatl languages
## 300 North American Indian languages
## 301 Neapolitan
## 302 Nauru
## 303 Navajo; Navaho
## 304 Ndebele, South; South Ndebele
## 305 Ndebele, North; North Ndebele
## 306 Ndonga
## 307 Low German; Low Saxon; German, Low; Saxon, Low
## 308 Nepali
## 309 Nepal Bhasa; Newari
## 310 Nias
## 311 Niger-Kordofanian languages
## 312 Niuean
## 313 Norwegian Nynorsk; Nynorsk, Norwegian
## 314 Bokmal, Norwegian; Norwegian Bokmal
## 315 Nogai
## 316 Norse, Old
## 317 Norwegian
## 318 N'Ko
## 319 Pedi; Sepedi; Northern Sotho
## 320 Nubian languages
## 321 Classical Newari; Old Newari; Classical Nepal Bhasa
## 322 Chichewa; Chewa; Nyanja
## 323 Nyamwezi
## 324 Nyankole
## 325 Nyoro
## 326 Nzima
## 327 Occitan (post 1500)
## 328 Ojibwa
## 329 Oriya
## 330 Oromo
## 331 Osage
## 332 Ossetian; Ossetic
## 333 Turkish, Ottoman (1500-1928)
## 334 Otomian languages
## 335 Papuan languages
## 336 Pangasinan
## 337 Pahlavi
## 338 Pampanga; Kapampangan
## 339 Panjabi; Punjabi
## 340 Papiamento
## 341 Palauan
## 342 Persian, Old (ca.600-400 B.C.)
## 343 Persian
## 344 Philippine languages
## 345 Phoenician
## 346 Pali
## 347 Polish
## 348 Pohnpeian
## 349 Portuguese
## 350 Prakrit languages
## 351 Provencal, Old (to 1500); Occitan, Old (to 1500)
## 352 Pushto; Pashto
## 354 Quechua
## 355 Rajasthani
## 356 Rapanui
## 357 Rarotongan; Cook Islands Maori
## 358 Romance languages
## 359 Romansh
## 360 Romany
## 361 Romanian; Moldavian; Moldovan
## 362 Rundi
## 363 Aromanian; Arumanian; Macedo-Romanian
## 364 Russian
## 365 Sandawe
## 366 Sango
## 367 Yakut
## 368 South American Indian languages
## 369 Salishan languages
## 370 Samaritan Aramaic
## 371 Sanskrit
## 372 Sasak
## 373 Santali
## 374 Sicilian
## 375 Scots
## 376 Selkup
## 377 Semitic languages
## 378 Irish, Old (to 900)
## 379 Sign Languages
## 380 Shan
## 381 Sidamo
## 382 Sinhala; Sinhalese
## 383 Siouan languages
## 384 Sino-Tibetan languages
## 385 Slavic languages
## 386 Slovak
## 387 Slovenian
## 388 Southern Sami
## 389 Northern Sami
## 390 Sami languages
## 391 Lule Sami
## 392 Inari Sami
## 393 Samoan
## 394 Skolt Sami
## 395 Shona
## 396 Sindhi
## 397 Soninke
## 398 Sogdian
## 399 Somali
## 400 Songhai languages
## 401 Sotho, Southern
## 402 Spanish; Castilian
## 403 Sardinian
## 404 Sranan Tongo
## 405 Serbian
## 406 Serer
## 407 Nilo-Saharan languages
## 408 Swati
## 409 Sukuma
## 410 Sundanese
## 411 Susu
## 412 Sumerian
## 413 Swahili
## 414 Swedish
## 415 Classical Syriac
## 416 Syriac
## 417 Tahitian
## 418 Tai languages
## 419 Tamil
## 420 Tatar
## 421 Telugu
## 422 Timne
## 423 Tereno
## 424 Tetum
## 425 Tajik
## 426 Tagalog
## 427 Thai
## 428 Tibetan
## 429 Tigre
## 430 Tigrinya
## 431 Tiv
## 432 Tokelau
## 433 Klingon; tlhIngan-Hol
## 434 Tlingit
## 435 Tamashek
## 436 Tonga (Nyasa)
## 437 Tonga (Tonga Islands)
## 438 Tok Pisin
## 439 Tsimshian
## 440 Tswana
## 441 Tsonga
## 442 Turkmen
## 443 Tumbuka
## 444 Tupi languages
## 445 Turkish
## 446 Altaic languages
## 447 Tuvalu
## 448 Twi
## 449 Tuvinian
## 450 Udmurt
## 451 Ugaritic
## 452 Uighur; Uyghur
## 453 Ukrainian
## 454 Umbundu
## 455 Undetermined
## 456 Urdu
## 457 Uzbek
## 458 Vai
## 459 Venda
## 460 Vietnamese
## 461 Volapuk
## 462 Votic
## 463 Wakashan languages
## 464 Wolaitta; Wolaytta
## 465 Waray
## 466 Washo
## 467 Welsh
## 468 Sorbian languages
## 469 Walloon
## 470 Wolof
## 471 Kalmyk; Oirat
## 472 Xhosa
## 473 Yao
## 474 Yapese
## 475 Yiddish
## 476 Yoruba
## 477 Yupik languages
## 478 Zapotec
## 479 Blissymbols; Blissymbolics; Bliss
## 480 Zenaga
## 481 Standard Moroccan Tamazight
## 482 Zhuang; Chuang
## 483 Zande languages
## 484 Zulu
## 485 Zuni
## 486 No linguistic content; Not applicable
## 487 Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki
# Print the complete ISO_639_2 table; the output below lists every row with
# the Alpha_3_B, Alpha_3_T, Alpha_2 and Name columns.
ISO_639_2
## Alpha_3_B Alpha_3_T Alpha_2
## 1 aar aar aa
## 2 abk abk ab
## 3 ace ace <NA>
## 4 ach ach <NA>
## 5 ada ada <NA>
## 6 ady ady <NA>
## 7 afa afa <NA>
## 8 afh afh <NA>
## 9 afr afr af
## 10 ain ain <NA>
## 11 aka aka ak
## 12 akk akk <NA>
## 13 alb sqi sq
## 14 ale ale <NA>
## 15 alg alg <NA>
## 16 alt alt <NA>
## 17 amh amh am
## 18 ang ang <NA>
## 19 anp anp <NA>
## 20 apa apa <NA>
## 21 ara ara ar
## 22 arc arc <NA>
## 23 arg arg an
## 24 arm hye hy
## 25 arn arn <NA>
## 26 arp arp <NA>
## 27 art art <NA>
## 28 arw arw <NA>
## 29 asm asm as
## 30 ast ast <NA>
## 31 ath ath <NA>
## 32 aus aus <NA>
## 33 ava ava av
## 34 ave ave ae
## 35 awa awa <NA>
## 36 aym aym ay
## 37 aze aze az
## 38 bad bad <NA>
## 39 bai bai <NA>
## 40 bak bak ba
## 41 bal bal <NA>
## 42 bam bam bm
## 43 ban ban <NA>
## 44 baq eus eu
## 45 bas bas <NA>
## 46 bat bat <NA>
## 47 bej bej <NA>
## 48 bel bel be
## 49 bem bem <NA>
## 50 ben ben bn
## 51 ber ber <NA>
## 52 bho bho <NA>
## 53 bih bih bh
## 54 bik bik <NA>
## 55 bin bin <NA>
## 56 bis bis bi
## 57 bla bla <NA>
## 58 bnt bnt <NA>
## 59 bos bos bs
## 60 bra bra <NA>
## 61 bre bre br
## 62 btk btk <NA>
## 63 bua bua <NA>
## 64 bug bug <NA>
## 65 bul bul bg
## 66 bur mya my
## 67 byn byn <NA>
## 68 cad cad <NA>
## 69 cai cai <NA>
## 70 car car <NA>
## 71 cat cat ca
## 72 cau cau <NA>
## 73 ceb ceb <NA>
## 74 cel cel <NA>
## 75 cha cha ch
## 76 chb chb <NA>
## 77 che che ce
## 78 chg chg <NA>
## 79 chi zho zh
## 80 chk chk <NA>
## 81 chm chm <NA>
## 82 chn chn <NA>
## 83 cho cho <NA>
## 84 chp chp <NA>
## 85 chr chr <NA>
## 86 chu chu cu
## 87 chv chv cv
## 88 chy chy <NA>
## 89 cmc cmc <NA>
## 90 cnr cnr <NA>
## 91 cop cop <NA>
## 92 cor cor kw
## 93 cos cos co
## 94 cpe cpe <NA>
## 95 cpf cpf <NA>
## 96 cpp cpp <NA>
## 97 cre cre cr
## 98 crh crh <NA>
## 99 crp crp <NA>
## 100 csb csb <NA>
## 101 cus cus <NA>
## 102 cze ces cs
## 103 dak dak <NA>
## 104 dan dan da
## 105 dar dar <NA>
## 106 day day <NA>
## 107 del del <NA>
## 108 den den <NA>
## 109 dgr dgr <NA>
## 110 din din <NA>
## 111 div div dv
## 112 doi doi <NA>
## 113 dra dra <NA>
## 114 dsb dsb <NA>
## 115 dua dua <NA>
## 116 dum dum <NA>
## 117 dut nld nl
## 118 dyu dyu <NA>
## 119 dzo dzo dz
## 120 efi efi <NA>
## 121 egy egy <NA>
## 122 eka eka <NA>
## 123 elx elx <NA>
## 124 eng eng en
## 125 enm enm <NA>
## 126 epo epo eo
## 127 est est et
## 128 ewe ewe ee
## 129 ewo ewo <NA>
## 130 fan fan <NA>
## 131 fao fao fo
## 132 fat fat <NA>
## 133 fij fij fj
## 134 fil fil <NA>
## 135 fin fin fi
## 136 fiu fiu <NA>
## 137 fon fon <NA>
## 138 fre fra fr
## 139 frm frm <NA>
## 140 fro fro <NA>
## 141 frr frr <NA>
## 142 frs frs <NA>
## 143 fry fry fy
## 144 ful ful ff
## 145 fur fur <NA>
## 146 gaa gaa <NA>
## 147 gay gay <NA>
## 148 gba gba <NA>
## 149 gem gem <NA>
## 150 geo kat ka
## 151 ger deu de
## 152 gez gez <NA>
## 153 gil gil <NA>
## 154 gla gla gd
## 155 gle gle ga
## 156 glg glg gl
## 157 glv glv gv
## 158 gmh gmh <NA>
## 159 goh goh <NA>
## 160 gon gon <NA>
## 161 gor gor <NA>
## 162 got got <NA>
## 163 grb grb <NA>
## 164 grc grc <NA>
## 165 gre ell el
## 166 grn grn gn
## 167 gsw gsw <NA>
## 168 guj guj gu
## 169 gwi gwi <NA>
## 170 hai hai <NA>
## 171 hat hat ht
## 172 hau hau ha
## 173 haw haw <NA>
## 174 heb heb he
## 175 her her hz
## 176 hil hil <NA>
## 177 him him <NA>
## 178 hin hin hi
## 179 hit hit <NA>
## 180 hmn hmn <NA>
## 181 hmo hmo ho
## 182 hrv hrv hr
## 183 hsb hsb <NA>
## 184 hun hun hu
## 185 hup hup <NA>
## 186 iba iba <NA>
## 187 ibo ibo ig
## 188 ice isl is
## 189 ido ido io
## 190 iii iii ii
## 191 ijo ijo <NA>
## 192 iku iku iu
## 193 ile ile ie
## 194 ilo ilo <NA>
## 195 ina ina ia
## 196 inc inc <NA>
## 197 ind ind id
## 198 ine ine <NA>
## 199 inh inh <NA>
## 200 ipk ipk ik
## 201 ira ira <NA>
## 202 iro iro <NA>
## 203 ita ita it
## 204 jav jav jv
## 205 jbo jbo <NA>
## 206 jpn jpn ja
## 207 jpr jpr <NA>
## 208 jrb jrb <NA>
## 209 kaa kaa <NA>
## 210 kab kab <NA>
## 211 kac kac <NA>
## 212 kal kal kl
## 213 kam kam <NA>
## 214 kan kan kn
## 215 kar kar <NA>
## 216 kas kas ks
## 217 kau kau kr
## 218 kaw kaw <NA>
## 219 kaz kaz kk
## 220 kbd kbd <NA>
## 221 kha kha <NA>
## 222 khi khi <NA>
## 223 khm khm km
## 224 kho kho <NA>
## 225 kik kik ki
## 226 kin kin rw
## 227 kir kir ky
## 228 kmb kmb <NA>
## 229 kok kok <NA>
## 230 kom kom kv
## 231 kon kon kg
## 232 kor kor ko
## 233 kos kos <NA>
## 234 kpe kpe <NA>
## 235 krc krc <NA>
## 236 krl krl <NA>
## 237 kro kro <NA>
## 238 kru kru <NA>
## 239 kua kua kj
## 240 kum kum <NA>
## 241 kur kur ku
## 242 kut kut <NA>
## 243 lad lad <NA>
## 244 lah lah <NA>
## 245 lam lam <NA>
## 246 lao lao lo
## 247 lat lat la
## 248 lav lav lv
## 249 lez lez <NA>
## 250 lim lim li
## 251 lin lin ln
## 252 lit lit lt
## 253 lol lol <NA>
## 254 loz loz <NA>
## 255 ltz ltz lb
## 256 lua lua <NA>
## 257 lub lub lu
## 258 lug lug lg
## 259 lui lui <NA>
## 260 lun lun <NA>
## 261 luo luo <NA>
## 262 lus lus <NA>
## 263 mac mkd mk
## 264 mad mad <NA>
## 265 mag mag <NA>
## 266 mah mah mh
## 267 mai mai <NA>
## 268 mak mak <NA>
## 269 mal mal ml
## 270 man man <NA>
## 271 mao mri mi
## 272 map map <NA>
## 273 mar mar mr
## 274 mas mas <NA>
## 275 may msa ms
## 276 mdf mdf <NA>
## 277 mdr mdr <NA>
## 278 men men <NA>
## 279 mga mga <NA>
## 280 mic mic <NA>
## 281 min min <NA>
## 282 mis mis <NA>
## 283 mkh mkh <NA>
## 284 mlg mlg mg
## 285 mlt mlt mt
## 286 mnc mnc <NA>
## 287 mni mni <NA>
## 288 mno mno <NA>
## 289 moh moh <NA>
## 290 mon mon mn
## 291 mos mos <NA>
## 292 mul mul <NA>
## 293 mun mun <NA>
## 294 mus mus <NA>
## 295 mwl mwl <NA>
## 296 mwr mwr <NA>
## 297 myn myn <NA>
## 298 myv myv <NA>
## 299 nah nah <NA>
## 300 nai nai <NA>
## 301 nap nap <NA>
## 302 nau nau na
## 303 nav nav nv
## 304 nbl nbl nr
## 305 nde nde nd
## 306 ndo ndo ng
## 307 nds nds <NA>
## 308 nep nep ne
## 309 new new <NA>
## 310 nia nia <NA>
## 311 nic nic <NA>
## 312 niu niu <NA>
## 313 nno nno nn
## 314 nob nob nb
## 315 nog nog <NA>
## 316 non non <NA>
## 317 nor nor no
## 318 nqo nqo <NA>
## 319 nso nso <NA>
## 320 nub nub <NA>
## 321 nwc nwc <NA>
## 322 nya nya ny
## 323 nym nym <NA>
## 324 nyn nyn <NA>
## 325 nyo nyo <NA>
## 326 nzi nzi <NA>
## 327 oci oci oc
## 328 oji oji oj
## 329 ori ori or
## 330 orm orm om
## 331 osa osa <NA>
## 332 oss oss os
## 333 ota ota <NA>
## 334 oto oto <NA>
## 335 paa paa <NA>
## 336 pag pag <NA>
## 337 pal pal <NA>
## 338 pam pam <NA>
## 339 pan pan pa
## 340 pap pap <NA>
## 341 pau pau <NA>
## 342 peo peo <NA>
## 343 per fas fa
## 344 phi phi <NA>
## 345 phn phn <NA>
## 346 pli pli pi
## 347 pol pol pl
## 348 pon pon <NA>
## 349 por por pt
## 350 pra pra <NA>
## 351 pro pro <NA>
## 352 pus pus ps
## 354 que que qu
## 355 raj raj <NA>
## 356 rap rap <NA>
## 357 rar rar <NA>
## 358 roa roa <NA>
## 359 roh roh rm
## 360 rom rom <NA>
## 361 rum ron ro
## 362 run run rn
## 363 rup rup <NA>
## 364 rus rus ru
## 365 sad sad <NA>
## 366 sag sag sg
## 367 sah sah <NA>
## 368 sai sai <NA>
## 369 sal sal <NA>
## 370 sam sam <NA>
## 371 san san sa
## 372 sas sas <NA>
## 373 sat sat <NA>
## 374 scn scn <NA>
## 375 sco sco <NA>
## 376 sel sel <NA>
## 377 sem sem <NA>
## 378 sga sga <NA>
## 379 sgn sgn <NA>
## 380 shn shn <NA>
## 381 sid sid <NA>
## 382 sin sin si
## 383 sio sio <NA>
## 384 sit sit <NA>
## 385 sla sla <NA>
## 386 slo slk sk
## 387 slv slv sl
## 388 sma sma <NA>
## 389 sme sme se
## 390 smi smi <NA>
## 391 smj smj <NA>
## 392 smn smn <NA>
## 393 smo smo sm
## 394 sms sms <NA>
## 395 sna sna sn
## 396 snd snd sd
## 397 snk snk <NA>
## 398 sog sog <NA>
## 399 som som so
## 400 son son <NA>
## 401 sot sot st
## 402 spa spa es
## 403 srd srd sc
## 404 srn srn <NA>
## 405 srp srp sr
## 406 srr srr <NA>
## 407 ssa ssa <NA>
## 408 ssw ssw ss
## 409 suk suk <NA>
## 410 sun sun su
## 411 sus sus <NA>
## 412 sux sux <NA>
## 413 swa swa sw
## 414 swe swe sv
## 415 syc syc <NA>
## 416 syr syr <NA>
## 417 tah tah ty
## 418 tai tai <NA>
## 419 tam tam ta
## 420 tat tat tt
## 421 tel tel te
## 422 tem tem <NA>
## 423 ter ter <NA>
## 424 tet tet <NA>
## 425 tgk tgk tg
## 426 tgl tgl tl
## 427 tha tha th
## 428 tib bod bo
## 429 tig tig <NA>
## 430 tir tir ti
## 431 tiv tiv <NA>
## 432 tkl tkl <NA>
## 433 tlh tlh <NA>
## 434 tli tli <NA>
## 435 tmh tmh <NA>
## 436 tog tog <NA>
## 437 ton ton to
## 438 tpi tpi <NA>
## 439 tsi tsi <NA>
## 440 tsn tsn tn
## 441 tso tso ts
## 442 tuk tuk tk
## 443 tum tum <NA>
## 444 tup tup <NA>
## 445 tur tur tr
## 446 tut tut <NA>
## 447 tvl tvl <NA>
## 448 twi twi tw
## 449 tyv tyv <NA>
## 450 udm udm <NA>
## 451 uga uga <NA>
## 452 uig uig ug
## 453 ukr ukr uk
## 454 umb umb <NA>
## 455 und und <NA>
## 456 urd urd ur
## 457 uzb uzb uz
## 458 vai vai <NA>
## 459 ven ven ve
## 460 vie vie vi
## 461 vol vol vo
## 462 vot vot <NA>
## 463 wak wak <NA>
## 464 wal wal <NA>
## 465 war war <NA>
## 466 was was <NA>
## 467 wel cym cy
## 468 wen wen <NA>
## 469 wln wln wa
## 470 wol wol wo
## 471 xal xal <NA>
## 472 xho xho xh
## 473 yao yao <NA>
## 474 yap yap <NA>
## 475 yid yid yi
## 476 yor yor yo
## 477 ypk ypk <NA>
## 478 zap zap <NA>
## 479 zbl zbl <NA>
## 480 zen zen <NA>
## 481 zgh zgh <NA>
## 482 zha zha za
## 483 znd znd <NA>
## 484 zul zul zu
## 485 zun zun <NA>
## 486 zxx zxx <NA>
## 487 zza zza <NA>
## Name
## 1 Afar
## 2 Abkhazian
## 3 Achinese
## 4 Acoli
## 5 Adangme
## 6 Adyghe; Adygei
## 7 Afro-Asiatic languages
## 8 Afrihili
## 9 Afrikaans
## 10 Ainu
## 11 Akan
## 12 Akkadian
## 13 Albanian
## 14 Aleut
## 15 Algonquian languages
## 16 Southern Altai
## 17 Amharic
## 18 English, Old (ca.450-1100)
## 19 Angika
## 20 Apache languages
## 21 Arabic
## 22 Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)
## 23 Aragonese
## 24 Armenian
## 25 Mapudungun; Mapuche
## 26 Arapaho
## 27 Artificial languages
## 28 Arawak
## 29 Assamese
## 30 Asturian; Bable; Leonese; Asturleonese
## 31 Athapascan languages
## 32 Australian languages
## 33 Avaric
## 34 Avestan
## 35 Awadhi
## 36 Aymara
## 37 Azerbaijani
## 38 Banda languages
## 39 Bamileke languages
## 40 Bashkir
## 41 Baluchi
## 42 Bambara
## 43 Balinese
## 44 Basque
## 45 Basa
## 46 Baltic languages
## 47 Beja; Bedawiyet
## 48 Belarusian
## 49 Bemba
## 50 Bengali
## 51 Berber languages
## 52 Bhojpuri
## 53 Bihari languages
## 54 Bikol
## 55 Bini; Edo
## 56 Bislama
## 57 Siksika
## 58 Bantu languages
## 59 Bosnian
## 60 Braj
## 61 Breton
## 62 Batak languages
## 63 Buriat
## 64 Buginese
## 65 Bulgarian
## 66 Burmese
## 67 Blin; Bilin
## 68 Caddo
## 69 Central American Indian languages
## 70 Galibi Carib
## 71 Catalan; Valencian
## 72 Caucasian languages
## 73 Cebuano
## 74 Celtic languages
## 75 Chamorro
## 76 Chibcha
## 77 Chechen
## 78 Chagatai
## 79 Chinese
## 80 Chuukese
## 81 Mari
## 82 Chinook jargon
## 83 Choctaw
## 84 Chipewyan; Dene Suline
## 85 Cherokee
## 86 Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic
## 87 Chuvash
## 88 Cheyenne
## 89 Chamic languages
## 90 Montenegrin
## 91 Coptic
## 92 Cornish
## 93 Corsican
## 94 Creoles and pidgins, English based
## 95 Creoles and pidgins, French-based
## 96 Creoles and pidgins, Portuguese-based
## 97 Cree
## 98 Crimean Tatar; Crimean Turkish
## 99 Creoles and pidgins
## 100 Kashubian
## 101 Cushitic languages
## 102 Czech
## 103 Dakota
## 104 Danish
## 105 Dargwa
## 106 Land Dayak languages
## 107 Delaware
## 108 Slave (Athapascan)
## 109 Dogrib
## 110 Dinka
## 111 Divehi; Dhivehi; Maldivian
## 112 Dogri
## 113 Dravidian languages
## 114 Lower Sorbian
## 115 Duala
## 116 Dutch, Middle (ca.1050-1350)
## 117 Dutch; Flemish
## 118 Dyula
## 119 Dzongkha
## 120 Efik
## 121 Egyptian (Ancient)
## 122 Ekajuk
## 123 Elamite
## 124 English
## 125 English, Middle (1100-1500)
## 126 Esperanto
## 127 Estonian
## 128 Ewe
## 129 Ewondo
## 130 Fang
## 131 Faroese
## 132 Fanti
## 133 Fijian
## 134 Filipino; Pilipino
## 135 Finnish
## 136 Finno-Ugrian languages
## 137 Fon
## 138 French
## 139 French, Middle (ca.1400-1600)
## 140 French, Old (842-ca.1400)
## 141 Northern Frisian
## 142 Eastern Frisian
## 143 Western Frisian
## 144 Fulah
## 145 Friulian
## 146 Ga
## 147 Gayo
## 148 Gbaya
## 149 Germanic languages
## 150 Georgian
## 151 German
## 152 Geez
## 153 Gilbertese
## 154 Gaelic; Scottish Gaelic
## 155 Irish
## 156 Galician
## 157 Manx
## 158 German, Middle High (ca.1050-1500)
## 159 German, Old High (ca.750-1050)
## 160 Gondi
## 161 Gorontalo
## 162 Gothic
## 163 Grebo
## 164 Greek, Ancient (to 1453)
## 165 Greek, Modern (1453-)
## 166 Guarani
## 167 Swiss German; Alemannic; Alsatian
## 168 Gujarati
## 169 Gwich'in
## 170 Haida
## 171 Haitian; Haitian Creole
## 172 Hausa
## 173 Hawaiian
## 174 Hebrew
## 175 Herero
## 176 Hiligaynon
## 177 Himachali languages; Western Pahari languages
## 178 Hindi
## 179 Hittite
## 180 Hmong; Mong
## 181 Hiri Motu
## 182 Croatian
## 183 Upper Sorbian
## 184 Hungarian
## 185 Hupa
## 186 Iban
## 187 Igbo
## 188 Icelandic
## 189 Ido
## 190 Sichuan Yi; Nuosu
## 191 Ijo languages
## 192 Inuktitut
## 193 Interlingue; Occidental
## 194 Iloko
## 195 Interlingua (International Auxiliary Language Association)
## 196 Indic languages
## 197 Indonesian
## 198 Indo-European languages
## 199 Ingush
## 200 Inupiaq
## 201 Iranian languages
## 202 Iroquoian languages
## 203 Italian
## 204 Javanese
## 205 Lojban
## 206 Japanese
## 207 Judeo-Persian
## 208 Judeo-Arabic
## 209 Kara-Kalpak
## 210 Kabyle
## 211 Kachin; Jingpho
## 212 Kalaallisut; Greenlandic
## 213 Kamba
## 214 Kannada
## 215 Karen languages
## 216 Kashmiri
## 217 Kanuri
## 218 Kawi
## 219 Kazakh
## 220 Kabardian
## 221 Khasi
## 222 Khoisan languages
## 223 Central Khmer
## 224 Khotanese; Sakan
## 225 Kikuyu; Gikuyu
## 226 Kinyarwanda
## 227 Kirghiz; Kyrgyz
## 228 Kimbundu
## 229 Konkani
## 230 Komi
## 231 Kongo
## 232 Korean
## 233 Kosraean
## 234 Kpelle
## 235 Karachay-Balkar
## 236 Karelian
## 237 Kru languages
## 238 Kurukh
## 239 Kuanyama; Kwanyama
## 240 Kumyk
## 241 Kurdish
## 242 Kutenai
## 243 Ladino
## 244 Lahnda
## 245 Lamba
## 246 Lao
## 247 Latin
## 248 Latvian
## 249 Lezghian
## 250 Limburgan; Limburger; Limburgish
## 251 Lingala
## 252 Lithuanian
## 253 Mongo
## 254 Lozi
## 255 Luxembourgish; Letzeburgesch
## 256 Luba-Lulua
## 257 Luba-Katanga
## 258 Ganda
## 259 Luiseno
## 260 Lunda
## 261 Luo (Kenya and Tanzania)
## 262 Lushai
## 263 Macedonian
## 264 Madurese
## 265 Magahi
## 266 Marshallese
## 267 Maithili
## 268 Makasar
## 269 Malayalam
## 270 Mandingo
## 271 Maori
## 272 Austronesian languages
## 273 Marathi
## 274 Masai
## 275 Malay
## 276 Moksha
## 277 Mandar
## 278 Mende
## 279 Irish, Middle (900-1200)
## 280 Mi'kmaq; Micmac
## 281 Minangkabau
## 282 Uncoded languages
## 283 Mon-Khmer languages
## 284 Malagasy
## 285 Maltese
## 286 Manchu
## 287 Manipuri
## 288 Manobo languages
## 289 Mohawk
## 290 Mongolian
## 291 Mossi
## 292 Multiple languages
## 293 Munda languages
## 294 Creek
## 295 Mirandese
## 296 Marwari
## 297 Mayan languages
## 298 Erzya
## 299 Nahuatl languages
## 300 North American Indian languages
## 301 Neapolitan
## 302 Nauru
## 303 Navajo; Navaho
## 304 Ndebele, South; South Ndebele
## 305 Ndebele, North; North Ndebele
## 306 Ndonga
## 307 Low German; Low Saxon; German, Low; Saxon, Low
## 308 Nepali
## 309 Nepal Bhasa; Newari
## 310 Nias
## 311 Niger-Kordofanian languages
## 312 Niuean
## 313 Norwegian Nynorsk; Nynorsk, Norwegian
## 314 Bokmal, Norwegian; Norwegian Bokmal
## 315 Nogai
## 316 Norse, Old
## 317 Norwegian
## 318 N'Ko
## 319 Pedi; Sepedi; Northern Sotho
## 320 Nubian languages
## 321 Classical Newari; Old Newari; Classical Nepal Bhasa
## 322 Chichewa; Chewa; Nyanja
## 323 Nyamwezi
## 324 Nyankole
## 325 Nyoro
## 326 Nzima
## 327 Occitan (post 1500)
## 328 Ojibwa
## 329 Oriya
## 330 Oromo
## 331 Osage
## 332 Ossetian; Ossetic
## 333 Turkish, Ottoman (1500-1928)
## 334 Otomian languages
## 335 Papuan languages
## 336 Pangasinan
## 337 Pahlavi
## 338 Pampanga; Kapampangan
## 339 Panjabi; Punjabi
## 340 Papiamento
## 341 Palauan
## 342 Persian, Old (ca.600-400 B.C.)
## 343 Persian
## 344 Philippine languages
## 345 Phoenician
## 346 Pali
## 347 Polish
## 348 Pohnpeian
## 349 Portuguese
## 350 Prakrit languages
## 351 Provencal, Old (to 1500); Occitan, Old (to 1500)
## 352 Pushto; Pashto
## 354 Quechua
## 355 Rajasthani
## 356 Rapanui
## 357 Rarotongan; Cook Islands Maori
## 358 Romance languages
## 359 Romansh
## 360 Romany
## 361 Romanian; Moldavian; Moldovan
## 362 Rundi
## 363 Aromanian; Arumanian; Macedo-Romanian
## 364 Russian
## 365 Sandawe
## 366 Sango
## 367 Yakut
## 368 South American Indian languages
## 369 Salishan languages
## 370 Samaritan Aramaic
## 371 Sanskrit
## 372 Sasak
## 373 Santali
## 374 Sicilian
## 375 Scots
## 376 Selkup
## 377 Semitic languages
## 378 Irish, Old (to 900)
## 379 Sign Languages
## 380 Shan
## 381 Sidamo
## 382 Sinhala; Sinhalese
## 383 Siouan languages
## 384 Sino-Tibetan languages
## 385 Slavic languages
## 386 Slovak
## 387 Slovenian
## 388 Southern Sami
## 389 Northern Sami
## 390 Sami languages
## 391 Lule Sami
## 392 Inari Sami
## 393 Samoan
## 394 Skolt Sami
## 395 Shona
## 396 Sindhi
## 397 Soninke
## 398 Sogdian
## 399 Somali
## 400 Songhai languages
## 401 Sotho, Southern
## 402 Spanish; Castilian
## 403 Sardinian
## 404 Sranan Tongo
## 405 Serbian
## 406 Serer
## 407 Nilo-Saharan languages
## 408 Swati
## 409 Sukuma
## 410 Sundanese
## 411 Susu
## 412 Sumerian
## 413 Swahili
## 414 Swedish
## 415 Classical Syriac
## 416 Syriac
## 417 Tahitian
## 418 Tai languages
## 419 Tamil
## 420 Tatar
## 421 Telugu
## 422 Timne
## 423 Tereno
## 424 Tetum
## 425 Tajik
## 426 Tagalog
## 427 Thai
## 428 Tibetan
## 429 Tigre
## 430 Tigrinya
## 431 Tiv
## 432 Tokelau
## 433 Klingon; tlhIngan-Hol
## 434 Tlingit
## 435 Tamashek
## 436 Tonga (Nyasa)
## 437 Tonga (Tonga Islands)
## 438 Tok Pisin
## 439 Tsimshian
## 440 Tswana
## 441 Tsonga
## 442 Turkmen
## 443 Tumbuka
## 444 Tupi languages
## 445 Turkish
## 446 Altaic languages
## 447 Tuvalu
## 448 Twi
## 449 Tuvinian
## 450 Udmurt
## 451 Ugaritic
## 452 Uighur; Uyghur
## 453 Ukrainian
## 454 Umbundu
## 455 Undetermined
## 456 Urdu
## 457 Uzbek
## 458 Vai
## 459 Venda
## 460 Vietnamese
## 461 Volapuk
## 462 Votic
## 463 Wakashan languages
## 464 Wolaitta; Wolaytta
## 465 Waray
## 466 Washo
## 467 Welsh
## 468 Sorbian languages
## 469 Walloon
## 470 Wolof
## 471 Kalmyk; Oirat
## 472 Xhosa
## 473 Yao
## 474 Yapese
## 475 Yiddish
## 476 Yoruba
## 477 Yupik languages
## 478 Zapotec
## 479 Blissymbols; Blissymbolics; Bliss
## 480 Zenaga
## 481 Standard Moroccan Tamazight
## 482 Zhuang; Chuang
## 483 Zande languages
## 484 Zulu
## 485 Zuni
## 486 No linguistic content; Not applicable
## 487 Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki
# Attach the language name to each language code: ISO_639_2 is reduced to its
# Alpha_2 column (renamed `lang`) and `Name`, then inner-joined to the tally.
covid_tweets_lang %>%
  inner_join(
    ISO_639_2 %>%
      dplyr::select(lang = Alpha_2, Name)
  )
## # A tibble: 52 x 3
## lang n Name
## <chr> <int> <chr>
## 1 en 10115 English
## 2 es 2666 Spanish; Castilian
## 3 pt 932 Portuguese
## 4 fr 527 French
## 5 hi 289 Hindi
## 6 it 227 Italian
## 7 de 141 German
## 8 ja 138 Japanese
## 9 tl 113 Tagalog
## 10 tr 111 Turkish
## # ... with 42 more rows
# Open the per-language tally in the data viewer.
View(covid_tweets_lang)
# Horizontal bar chart of tweet counts per language name; reorder() sorts the
# bars by count `n`.
covid_tweets_lang %>%
  inner_join(
    ISO_639_2 %>% dplyr::select(lang = Alpha_2, Name)
  ) %>%
  mutate(language = reorder(Name, n)) %>%
  # filter(n > 4) %>%   (disabled step: would keep languages with n > 4)
  ggplot(aes(x = language, y = n)) +
  geom_col() +
  coord_flip()
# Tally tweets per country, largest count first.
covid_tweets_country <- count(covid_tweets, country, sort = TRUE)
# Open the tally in the data viewer.
View(covid_tweets_country)
# Horizontal bar chart of the top 20 countries by count `n`, bars ordered by
# count.
covid_tweets_country %>%
  mutate(country = reorder(country, n)) %>%
  top_n(20, n) %>%
  ggplot(aes(x = country, y = n)) +
  geom_col() +
  coord_flip()
# Number of tweets at each distinct (lng, lat, country) combination. The count
# is stored in `sum`, which the map layers further down use as the point size.
# `.groups = "drop"` returns an ungrouped tibble, so no lng/lat grouping leaks
# into later steps and summarise() emits no regrouping message.
covid_tweets_geo <- covid_tweets %>%
  group_by(lng, lat, country) %>%
  summarise(sum = n(), .groups = "drop")
# The same tally written with count(); here the count column is named `n`.
covid_tweets %>% count(lng, lat, country)
## # A tibble: 7,130 x 4
## lng lat country n
## <dbl> <dbl> <chr> <int>
## 1 -167. 23.7 United States 1
## 2 -162. 60.8 United States 1
## 3 -159. 22.2 United States 1
## 4 -159. 22.0 United States 1
## 5 -158. 21.4 United States 1
## 6 -158. 21.3 United States 1
## 7 -158. 21.4 United States 1
## 8 -158. 21.3 United States 1
## 9 -158. 21.3 United States 1
## 10 -158. 21.3 United States 1
## # ... with 7,120 more rows
covid_tweets_geo
## # A tibble: 7,130 x 4
## lng lat country sum
## <dbl> <dbl> <chr> <int>
## 1 -167. 23.7 United States 1
## 2 -162. 60.8 United States 1
## 3 -159. 22.2 United States 1
## 4 -159. 22.0 United States 1
## 5 -158. 21.4 United States 1
## 6 -158. 21.3 United States 1
## 7 -158. 21.4 United States 1
## 8 -158. 21.3 United States 1
## 9 -158. 21.3 United States 1
## 10 -158. 21.3 United States 1
## # ... with 7,120 more rows
#install.packages("maps")
#install.packages("viridis")
#install.packages("rnaturalearth")
library(maps)
library(viridis)
## Loading required package: viridisLite
library(rnaturalearth)
# World map
# Outline data for the world; its long/lat/group/region columns are used below.
world_map <- map_data("world")
# Plot lat and lng points onto map: country polygons filled by region, with one
# translucent point per tweet location sized by its tweet count (`sum`).
ggplot(data = world_map, aes(x = long, y = lat)) +
  geom_polygon(aes(group = group, fill = region)) +
  geom_point(
    data = covid_tweets_geo, aes(x = lng, y = lat, size = sum),
    colour = "tomato", alpha = 0.5
  ) +
  xlab("Longitude") + ylab("Latitude") + # y-axis title spelling corrected
  ggtitle("Twitter Map on #COVID-19") +
  scale_fill_viridis_d() +
  theme_void() +
  theme(legend.position = "none") # hide the fill and size legends
# Mapping Europe
# Country names to draw. The United Kingdom and the Netherlands are each listed
# under two spellings.
some.eu.countries <- c(
  "Portugal", "Spain", "France", "Switzerland", "Germany",
  "Austria", "Belgium", "UK", "United Kingdom", "Netherlands",
  "The Netherlands", "Denmark", "Poland", "Italy",
  "Croatia", "Slovenia", "Hungary", "Slovakia",
  "Czech Republic"
)
# Outline data restricted to the regions named above.
some.eu.maps <- map_data("world", region = some.eu.countries)
# Same layout as the world map, but the points are limited to tweet locations
# whose `country` is in the list.
ggplot(data = some.eu.maps, aes(x = long, y = lat)) +
  geom_polygon(aes(group = group, fill = region)) +
  geom_point(
    data = filter(covid_tweets_geo, country %in% some.eu.countries),
    aes(x = lng, y = lat, size = sum),
    colour = "tomato", alpha = 0.5
  ) +
  ggtitle("Twitter Map on #COVID-19") +
  scale_fill_viridis_d() +
  theme_void() +
  theme(legend.position = "none")
# Mapping US
# State-level outline data.
usmap <- map_data("state")
# State polygons filled by region, overlaid with United States tweet locations
# that have more than 19 tweets, sized by tweet count.
ggplot(data = usmap, aes(x = long, y = lat)) +
  geom_polygon(aes(group = group, fill = region)) +
  geom_point(
    data = filter(covid_tweets_geo, country == "United States", sum > 19),
    aes(x = lng, y = lat, size = sum),
    colour = "tomato", alpha = 0.5
  ) +
  ggtitle("Twitter Map on #COVID-19") +
  scale_fill_viridis_d() +
  theme_void() +
  theme(legend.position = "none")
We know how to convert our tweet data into a tidy text data format.
library(stringr)
library(stopwords)
covid_tweets # This dataset contains 18,224 tweets about COVID-19 including geo-location information.
## # A tibble: 18,224 x 9
## user_id status_id created_at screen_name text lang country lat
## <chr> <chr> <dttm> <chr> <chr> <chr> <chr> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ "@ev~ en United~ 36.0
## 2 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ "Ple~ en United~ 36.9
## 3 2155830~ 12533658~ 2020-04-23 16:51:09 KOROGLU_BA~ "@Ay~ tr Azerba~ 40.2
## 4 7445974~ 12533657~ 2020-04-23 16:51:05 FoodFocusSA "Pre~ en South ~ -26.1
## 5 1558777~ 12533657~ 2020-04-23 16:51:01 opcionsecu~ "#AT~ es Ecuador -1.67
## 6 9989605~ 12533657~ 2020-04-23 16:51:01 amystones4 "Tha~ en United~ 53.7
## 7 1027687~ 12533657~ 2020-04-23 16:51:00 COTACYT "Men~ es Mexico 23.7
## 8 2473827~ 12533657~ 2020-04-23 16:50:54 bkracing123 "The~ en United~ 53.9
## 9 17566234 12533657~ 2020-04-23 16:50:51 AnnStrahm "Thi~ en United~ 37.5
## 10 2267079~ 12533656~ 2020-04-23 16:50:42 JLeonRojas "INF~ es Chile -35.5
## # ... with 18,214 more rows, and 1 more variable: lng <dbl>
# Build the tidy (one word per row) table from the English-language tweets.
#   * `hour` is `created_at` floored to the hour; later steps aggregate by it.
#   * The text is cleaned before tokenising: runs of non-ASCII characters
#     (with an optional leading # or @) are blanked, then the HTML entities
#     &amp; &lt; &gt; &quot; and the retweet marker RT are blanked.
#   * Stop words and tokens without any a-z letter are dropped afterwards.
# NOTE(review): floor_date() is presumably lubridate's, attached earlier in the
# file -- confirm.
# NOTE(review): the bare `RT` alternative also matches inside upper-case words;
# "\\bRT\\b" would limit it to the standalone marker, at the cost of slightly
# different word counts.
covid_tweets_tidy <- covid_tweets %>%
  filter(lang == "en") %>% # Selecting tweets only written in English
  mutate(hour = floor_date(created_at, unit = "hour")) %>% # Hour-long time bins
  mutate(text = str_replace_all(text, "[#@]?[^[:ascii:]]+", " ")) %>% # Removing non-ASCII characters
  # The entity names must be spelled out. The pattern had been collapsed to the
  # literal characters & < > ", and the unescaped quote ended the string early,
  # leaving the statement unparseable.
  mutate(text = str_replace_all(text, "&amp;|&lt;|&gt;|&quot;|RT", " ")) %>% # Removing HTML entities and retweet marker
  unnest_tweets(word, text) %>% # Splitting text into words by unnest_tweets
  filter(!word %in% stopwords()) %>% # Removing words found in the stopwords() vector
  filter(str_detect(word, "[a-z]")) # Keeping words that contain at least one alphabetical letter
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
# Print the tidy table built above.
covid_tweets_tidy
## # A tibble: 167,641 x 10
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## # ... with 167,631 more rows, and 2 more variables: hour <dttm>, word <chr>
When text is formed into tidy data, we are ready to do sentiment analysis using inner_join().
# Joining with the Bing lexicon: only words that appear in the lexicon are
# kept, and each gains the lexicon's `sentiment` value.
covid_tweets_tidy %>%
  inner_join(lexicon_bing())
## Joining, by = "word"
## # A tibble: 15,936 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 4 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 5 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 6 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 7 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 8 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 9 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## 10 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## # ... with 15,926 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
# Joining with the NRC lexicon. A word listed under several NRC categories
# yields one output row per category.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc())
## Joining, by = "word"
## # A tibble: 57,217 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## # ... with 57,207 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
# Joining with the NRC-EIL lexicon. `by` takes a character vector of the
# variables to join by. The two tables name the word column differently, so the
# pair is spelled out: `word` in covid_tweets_tidy is matched to `term` in
# lexicon_nrc_eil().
covid_tweets_tidy %>%
  inner_join(lexicon_nrc_eil(), by = c("word" = "term"))
## # A tibble: 23,895 x 12
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 5 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 6 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 7 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 8 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 9 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 10 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## # ... with 23,885 more rows, and 4 more variables: hour <dttm>, word <chr>,
## # score <dbl>, AffectDimension <chr>
After joining a tidy text data set with a sentiment lexicon, we can count the sentiment variable by the time variable.
# Counting sentiments by hour: number of Bing-matched words per (hour,
# sentiment) pair.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(hour, sentiment)
## Joining, by = "word"
## # A tibble: 40 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 negative 516
## 2 2020-04-22 21:00:00 positive 525
## 3 2020-04-22 22:00:00 negative 524
## 4 2020-04-22 22:00:00 positive 507
## 5 2020-04-22 23:00:00 negative 349
## 6 2020-04-22 23:00:00 positive 418
## 7 2020-04-23 00:00:00 negative 356
## 8 2020-04-23 00:00:00 positive 405
## 9 2020-04-23 01:00:00 negative 370
## 10 2020-04-23 01:00:00 positive 397
## # ... with 30 more rows
# Number of NRC-matched words per (hour, sentiment category) pair.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(hour, sentiment)
## Joining, by = "word"
## # A tibble: 200 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 anger 223
## 2 2020-04-22 21:00:00 anticipation 379
## 3 2020-04-22 21:00:00 disgust 179
## 4 2020-04-22 21:00:00 fear 384
## 5 2020-04-22 21:00:00 joy 280
## 6 2020-04-22 21:00:00 negative 577
## 7 2020-04-22 21:00:00 positive 720
## 8 2020-04-22 21:00:00 sadness 289
## 9 2020-04-22 21:00:00 surprise 201
## 10 2020-04-22 21:00:00 trust 462
## # ... with 190 more rows
# Number of NRC-EIL-matched words per (hour, AffectDimension) pair.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc_eil(), by = c("word" = "term")) %>%
  count(hour, AffectDimension)
## # A tibble: 80 x 3
## hour AffectDimension n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 anger 237
## 2 2020-04-22 21:00:00 fear 467
## 3 2020-04-22 21:00:00 joy 536
## 4 2020-04-22 21:00:00 sadness 338
## 5 2020-04-22 22:00:00 anger 244
## 6 2020-04-22 22:00:00 fear 478
## 7 2020-04-22 22:00:00 joy 548
## 8 2020-04-22 22:00:00 sadness 330
## 9 2020-04-22 23:00:00 anger 146
## 10 2020-04-22 23:00:00 fear 320
## # ... with 70 more rows
Let’s visualize the time trend of sentiment in tweets toward COVID-19
# Line chart of hourly Bing-lexicon word counts, one line per sentiment.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The Bing Lexicon was used to measure sentiment in tweets"
  )
## Joining, by = "word"
# Line chart of hourly NRC-lexicon word counts, one line per sentiment
# category.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The NRC Lexicon was used to measure sentiment in tweets"
  )
## Joining, by = "word"
# Line chart of hourly NRC-EIL word counts, one line per AffectDimension.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc_eil(), by = c("word" = "term")) %>%
  count(hour, AffectDimension) %>%
  ggplot(aes(x = hour, y = n, colour = AffectDimension)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The NRC-EIL was used to measure sentiment in tweets"
  )
We can analyze word counts that contribute to positive and negative sentiment in tweets. By implementing count() here with arguments of both word and sentiment, we find out how much each word contributed to each sentiment.
# Word count on tweets
# Counting words by sentiment: how often each Bing-lexicon word occurs, most
# frequent first.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 1,973 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 negative virus 282
## 6 positive thank 255
## 7 positive safe 234
## 8 positive well 232
## 9 positive trump 223
## 10 negative crisis 212
## # ... with 1,963 more rows
# Most frequent words that the Bing lexicon labels "positive".
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, word, sort = TRUE) %>%
  filter(sentiment == "positive") %>%
  arrange(desc(n))
## Joining, by = "word"
## # A tibble: 751 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 positive thank 255
## 6 positive safe 234
## 7 positive well 232
## 8 positive trump 223
## 9 positive support 205
## 10 positive great 187
## # ... with 741 more rows
# Most frequent words that the Bing lexicon labels "negative".
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, word, sort = TRUE) %>%
  filter(sentiment == "negative") %>%
  arrange(desc(n))
## Joining, by = "word"
## # A tibble: 1,222 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 negative virus 282
## 2 negative crisis 212
## 3 negative death 197
## 4 negative died 133
## 5 negative hard 98
## 6 negative infected 98
## 7 negative lost 85
## 8 negative die 84
## 9 negative risk 83
## 10 negative sick 83
## # ... with 1,212 more rows
The words like, positive, & trump are to be removed from the list of positive words because their meaning is not related to positive feelings in the context of COVID-19; And I also want to remove the word virus from the list of negative words because it is likely to be used in a way of indicating “Coronavirus”.
# Top words within each NRC sentiment category. top_n(10) is given no ranking
# column and so ranks by `n` (see the "Selecting by n" message). Rows are then
# ordered by category and descending count.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  arrange(sentiment, desc(n)) %>%
  ungroup()
## Joining, by = "word"
## Selecting by n
## # A tibble: 102 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 anger fight 217
## 2 anger death 197
## 3 anger money 119
## 4 anger fighting 115
## 5 anger disease 85
## 6 anger hit 73
## 7 anger dying 72
## 8 anger bad 69
## 9 anger feeling 51
## 10 anger challenge 44
## # ... with 92 more rows
The words virus in “negative”, don in “positive” and “trust”, trump in “surprise” are to be excluded from the analysis using the NRC lexicon.
# Top words within each NRC-EIL AffectDimension, ranked by `n` and listed by
# dimension and descending count.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc_eil(), by = c("word" = "term")) %>%
  count(AffectDimension, word) %>%
  group_by(AffectDimension) %>%
  top_n(10) %>%
  arrange(AffectDimension, desc(n)) %>%
  ungroup()
## Selecting by n
## # A tibble: 40 x 3
## AffectDimension word n
## <chr> <chr> <int>
## 1 anger fight 217
## 2 anger death 197
## 3 anger money 119
## 4 anger fighting 115
## 5 anger disease 85
## 6 anger hit 73
## 7 anger dying 72
## 8 anger bad 69
## 9 anger feeling 51
## 10 anger challenge 44
## # ... with 30 more rows
The words positive in “joy”, don in “positive” and “trust”, trump in “surprise” are to be excluded from the analysis using the NRC lexicon.
# Words to leave out of the sentiment counts.
words_out <- c("like", "positive", "trump", "virus", "don")
# Hourly Bing-lexicon word counts with the excluded words removed, one line per
# sentiment.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The Bing Lexicon was used to measure sentiment in tweets"
  )
## Joining, by = "word"
# Hourly NRC-lexicon word counts with the words in `words_out` removed, one
# line per sentiment category.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The NRC Lexicon was used to measure sentiment in tweets"
  )
## Joining, by = "word"
# Hourly NRC-EIL word counts with the words in `words_out` removed, one line
# per AffectDimension.
covid_tweets_tidy %>%
  inner_join(lexicon_nrc_eil(), by = c("word" = "term")) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, AffectDimension) %>%
  ggplot(aes(x = hour, y = n, colour = AffectDimension)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The NRC-EIL was used to measure sentiment in tweets"
  )
# Per-tweet tally: number of Bing-matched words of each sentiment in every
# tweet (`status_id`), keeping the tweet's hour and skipping `words_out`.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, status_id, sentiment)
## Joining, by = "word"
## # A tibble: 8,929 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 8,919 more rows
* Let’s consider how we can calculate the net sentiment score of each tweet: the sum of positive words minus the sum of negative words in the tweet.
* To do so, we need two separate columns for the positive and negative scores.
* Also, there will be some days with no emotional words in tweets.
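* So, we will use spread() from the tidyr package.
spread() takes three principal arguments: the data, the key column whose values become the new column names, and the value column whose values fill those new columns. [Figure: Spread() function]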
This yields a frequency table where the observations of sentiment for each tweet are spread across multiple rows: 9,559 observations from 7,203 tweets of 4 variables (hour, status_id, sentiment, n).
library(tibble)
# Long-format tally: one row per (hour, tweet, sentiment) with the number of
# Bing-matched words. No words are excluded here.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(hour, status_id, sentiment)
## Joining, by = "word"
## # A tibble: 9,559 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 9,549 more rows
Using spread() to key on sentiment with values from n, this becomes 7,203 observations of 4 variables (hour, status_id, negative, positive).
library(tidyr)
# Wide-format tally: one row per (hour, tweet), with the `sentiment` values
# turned into `negative` and `positive` columns holding the counts. `fill = 0`
# supplies 0 where a tweet has no words of that sentiment.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(hour, status_id, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0)
## Joining, by = "word"
## # A tibble: 7,203 x 4
## hour status_id negative positive
## <dttm> <chr> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1
## 2 2020-04-22 21:00:00 1253067622519681024 1 0
## 3 2020-04-22 21:00:00 1253067629759209472 2 0
## 4 2020-04-22 21:00:00 1253067638491807748 2 0
## 5 2020-04-22 21:00:00 1253067644388982785 1 3
## 6 2020-04-22 21:00:00 1253067656690839552 0 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 1
## 10 2020-04-22 21:00:00 1253067782016688128 0 1
## # ... with 7,193 more rows
# Net sentiment score per tweet: positive word count minus negative word count,
# computed after dropping the words in `words_out`.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, status_id, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 -4
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 -1
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 -2
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 -2
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 2
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 -2
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 1
## # ... with 6,856 more rows
# Assigning each tweet a label from its net score: "Positive" when the score is
# above zero, "Negative" when below zero, otherwise "Neutral".
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, status_id, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(
    sentiment = ifelse(
      sentiment > 0, "Positive",
      ifelse(sentiment < 0, "Negative", "Neutral")
    )
  )
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <chr>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 Negative
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 Negative
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 Negative
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 Negative
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 Positive
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 Positive
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 Positive
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 Positive
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 Negative
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 Positive
## # ... with 6,856 more rows
# ifelse(test, yes, no) returns a value shaped like `test`, taking each element
# from `yes` where `test` is TRUE and from `no` where it is FALSE.
# Store the labelled per-tweet table: one row per (hour, tweet) with the
# negative and positive word counts and the resulting sentiment label.
covid_tweets_bing <- covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, status_id, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(
    sentiment = ifelse(
      sentiment > 0, "Positive",
      ifelse(sentiment < 0, "Negative", "Neutral")
    )
  )
## Joining, by = "word"
# Plot, for each hour-long bin, how many tweets received each net-sentiment
# label. The x-axis is the `hour` variable that tracks when tweets were posted.
covid_tweets_bing %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum of Tweets",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The Bing Lexicon was used to measure sentiment in tweets"
  )
# For comparison, the word-level version: hourly counts of Bing-matched words
# (not tweets) per sentiment, with the words in `words_out` removed.
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!(word %in% words_out)) %>%
  count(hour, sentiment) %>%
  ggplot(aes(x = hour, y = n, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  labs(
    x = NULL, y = "Hourly Sum",
    title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
    subtitle = "The Bing Lexicon was used to measure sentiment in tweets"
  )
## Joining, by = "word"