Quiz1 - Group1
Please find the book and line number in Jane Austen books which has the most number of trust words.
- use
nrc
sentiments
- please remove stop words
- hint:
count(book,linenumber, sentiment)
Expected outcome:
Solution
library(tidytext)
library(tidyverse)
library(janeaustenr)
austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number()) %>%
ungroup() %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment=="trust") %>%
count(book,linenumber, sentiment) %>%
arrange(desc(n)) %>%
head(2)
Quiz1 - Group2
Please find the book and line number in Jane Austen books which has the most number of positive words.
- use
bing
sentiments
- please remove stop words
- hint:
count(book,linenumber, sentiment)
Expected outcome:
Solution
library(tidytext)
library(janeaustenr)
austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number()) %>%
ungroup() %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
inner_join(get_sentiments("bing")) %>%
filter(sentiment=="positive") %>%
count(book,linenumber, sentiment) %>%
arrange(desc(n)) %>%
head(2)
Quiz2 - Group1
Find the highest correlated “positive” (nrc) words in “Emma” (sections of 20 lines, word count threshold is 10)
Expected output:
Solution
library(janeaustenr)
library(tidytext)
library(widyr)
library(tidyverse)
austen_books() %>%
filter(book=="Emma") %>%
mutate(section = row_number() %/% 20) %>%
filter(section > 0) %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment=="positive") %>%
group_by(word) %>%
filter(n() >= 10) %>% # try lower numbers and see what happens
pairwise_cor(word, section, sort = TRUE) %>%
head(2)
Joining, by = "word"
Note: this quiz gave different result in most of student computers, most likely due to a bug in widyr
package
Quiz2 - Group2
Find the top 2 highest correlated words with the word pride
in sections of 20 lines in all Jane Austen books. (Word count threshold is 10)
Expected output:
pride |
|
0.2301 |
pride |
|
0.1817 |
Solution
library(janeaustenr)
library(tidytext)
library(widyr)
library(tidyverse)
austen_books() %>%
mutate(section = row_number() %/% 20) %>%
filter(section > 0) %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
group_by(word) %>%
filter(n() >= 10) %>% # try lower numbers and see what happens
pairwise_cor(word, section, sort = TRUE) %>%
filter(item1=="pride") %>%
head(2)
Note: this quiz gave different result in most of student computers, most likely due to a bug in widyr
package
Quiz3 - Group1 and Group2
Please make a linear model of displacement versus mpg from mtcars lm(mpg ~ disp)
and answer the questions:
if a car has 250 displacement, what would its mpg be?
which car has the highest (positive) residual?
if lm(mpg ~ cyl) has R squared of 0.726, which model, disp
or cyl
is better for predicting mpg
Solution
library(broom)
# a
lmfit <- lm(mpg ~ disp, mtcars)
dummy <- data_frame(disp=250)
predict(lmfit,dummy)
1
19.29607
# b
augment(lmfit) %>%
arrange(-.resid) %>%
head(1)
# residual is difference between actual value and predicted value (actual-prediction)
# thus Corolla has 7.2 more mileage than expected mileage for its displacement
# c
summary(lmfit)$r.squared #OR glance(lmfit)
[1] 0.7183433
# higher R-squared means better fit/model
Quiz4 - Group1 and Group2
Please do two different kmeans cluster on mtcars
- regular data, 3 clusters, 10 starts
- scaled data, 3 clusters, 10 starts
then show the table of cluster matches
Solution
set.seed(1)
k1 <- kmeans(mtcars, centers= 3, nstart = 10)
k2 <- kmeans(scale(mtcars), centers= 3, nstart = 10 )
table(k1$cluster, k2$cluster)
1 2 3
1 3 13 0
2 2 0 7
3 0 2 5
LS0tCnRpdGxlOiAiUXVpemVzIGFuZCBTb2x1dGlvbnMiCmF1dGhvcjogYWxwZXIgeWlsbWF6CmRhdGU6IEphbiAzcmQsIDIwMTgKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKPHN0eWxlIHR5cGU9InRleHQvY3NzIj4KLnRhYmxlIHsKICAgIHdpZHRoOiA0MCU7Cn0KPC9zdHlsZT4KCiMgUXVpejEgLSBHcm91cDEKClBsZWFzZSBmaW5kIHRoZSBib29rIGFuZCBsaW5lIG51bWJlciBpbiBKYW5lIEF1c3RlbiBib29rcyB3aGljaCBoYXMgdGhlIG1vc3QgbnVtYmVyIG9mIHRydXN0IHdvcmRzLgoKKiB1c2UgYG5yY2Agc2VudGltZW50cwoqIHBsZWFzZSByZW1vdmUgc3RvcCB3b3JkcwoqIGhpbnQ6IGBjb3VudChib29rLGxpbmVudW1iZXIsIHNlbnRpbWVudClgCgpFeHBlY3RlZCBvdXRjb21lOgoKfCBib29rIHwgbGluZW51bWJlciB8IHNlbnRpbWVudCB8IG4gfAp8LS0tLS0tfC0tLS0tLS0tLS0tLXwtLS0tLS0tLS0tLXwtLS18CnwgLi4uICB8ICAgIC4uLiAgICAgfCAgdHJ1c3QgICAgfCA1IHwKfCAuLi4gIHwgICAgLi4uICAgICB8ICAgLi4uICAgICB8IC4gfAoKCiMjIFNvbHV0aW9uCgpgYGB7ciBzb2x1dGlvbi1xejEtZ3IxLCB3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHRpZHl0ZXh0KQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShqYW5lYXVzdGVucikKCmF1c3Rlbl9ib29rcygpICU+JQogIGdyb3VwX2J5KGJvb2spICU+JQogIG11dGF0ZShsaW5lbnVtYmVyID0gcm93X251bWJlcigpKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KSAlPiUKICBhbnRpX2pvaW4oc3RvcF93b3JkcykgJT4lCiAgaW5uZXJfam9pbihnZXRfc2VudGltZW50cygibnJjIikpICU+JQogIGZpbHRlcihzZW50aW1lbnQ9PSJ0cnVzdCIpICU+JSAKICBjb3VudChib29rLGxpbmVudW1iZXIsIHNlbnRpbWVudCkgJT4lIAogIGFycmFuZ2UoZGVzYyhuKSkgJT4lIAogIGhlYWQoMikKYGBgCgojIFF1aXoxIC0gR3JvdXAyCgpQbGVhc2UgZmluZCB0aGUgYm9vayBhbmQgbGluZSBudW1iZXIgaW4gSmFuZSBBdXN0ZW4gYm9va3Mgd2hpY2ggaGFzIHRoZSBtb3N0IG51bWJlciBvZiBwb3NpdGl2ZSB3b3Jkcy4KCiogdXNlIGBiaW5nYCBzZW50aW1lbnRzCiogcGxlYXNlIHJlbW92ZSBzdG9wIHdvcmRzCiogaGludDogYGNvdW50KGJvb2ssbGluZW51bWJlciwgc2VudGltZW50KWAKCkV4cGVjdGVkIG91dGNvbWU6Cgp8IGJvb2sgfCBsaW5lbnVtYmVyIHwgc2VudGltZW50IHwgbiB8CnwtLS0tLS18LS0tLS0tLS0tLS0tfC0tLS0tLS0tLS0tfC0tLXwKfCAuLi4gIHwgICAgLi4uICAgICB8IHBvc2l0aXZlICB8IDYgfAp8IC4uLiAgfCAgICAuLi4gICAgIHwgICAuLi4gICAgIHwgLiB8CgojIyBTb2x1dGlvbgoKYGBge3Igc29sdXRpb24tcXoxLWdyMiwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoamFuZWF1c3RlbnIpCgphdXN0ZW5fYm9va3MoKSAlPiUKICBncm91cF9ieShib29rKSAlPiUKICBtdXRhdGUobGluZW51bWJlciA9IHJvd19udW1iZXIoKSkgJT4lCiAgdW5ncm91cCgpICU+JQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JQogIGlubmVyX2pvaW4oZ2V0X3NlbnRpbWVudHMoImJpbmciKSkgJT4lCiAgZmlsdGVyKHNlbnRpbWVudD09InBvc2l0aXZlIikgJT4lIAogIGNvdW50KGJvb2ssbGluZW51bWJlciwgc2VudGltZW50KSAlPiUgCiAgYXJyYW5nZShkZXNjKG4pKSAlPiUgCiAgaGVhZCgyKQpgYGAKCiMgUXVpejIgLSBHcm91cDEKCkZpbmQgdGhlICBoaWdoZXN0IGNvcnJlbGF0ZWQgInBvc2l0aXZlIiAobnJjKSB3b3JkcyBpbiAiRW1tYSIgKHNlY3Rpb25zIG9mIDIwIGxpbmVzLCB3b3JkIGNvdW50IHRocmVzaG9sZCBpcyAxMCkKCkV4cGVjdGVkIG91dHB1dDoKCnwgaXRlbTEgfCBpdGVtMiB8IGNvcnJlbGF0aW9uIHwKfC0tLS0tLS18LS0tLS0tLXwtLS0tLS0tLS0tLS0tfAp8IGx1Y2t5IHwgICAgICAgfCAgIDAuNjMyOTUwMiB8CgojIyBTb2x1dGlvbgoKYGBge3Igc29sdXRpb24tcXoyLWdyMX0KbGlicmFyeShqYW5lYXVzdGVucikKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeSh3aWR5cikKbGlicmFyeSh0aWR5dmVyc2UpCgphdXN0ZW5fYm9va3MoKSAlPiUKICBmaWx0ZXIoYm9vaz09IkVtbWEiKSAlPiUgCiAgbXV0YXRlKHNlY3Rpb24gPSByb3dfbnVtYmVyKCkgJS8lIDIwKSAlPiUKICBmaWx0ZXIoc2VjdGlvbiA+IDApICU+JQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkgJT4lCiAgZmlsdGVyKCF3b3JkICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgCiAgaW5uZXJfam9pbihnZXRfc2VudGltZW50cygibnJjIikpICU+JSAKICBmaWx0ZXIoc2VudGltZW50PT0icG9zaXRpdmUiKSAlPiUgCiAgZ3JvdXBfYnkod29yZCkgJT4lCiAgZmlsdGVyKG4oKSA+PSAxMCkgJT4lICAgIyB0cnkgbG93ZXIgbnVtYmVycyBhbmQgc2VlIHdoYXQgaGFwcGVucwogIHBhaXJ3aXNlX2Nvcih3b3JkLCBzZWN0aW9uLCBzb3J0ID0gVFJVRSkgJT4lIAogIGhlYWQoMikKYGBgCgo+IE5vdGU6IHRoaXMgcXVpeiBnYXZlIGRpZmZlcmVudCByZXN1bHQgaW4gbW9zdCBvZiBzdHVkZW50IGNvbXB1dGVycywgbW9zdCBsaWtlbHkgZHVlIHRvIGEgYnVnIGluIGB3aWR5cmAgcGFja2FnZQoKIyBRdWl6MiAtIEdyb3VwMgoKRmluZCB0aGUgdG9wIDIgaGlnaGVzdCBjb3JyZWxhdGVkIHdvcmRzIHdpdGggdGhlIHdvcmQgYHByaWRlYCBpbiBzZWN0aW9ucyBvZiAyMCBsaW5lcyBpbiBhbGwgSmFuZSBBdXN0ZW4gYm9va3MuIChXb3JkIGNvdW50IHRocmVzaG9sZCBpcyAxMCkKCkV4cGVjdGVkIG91dHB1dDoKCnwgaXRlbTEgfCBpdGVtMiB8IGNvcnJlbGF0aW9uIHwKfC0tLS0tLS18LS0tLS0tLXwtLS0tLS0tLS0tLS0tfAp8IHByaWRlIHwgICAgICAgfCAgIDAuMjMwMSAgICB8CnwgcHJpZGUgfCAgICAgICB8ICAgMC4xODE3ICAgIHwKCiMjIFNvbHV0aW9uCgpgYGB7ciBzb2x1dGlvbi1xejItZ3IyfQpsaWJyYXJ5KGphbmVhdXN0ZW5yKQpsaWJyYXJ5KHRpZHl0ZXh0KQpsaWJyYXJ5KHdpZHlyKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKCmF1c3Rlbl9ib29rcygpICU+JQogIG11dGF0ZShzZWN0aW9uID0gcm93X251bWJlcigpICUvJSAyMCkgJT4lCiAgZmlsdGVyKHNlY3Rpb24gPiAwKSAlPiUKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpICU+JQogIGZpbHRlcighd29yZCAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lIAogIGdyb3VwX2J5KHdvcmQpICU+JQogIGZpbHRlcihuKCkgPj0gMTApICU+JSAgICMgdHJ5IGxvd2VyIG51bWJlcnMgYW5kIHNlZSB3aGF0IGhhcHBlbnMKICBwYWlyd2lzZV9jb3Iod29yZCwgc2VjdGlvbiwgc29ydCA9IFRSVUUpICU+JSAKICBmaWx0ZXIoaXRlbTE9PSJwcmlkZSIpICU+JSAKICBoZWFkKDIpCmBgYAoKPiBOb3RlOiB0aGlzIHF1aXogZ2F2ZSBkaWZmZXJlbnQgcmVzdWx0IGluIG1vc3Qgb2Ygc3R1ZGVudCBjb21wdXRlcnMsIG1vc3QgbGlrZWx5IGR1ZSB0byBhIGJ1ZyBpbiBgd2lkeXJgIHBhY2thZ2UKCiMgUXVpejMgLSBHcm91cDEgYW5kIEdyb3VwMgoKUGxlYXNlIG1ha2UgYSBsaW5lYXIgbW9kZWwgb2YgZGlzcGxhY2VtZW50IHZlcnN1cyBtcGcgCmZyb20gbXRjYXJzIGBsbShtcGcgfiBkaXNwKWAgYW5kIGFuc3dlciB0aGUgcXVlc3Rpb25zOgoKYS4gaWYgYSBjYXIgaGFzIDI1MCBkaXNwbGFjZW1lbnQsIHdoYXQgd291bGQgaXRzIG1wZyBiZT8KCmIuIHdoaWNoIGNhciBoYXMgdGhlIGhpZ2hlc3QgKHBvc2l0aXZlKSByZXNpZHVhbD8KCmMuIGlmIGxtKG1wZyB+IGN5bCkgaGFzIFIgc3F1YXJlZCBvZiAwLjcyNiwgd2hpY2ggbW9kZWwsCmBkaXNwYCBvciBgY3lsYCBpcyBiZXR0ZXIgZm9yIHByZWRpY3RpbmcgYG1wZ2AKCiMjIFNvbHV0aW9uCgpgYGB7ciBzb2x1dGlvbi1xejN9CmxpYnJhcnkoYnJvb20pCgojIGEKbG1maXQgPC0gbG0obXBnIH4gZGlzcCwgbXRjYXJzKQpkdW1teSA8LSBkYXRhX2ZyYW1lKGRpc3A9MjUwKQpwcmVkaWN0KGxtZml0LGR1bW15KQoKIyBiCmF1Z21lbnQobG1maXQpICU+JSAKICBhcnJhbmdlKC0ucmVzaWQpICU+JSAKICBoZWFkKDEpCgojIHJlc2lkdWFsIGlzIGRpZmZlcmVuY2UgYmV0d2VlbiBhY3R1YWwgdmFsdWUgYW5kIHByZWRpY3RlZCB2YWx1ZSAoYWN0dWFsLXByZWRpY3Rpb24pCiMgdGh1cyBDb3JvbGxhIGhhcyA3LjIgbW9yZSBtaWxlYWdlIHRoYW4gZXhwZWN0ZWQgbWlsZWFnZSBmb3IgaXRzIGRpc3BsYWNlbWVudAoKIyBjCnN1bW1hcnkobG1maXQpJHIuc3F1YXJlZCAgICNPUiBnbGFuY2UobG1maXQpCgojIGhpZ2hlciBSLXNxdWFyZWQgbWVhbnMgYmV0dGVyIGZpdC9tb2RlbApgYGAKCiMgUXVpejQgLSBHcm91cDEgYW5kIEdyb3VwMgoKUGxlYXNlIGRvIHR3byBkaWZmZXJlbnQga21lYW5zIGNsdXN0ZXIgb24gbXRjYXJzCgoxLiByZWd1bGFyIGRhdGEsICAzIGNsdXN0ZXJzLCAxMCBzdGFydHMKMi4gc2NhbGVkIGRhdGEsIDMgY2x1c3RlcnMsIDEwIHN0YXJ0cwoKdGhlbiBzaG93IHRoZSB0YWJsZSBvZiBjbHVzdGVyIG1hdGNoZXMKCnwgICB8IDEgfCAyIHwgMyB8CnwtLS18LS0tfC0tLXwtLS18CnwgMSB8ICAgfCAgIHwgICB8CnwgMiB8ICAgfCAgIHwgICB8CnwgMyB8ICAgfCAgIHwgICB8CgojIyBTb2x1dGlvbgoKYGBge3Igc29sdXRpb24tcXo0fQpzZXQuc2VlZCgxKQprMSA8LSBrbWVhbnMobXRjYXJzLCBjZW50ZXJzPSAzLCBuc3RhcnQgPSAxMCkKazIgPC0ga21lYW5zKHNjYWxlKG10Y2FycyksIGNlbnRlcnM9IDMsIG5zdGFydCA9IDEwICkKCnRhYmxlKGsxJGNsdXN0ZXIsIGsyJGNsdXN0ZXIpCmBgYA==