Montag corpus

# Read the raw corpus into a data frame with one row per line of the file,
# held in a single `text` column. The column is named explicitly (rather than
# renaming the "." column that piping into as.data.frame() produces) and is
# kept as character regardless of the R version's stringsAsFactors default.
d_raw <- data.frame(text = read_lines("100Books.txt"),
                    stringsAsFactors = FALSE)

# Lookup table of book titles: one row per line that begins with "Title"
# (case-insensitive), numbered in order of appearance in the file.
book_titles <- d_raw %>%
  filter(str_detect(text, regex("^Title", ignore_case = TRUE))) %>%
  # row_number() stays valid when no line matches (1:n() would give c(1, 0)
  # and fail). The "Title: " prefix is stripped with the same
  # case-insensitivity used to detect it, so "TITLE: x" is cleaned as well.
  transmute(book_id = row_number(),
            title = str_replace(text,
                                regex("^Title: ", ignore_case = TRUE),
                                ""))

# Lookup table of authors: one row per line that begins with "Author"
# (case-insensitive), numbered in order of appearance in the file.
# NOTE(review): ids are assigned by position, so they only line up with
# book_titles if every book has exactly one Author line -- confirm against
# the corpus file.
author_names <- d_raw %>%
  filter(str_detect(text, regex("^Author", ignore_case = TRUE))) %>%
  # row_number() stays valid when no line matches (1:n() would give c(1, 0)
  # and fail). The "Author: " prefix is stripped with the same
  # case-insensitivity used to detect it.
  transmute(book_id = row_number(),
            author = str_replace(text,
                                 regex("^Author: ", ignore_case = TRUE),
                                 ""))

# Attach book metadata to every line of the corpus and keep only content
# lines. The result has one row per non-empty text line, with columns
# book_id, title, author, line_number (restarting at 1 within each book) and
# text.
d_clean <- d_raw %>%
  # Every "Title" line opens a new book, so the running count of title lines
  # seen so far is the book id of each line that follows it.
  mutate(book_id = cumsum(str_detect(text,
                                     regex("^Title", ignore_case = TRUE)))) %>%
  # Join keys are spelled out so the joins cannot silently change if the
  # lookup tables ever gain another shared column.
  left_join(book_titles, by = "book_id") %>%
  left_join(author_names, by = "book_id") %>%
  # Drop the metadata lines themselves and blank lines.
  filter(!str_detect(text, regex("^Title", ignore_case = TRUE)),
         !str_detect(text, regex("^Author", ignore_case = TRUE)),
         text != "") %>%
  group_by(book_id) %>%
  mutate(line_number = row_number(),
         text = as.character(text)) %>%
  ungroup() %>%
  select(book_id, title, author, line_number, text)

# Tokenise the text into one row per word, then remove stop words. The
# anti-join key is given explicitly instead of relying on the natural join.
tidy_books <- d_clean %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

Glasgow norms

(Gender + others)

Overall distribution of words in the Glasgow dataset:

# Load the Glasgow norms file and keep the word column plus every column
# whose name contains "_M", with GEND_M placed first among them.
norms <- select(
  read_csv("GlasgowNorms.csv"),
  word, GEND_M, contains("_M")
)
  
# Histogram of word ratings across the whole Glasgow dataset, one facet per
# norm. The data are reshaped to long form (one row per word x norm) first.
norms %>%
  # Exclude the id column by name rather than by position.
  gather("norm", "rating", -word) %>%
  ggplot(aes(x = rating, fill = norm)) +
  ggtitle("Overall distribution of words in the Glasgow corpus") +
  # 30 is ggplot2's default bin count; stating it keeps the plot unchanged
  # and avoids the "pick better value" message.
  geom_histogram(bins = 30) +
  facet_wrap(~norm) +
  scale_x_continuous(name = "mean rating",
                     limits = c(1, 7),
                     breaks = 1:7) +
  theme(legend.position = "none")

# Mean rating of each norm over all words in the Glasgow dataset; one row per
# norm with columns norm and mean_rating.
overall_means <- norms %>%
  gather("norm", "rating", -word) %>%
  group_by(norm) %>%
  # na.rm = TRUE so a single missing rating cannot turn a norm's mean into NA.
  summarize(mean_rating = mean(rating, na.rm = TRUE))
# Attach the Glasgow ratings to every corpus token; tokens whose word is not
# in the norms keep their row and get NA ratings (left join).
tidy_with_norms <- tidy_books %>%
  left_join(norms, by = "word")

# Proportion of distinct corpus words (types) with no Glasgow rating.
# A missing AROU_M is used as the marker for "word not in the norms".
prop_na_type <- tidy_with_norms %>%
  distinct(word, .keep_all = TRUE) %>%
  summarize(prop_na = sum(is.na(AROU_M)) / n())
  

# Proportion of corpus word tokens (every occurrence counted) whose AROU_M
# rating is missing after the join with the norms.
prop_na_token <- summarize(
  tidy_with_norms,
  prop_na = sum(is.na(AROU_M)) / n()
)

In the Montag corpus, a proportion of 0.71 of the word types and 0.56 of the word tokens are missing from the Glasgow dataset. That’s a lot! These proportions exclude stop words.

Mean ratings by book

They’re sorted here from most feminine to most masculine.

# Per-book summary of every norm, computed over the tokens that have ratings.
norm_means <- tidy_with_norms %>%
  # Keep only tokens found in the norms (missing AROU_M marks an unmatched
  # word).
  filter(!is.na(AROU_M)) %>%
  # Reshape to one row per token x norm. The five id columns are excluded by
  # name so the reshape does not depend on column order.
  gather("norm", "value", -book_id, -title, -author, -line_number, -word) %>%
  group_by(title, norm) %>%
  # NOTE(review): multi_boot_standard is defined outside this file; the code
  # below relies on it returning a `mean` column per title x norm -- confirm
  # what else it returns.
  multi_boot_standard(col = "value")

# Wide table of per-book means: one row per title, one column per norm,
# sorted by ascending GEND_M.
norm_means_wide <- norm_means %>%
  # Drop any grouping left by the summary step and select `title` explicitly,
  # instead of relying on dplyr re-adding it as a grouping column.
  ungroup() %>%
  select(title, norm, mean) %>%
  spread(norm, mean) %>%
  select(title, GEND_M, everything()) %>%
  arrange(GEND_M)

# Render the per-book means as an HTML table inside a fixed-size scroll box.
norm_means_wide %>%
  kable(format = "html") %>%
  kable_styling() %>%
  scroll_box(width = "900px", height = "400px")
title GEND_M AOA_M AROU_M CNC_M DOM_M FAM_M IMAG_M SIZE_M VAL_M
Are You My Mother? 2.879871 1.982348 4.730443 6.231887 5.186159 6.261409 6.407782 3.671593 6.554930
I’m a Big Sister 3.103378 2.089367 5.124778 5.452287 5.130496 6.274178 5.679209 3.794165 6.487039
Olivia 3.150059 2.078612 5.111709 5.067576 5.174948 6.469619 5.604781 4.565805 6.840743
The Napping House 3.216098 1.984250 4.413992 6.511882 5.168945 6.321110 6.634562 3.314182 6.235493
The Keeping Quilt 3.218312 2.509381 5.163754 5.508935 5.258974 6.171731 5.892695 4.243024 6.433777
Froggy Goes to Bed 3.268729 2.058830 4.749232 5.408907 5.270314 6.426044 5.862726 4.123032 6.347890
The Gardener 3.304785 2.526293 4.972109 4.701747 5.231332 6.094612 5.076769 4.249203 6.531361
Goodnight Moon 3.308257 2.162133 4.498900 5.808663 5.223987 6.161727 6.023600 3.643257 5.805837
Chrysanthemum 3.321298 2.397101 4.879673 5.185603 5.181773 6.180429 5.491685 4.041037 6.295210
Guess How Much I Love You 3.339215 2.074413 5.705767 4.872810 5.187974 6.500077 5.626562 4.847974 6.682480
That Is Not a Good Idea! 3.443810 2.694332 5.326655 4.135081 5.682810 6.191413 4.605871 4.402723 6.609716
Love You Forever 3.449142 1.997692 5.066229 5.361519 5.152816 6.448902 5.786263 4.352145 6.426892
Blueberries for Sal 3.450261 2.333277 4.754171 5.437453 5.031806 5.998254 5.660260 4.282521 5.958398
The Very Hungry Caterpillar 3.456509 2.242612 4.746448 5.870112 5.346355 6.212930 6.192767 3.111900 6.096612
The Other Side 3.475363 2.446223 5.023791 5.197628 5.292035 6.034397 5.649253 4.250019 6.115749
Goldilicious 3.497268 2.681417 5.299192 4.804519 5.291810 5.821343 5.327056 4.176464 6.218912
Bunny Party 3.525550 2.541211 4.515424 5.624957 5.081531 5.914953 5.985942 3.384690 5.611593
Miss Rumphius 3.526887 2.473838 5.087463 5.204692 5.300889 6.146247 5.655902 4.534586 6.204991
Madeline 3.570897 2.298021 4.950416 5.314637 5.031940 6.252868 5.774436 4.127324 5.599351
No, David! 3.588456 2.366422 5.410478 4.422000 5.480678 6.257367 4.571144 4.164811 6.404989
If You Give a Mouse a Cookie 3.592555 2.492816 4.291004 5.511375 5.376720 6.063443 5.672280 3.350773 5.570984
The Runaway Bunny 3.599798 2.205602 4.795622 5.936251 5.444356 5.951631 6.107491 3.919160 6.271744
A Bad Case of Stripes 3.600147 2.779837 4.802784 4.928138 5.002743 6.109106 5.202277 4.065205 5.475596
Winter Days in the Big Woods 3.603702 2.437101 4.700574 5.304132 5.240619 6.152938 5.613899 3.960563 5.795044
The Little House 3.606224 2.430056 4.978716 5.360449 5.296437 6.329488 5.814937 4.644129 5.994435
George and Martha 3.610204 2.518593 4.735402 4.965436 5.366871 6.054468 5.289184 3.613066 5.776062
Lilly’s Purple Plastic Purse 3.623605 2.590626 4.921749 4.916743 5.109928 6.159762 5.231384 4.078727 5.846000
The Day the Crayons Quit 3.638812 2.318237 4.918208 5.017775 5.312439 6.134921 5.550484 3.870408 5.951026
Bread and Jam for Frances 3.639618 2.303836 4.434378 5.672922 5.189734 6.237732 5.908904 3.331147 5.937306
Llama Llama Red Pajama 3.642517 3.005633 4.483658 5.951799 5.170346 5.348203 6.243872 3.739282 5.832249
Charlie and the New Baby 3.661958 2.622219 4.861391 4.552031 5.024608 6.158943 4.858067 4.533146 5.990233
Maisy Goes to the Library 3.663787 2.191772 4.837879 5.540374 5.613551 6.342323 5.715930 3.706523 6.228900
Corduroy 3.672295 2.381225 4.724137 5.606351 5.304488 6.143871 5.903399 4.178999 5.849963
Ladybug Girl at the Beach 3.680304 2.495313 5.078944 5.316366 5.204298 6.037101 5.619229 4.325341 6.062864
If You Give a Moose a Muffin 3.693568 2.845121 4.127418 5.400771 5.122015 5.825194 5.486915 3.476629 5.339285
Dragons Love Tacos 3.697023 2.855474 5.817548 4.586335 5.330191 5.977286 5.179218 4.393903 5.961768
The Story of Babar 3.698835 2.615915 5.172171 5.041435 5.371864 6.098174 5.498804 4.463075 5.990737
The Hat 3.717974 2.350077 4.799771 5.389348 5.343312 6.080985 5.683290 3.797866 5.851904
The Cat in The Hat 3.718268 2.071545 4.534855 5.497236 5.290339 6.176746 5.833292 3.635524 5.679916
Olivia. . . and the Missing Toy 3.728953 2.120322 4.915203 5.306595 5.208822 6.179957 5.689283 4.053422 6.063402
When Dinosaurs came with everything 3.735169 2.403213 4.726022 5.144559 5.297357 6.158292 5.405267 4.018446 5.735076
Stellaluna 3.741456 2.552927 4.753029 5.207499 5.130343 5.989660 5.479317 3.907528 5.598039
Knuffle Bunny 3.748156 2.391656 4.205888 5.140192 5.032328 5.975796 5.476072 4.212860 5.619156
Show Dog 3.755079 2.447858 5.397989 5.256034 5.645063 6.125705 5.617972 3.896450 6.283912
Arthur Writes a Story 3.759676 2.511993 5.205109 4.641862 5.591221 6.197965 5.000187 4.356074 6.315461
Harry the Dirty Dog 3.793141 2.200187 4.906271 5.195354 5.386329 6.358506 5.656996 3.985316 5.817326
The Tale of Peter Rabbit 3.796699 2.428412 4.487753 5.307069 5.069957 6.123512 5.577879 3.760844 5.485631
The Pigeon Finds a Hot Dog! 3.802503 2.719959 5.907110 4.688765 5.921459 6.261655 4.968810 4.053700 6.738910
Bark, George 3.803778 2.458231 4.737374 5.144633 5.006693 5.825816 5.464453 4.286133 5.664525
Arnie the Doughnut 3.805446 2.705824 4.881735 4.763236 5.281375 6.097876 5.039313 3.969472 5.784246
Good Night Gorilla 3.828070 2.197204 5.210770 5.278559 5.352537 6.284204 6.366881 5.222885 5.882911
The True Story of the 3 little pigs! 3.839225 2.302066 4.698364 5.230486 5.112881 6.151233 5.395213 3.818195 5.550903
Angelina Ice Skates 3.840994 2.575502 5.456192 4.962255 5.288398 6.019411 5.481790 4.408659 5.708934
Own Moon 3.857728 2.353969 4.729061 5.056184 4.976064 6.207420 5.493563 4.186062 5.468881
Llama llama home with mama 3.861944 2.874577 4.189732 5.669177 5.070894 5.648668 5.892945 3.844761 5.433192
Brown Bear, Brown Bear, What Do You See? 3.870788 1.956304 4.600795 5.910995 5.233253 6.268884 6.493820 3.553071 5.749784
Curious George Takes a Job 3.878306 2.562943 5.004884 5.118078 5.196054 6.153117 5.452422 4.352235 5.791660
Sylvester and the Magic Pebble 3.880423 2.565943 4.935594 5.040011 5.203579 6.073406 5.476945 4.243735 5.820365
Chicka Chicka 1-2-3 3.882694 2.243694 5.038219 5.006081 5.283811 6.209044 5.461111 4.297781 5.722994
The snowy Day 3.912584 2.322279 5.040917 5.599022 5.246695 6.090645 6.003909 4.189419 5.758986
The Berenstain Bears and the Green-Eyed Monster 3.919837 2.323256 5.094096 5.173877 5.577699 6.285029 5.660875 4.326474 6.211390
The Polar Express 3.920272 2.736671 4.742018 5.281853 5.138463 5.935973 5.503872 4.103306 5.702445
Make Way for Ducklings 3.930766 2.591968 4.643307 5.150114 5.190308 6.010672 5.307334 4.192636 5.764136
How to Train a Train 3.956575 2.475112 4.934430 4.563371 5.309672 6.224243 4.946721 4.490934 5.872781
The Lorax 3.959208 2.724175 4.697856 4.610853 5.134621 6.060518 4.936405 4.175701 5.364925
How Do Dinosaurs Say Good Night? 3.960200 2.239476 5.455509 5.464982 5.452345 6.094282 5.996030 4.641536 5.744245
Bear Wants More 3.969614 2.566654 4.856542 5.562244 5.180375 5.835007 5.793840 3.917210 5.595190
A Sick Day for Amos McGee 3.974591 2.414804 4.740794 5.449071 5.174104 6.011820 5.719473 3.973918 5.587451
Alexander and the Terrible, Horrible, No Good, Very Bad Day 3.981589 2.426658 4.803243 5.019414 5.006770 6.248085 5.314369 4.143863 5.018539
Cloudy With a Chance of Meatballs 3.992663 2.530938 4.752740 5.320130 5.014543 6.202190 5.541864 4.093104 5.531803
Curious George 3.995664 2.386339 4.842073 5.192259 5.211394 6.101234 5.539326 4.076225 5.479910
The Berenstain Bears Forget Their Manners 3.999720 2.729148 4.640387 4.716552 5.249532 6.017779 5.006963 4.268844 5.353006
Harold and the Purple Crayon 4.003790 2.383317 4.699695 5.550149 5.071673 6.111456 5.803965 4.303050 5.653373
There’s an Alligator Under My Bed 4.010175 2.466112 4.657203 5.650378 5.020466 6.055713 5.763966 3.809456 5.564981
Oh, the Places You’ll Go 4.051500 2.739320 5.058751 4.465362 5.335014 5.974333 4.854330 4.294310 5.501745
Horton Hears a Who! 4.057146 2.817751 4.759080 4.939367 5.173266 5.872778 5.079839 4.110376 5.262768
Duck on a Bike 4.065606 2.101797 4.578267 6.067242 5.390860 5.876964 6.227335 3.466038 5.771536
The Paper Bag Princess 4.098313 2.470772 5.286769 5.057650 5.237548 5.965694 5.598914 4.437180 5.484196
Where the Wild Things Are 4.106162 2.619646 5.703213 4.673569 5.178954 6.063539 5.159253 4.671764 5.179461
The Duckling gets a cookie!? 4.116222 2.581544 4.535326 5.089948 5.271348 6.003459 5.263281 3.761485 5.868193
The Story of Ferdinand 4.143433 2.640733 4.947643 5.076806 5.319862 5.929817 5.418011 4.006192 5.080484
Clifford at the Circus 4.156323 2.698451 4.878131 5.315628 5.112783 5.837976 5.554636 4.057277 5.302236
The Little Engine That Could 4.168277 2.348698 4.959819 5.625008 5.529076 6.006844 5.943951 4.177009 5.894237
Don’t Let the Pigeon Drive the Bus 4.169540 2.699108 4.361184 5.032128 5.436964 6.001124 5.306232 4.156528 5.826080
Chicka Chicka Boom Boom 4.216716 2.493509 5.068942 4.985553 5.266605 5.945772 5.322188 4.554191 5.377847
This is Not My Hat 4.233072 2.619700 4.572122 4.118511 5.174706 6.100822 4.539189 4.161200 4.961150
Train 4.237143 2.665114 4.506199 5.689424 5.172233 6.029113 5.900524 4.371434 5.603390
Mike Mulligan and his Steam Shovel 4.278656 2.732265 4.465279 5.344629 5.114557 6.024510 5.545740 4.221812 5.429473
Green Eggs and Ham 4.298217 1.932838 4.572838 6.146496 5.296555 6.439499 6.356887 3.884836 5.658373
Caps for Sale 4.312667 2.269363 4.380586 5.388035 5.419624 6.188985 5.899287 3.841615 5.323349
Dear Zoo 4.369700 2.459812 5.536562 4.261213 5.618988 5.930162 4.877237 4.434775 4.955662
Click, Clack, Moo Cows that Type 4.383226 2.721035 3.936421 5.308230 5.353345 5.894016 5.471048 3.637914 5.216732
The Giving Tree 4.383520 2.124385 4.775152 5.369548 5.418615 6.277986 5.706312 4.352319 5.994998
Maisy Goes Camping 4.409527 2.721009 4.398977 5.459523 5.303373 5.872255 5.808691 3.766491 5.377218
Pete the Cat: The Wheels on the Bus 4.412219 2.028946 3.657239 6.478211 5.088477 6.473200 6.467965 4.198854 5.384435
The Grouchy Ladybug 4.472176 2.608950 5.218272 5.474086 5.651815 5.818219 5.754042 4.245812 5.031170
The Carrot Seed 4.487731 2.092119 4.469981 5.942337 5.007688 6.320069 6.283181 4.035025 5.701275
Little Blue Truck Leads the Way 4.503360 2.625326 4.793552 5.112463 5.226235 6.127606 5.570459 4.409746 5.451022
Dinosaur Rescue 4.611310 2.627396 4.752250 4.472894 4.869712 5.974671 4.856633 4.760427 5.313483
Trashy Town 4.735243 2.920319 4.246592 5.486077 4.969766 5.713066 5.793869 4.218017 4.413351
# Histogram of per-book mean ratings, one facet per norm, with a dashed
# vertical line at that norm's mean over the whole Glasgow dataset.
norm_means %>%
  # Explicit key; the former group_by(norm) before ggplot() had no effect on
  # the plot and is dropped.
  left_join(overall_means, by = "norm") %>%
  ggplot(aes(x = mean, fill = norm)) +
  # 30 is ggplot2's default bin count, stated explicitly.
  geom_histogram(bins = 30) +
  facet_wrap(~norm) +
  ggtitle("mean rating by book") +
  geom_vline(aes(xintercept = mean_rating), linetype = 2) +
  scale_x_continuous(name = "mean rating",
                     limits = c(1, 7),
                     breaks = 1:7) +
  theme(legend.position = "none")

SCALE LABELS: (left to right)

Surprisingly, books are overall slightly more feminine than the overall sample of words. All the other norms look relatively sensible.

Are the properties of books correlated?

Correlation matrix (color indicates significance at the .05 level)

# Pairwise correlations between the per-book norm means. The first column
# (title) is dropped so that only the numeric norm columns remain.
corr_mat <- cor(norm_means_wide[, -1],
                use = "pairwise.complete.obs")

# Matrix of p-values for the same pairs of columns. cor.mtest() forwards
# extra arguments to cor.test(), which has no `use` argument and silently
# ignored it, so it is no longer passed.
p.mat <- corrplot::cor.mtest(norm_means_wide[, -1])$p

# Palette of 100 colours interpolated red -> white -> blue, then reversed so
# the vector starts at blue and ends at red.
cols <- rev(
  colorRampPalette(c("red", "white", "blue"))(100)
)

# Draw the upper triangle of the correlation matrix as coloured cells with
# the coefficients printed in black. Cells that corrplot treats as
# non-significant according to p.mat are left blank.
corrplot::corrplot(
  corr_mat,
  method = "color",
  type = "upper",
  order = "original",
  diag = FALSE,
  col = cols,
  p.mat = p.mat,
  insig = "blank",
  addCoef.col = "black",
  number.cex = .7,
  tl.col = "black",
  tl.srt = 90
)

More masculine books have words that tend to be learned later and to be less familiar, more negatively valenced, and lower in arousal.

A regression predicting the gender of the book with all other norms:

# Linear regression of each book's mean gender rating on its means for all
# the other norms; prints the model summary. `data` is spelled out in full
# rather than relying on partial matching of `d`.
lm(GEND_M ~ AOA_M + AROU_M + CNC_M + DOM_M + FAM_M +
     IMAG_M + SIZE_M + VAL_M,
   data = norm_means_wide) %>%
  summary()
## 
## Call:
## lm(formula = GEND_M ~ AOA_M + AROU_M + CNC_M + DOM_M + FAM_M + 
##     IMAG_M + SIZE_M + VAL_M, data = norm_means_wide)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43667 -0.11362 -0.00394  0.10384  0.68349 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.11896    1.72910   1.225   0.2236    
## AOA_M       -0.01639    0.15737  -0.104   0.9173    
## AROU_M      -0.18027    0.08465  -2.130   0.0359 *  
## CNC_M        0.38392    0.17900   2.145   0.0346 *  
## DOM_M        0.78014    0.12872   6.061 3.03e-08 ***
## FAM_M        0.06719    0.18126   0.371   0.7117    
## IMAG_M      -0.31607    0.17696  -1.786   0.0774 .  
## SIZE_M       0.43638    0.07778   5.610 2.16e-07 ***
## VAL_M       -0.67743    0.05962 -11.362  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1925 on 91 degrees of freedom
## Multiple R-squared:  0.7278, Adjusted R-squared:  0.7039 
## F-statistic: 30.41 on 8 and 91 DF,  p-value: < 2.2e-16