This project analyzes a set of books that have been banned or challenged, according to the American Library Association, compared against a non-banned book that serves as a control. I chose “Frankenstein” as the control because it was the most downloaded book on Project Gutenberg over the past 30 days. For the banned titles, I selected the three most downloaded books from the banned-books bookshelf at https://gutenberg.org/ebooks/bookshelf/422: “The Adventures of Huckleberry Finn” by Mark Twain, “Ulysses” by James Joyce, and “The Awakening and Other Short Stories” by Kate Chopin. I will first look at each book’s most common words to see whether any particular words raise a red flag as ban-worthy. Then, to strengthen the evidence, I will determine the sentiment values of the harshest words to see whether the amount of harsh language is the reason for the banning. In the AFINN sentiment lexicon, negative values mark harsh words, with lower values being harsher; I will use only the -4 and -3 words, since these are the most useful for seeing the harshest language. Finally, to measure the rate of negative words in each book, I will divide the number of -4 and -3 words by the total number of words in the book to get a ratio; a short sketch of that calculation follows.
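To make the planned ratio concrete, here is a minimal sketch in R. It assumes a tokenized data frame called book_words with one word per row (like the *_words data frames built later in this report), and the helper name negative_word_ratio is mine rather than part of the analysis below.
# Sketch only: ratio of AFINN -4 and -3 words to all words in a book.
# Assumes `book_words` has one word per row, as produced by unnest_tokens()
# below, and that the AFINN lexicon is available via the textdata package.
library(dplyr)
library(tidytext)
negative_word_ratio <- function(book_words) {
  harsh_n <- book_words %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    filter(value %in% c(-4, -3)) %>%
    nrow()
  harsh_n / nrow(book_words)
}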
First I’ll load the packages.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(textdata)
I start with the control book, “Frankenstein”.
library(readr)
Frankenstein <- read_csv("C:/Users/hamme/OneDrive/Desktop/R - Media/Text/Frankenstein.txt",
col_names = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 6387 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
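A note on the parsing warning above: read_csv() treats commas inside the prose as field delimiters, which is what triggers the parsing issues and may drop text that follows a comma on a line. A possible alternative (my suggestion, not the approach used in this report) would be to read the raw lines directly:
# Alternative loading sketch (assumption, not the original workflow): read
# each line of the text file as-is so embedded commas are preserved.
library(readr)
library(tibble)
Frankenstein_raw <- tibble(X1 = read_lines("C:/Users/hamme/OneDrive/Desktop/R - Media/Text/Frankenstein.txt"))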
View(Frankenstein)
Frankenstein %>%
unnest_tokens(word, X1) -> Frankenstein_words
Frankenstein_words %>%
count(word, sort =TRUE)
## # A tibble: 7,071 x 2
## word n
## <chr> <int>
## 1 the 4194
## 2 and 2976
## 3 i 2846
## 4 of 2642
## 5 to 2089
## 6 my 1776
## 7 a 1391
## 8 in 1128
## 9 was 1021
## 10 that 1017
## # ... with 7,061 more rows
This shows the ten most common words, which are mostly stop words, so next I remove the stop words to find the most common meaningful words.
Frankenstein_words %>%
anti_join(stop_words) %>%
count(word, sort=TRUE) %>%
head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 life 115
## 2 father 112
## 3 eyes 104
## 4 time 98
## 5 night 90
## 6 elizabeth 88
## 7 found 87
## 8 mind 85
## 9 heart 81
## 10 day 80
This shows the -4 words for the book.
Frankenstein_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-4)) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Frankenstein Number of -4 Words" )+
xlab("Words") + ylab("Number of Times used")+ coord_flip()
## Joining, by = "word"
This shows the -3 words for the book.
Frankenstein_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3)) -> FR_THREE
## Joining, by = "word"
view(FR_THREE)
Frankenstein_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3)) %>%
head(15)%>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Top 15 Frankenstein -3 Words" )+
xlab("Words") + ylab("Number of Times used")+ coord_flip()
## Joining, by = "word"
Now I repeat the same process for “The Adventures of Huckleberry Finn”.
library(readr)
Huck <- read_csv("C:/Users/hamme/OneDrive/Desktop/R - Media/Text/Huck.txt",
col_names = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 9099 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Huck)
Huck %>%
unnest_tokens(word, X1) -> Huck_words
Huck_words %>%
count(word, sort =TRUE)
## # A tibble: 6,595 x 2
## word n
## <chr> <int>
## 1 and 6351
## 2 the 4787
## 3 i 3272
## 4 a 3148
## 5 to 2933
## 6 it 2326
## 7 was 2042
## 8 he 1657
## 9 of 1636
## 10 in 1433
## # ... with 6,585 more rows
Huck_words %>%
anti_join(stop_words) %>%
count(word, sort=TRUE) %>%
head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 jim 335
## 2 time 324
## 3 warn't 290
## 4 de 252
## 5 en 235
## 6 tom 214
## 7 pretty 160
## 8 nigger 155
## 9 king 148
## 10 told 144
Huck_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-4))%>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Huck Finn Number of -4 Words" )+
ylab("Words") + xlab("Number of Times used")+coord_flip()
## Joining, by = "word"
Huck_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3))-> HF_Three
## Joining, by = "word"
view(HF_Three)
Huck_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3)) %>%
head(15) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Top 15 Huck Finn -3 Words" )+
xlab("Words") + ylab("Number of Times used")+ coord_flip()
## Joining, by = "word"
Next, the same analysis for “Ulysses”.
library(readr)
Ulysses <- read_csv("C:/Users/hamme/OneDrive/Desktop/R - Media/Text/Ulysses.txt",
col_names = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 25454 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Ulysses)
Ulysses %>%
unnest_tokens(word, X1) -> Ulysses_words
Ulysses_words %>%
count(word, sort =TRUE)
## # A tibble: 30,644 x 2
## word n
## <chr> <int>
## 1 the 14902
## 2 of 8141
## 3 and 7207
## 4 a 6493
## 5 to 4954
## 6 in 4930
## 7 he 4029
## 8 his 3328
## 9 i 2680
## 10 that 2603
## # ... with 30,634 more rows
Ulysses_words %>%
anti_join(stop_words) %>%
count(word, sort=TRUE) %>%
head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 bloom 933
## 2 stephen 503
## 3 time 376
## 4 eyes 329
## 5 hand 304
## 6 street 293
## 7 father 277
## 8 day 250
## 9 round 239
## 10 night 232
Ulysses_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-4))-> US_four
## Joining, by = "word"
view(US_four)
Ulysses_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-4)) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Ulysses Number of -4 Words" ) +
ylab("Words") + xlab("Number of Times used") + coord_flip()
## Joining, by = "word"
Ulysses_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3))-> US_Three
## Joining, by = "word"
view(US_Three)
Ulysses_words%>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3)) %>%
head(15) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Top 15 Ulysses -3 Words" )+
xlab("Words") + ylab("Number of Times used")+ coord_flip()
## Joining, by = "word"
Finally, the same analysis for “The Awakening and Other Short Stories”.
library(readr)
Awaken <- read_csv("C:/Users/hamme/OneDrive/Desktop/R - Media/Text/Awaken.txt",
col_names = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 5834 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Awaken)
Awaken %>%
unnest_tokens(word, X1) -> Awaken_words
Awaken_words %>%
count(word, sort =TRUE)
## # A tibble: 6,911 x 2
## word n
## <chr> <int>
## 1 the 3499
## 2 and 2096
## 3 to 1814
## 4 her 1639
## 5 of 1587
## 6 she 1558
## 7 a 1464
## 8 was 1081
## 9 in 959
## 10 he 858
## # ... with 6,901 more rows
Awaken_words%>%
anti_join(stop_words) %>%
count(word, sort=TRUE) %>%
head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 edna 287
## 2 pontellier 180
## 3 robert 149
## 4 madame 138
## 5 day 98
## 6 house 97
## 7 time 95
## 8 don’t 94
## 9 eyes 94
## 10 night 93
Awaken_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-4)) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Awakening Number of -4 Words" )+
ylab("Words") + xlab("Number of Times used")+ coord_flip()
## Joining, by = "word"
Awaken_words %>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3))-> AW_Three
## Joining, by = "word"
view(AW_Three)
Awaken_words%>%
inner_join(get_sentiments('afinn')) %>%
count(word, value,sort = TRUE) %>%
arrange(desc(n)) %>%
filter(value == c(-3)) %>%
head(15) %>%
ggplot(aes(reorder(word,n),n, fill=word))+ geom_col()+ggtitle("Top 15 Awakening -3 Words" )+
xlab("Words") + ylab("Number of Times used")+ coord_flip()
## Joining, by = "word"
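As an illustration, the ratio sketched in the introduction could be applied to the four tokenized data frames built above. The table layout here is my own; the resulting values are not shown and would depend on the texts as downloaded.
# Illustrative only: apply the negative_word_ratio() sketch from the
# introduction to the four books tokenized above.
tibble(
  book  = c("Frankenstein", "Huckleberry Finn", "Ulysses", "The Awakening"),
  ratio = c(negative_word_ratio(Frankenstein_words),
            negative_word_ratio(Huck_words),
            negative_word_ratio(Ulysses_words),
            negative_word_ratio(Awaken_words))
)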