Barplots

Now lets take a look at some ggplot2 barplots

We’ll start with making a dataframe based on the tooth data

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

And now lets make a second dataframe

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Lets load up ggplot2

library(ggplot2)

Lets set our parameters for ggplot

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = -0.3)

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white") +
  scale_color_manual(values = c("blue", "gold", "red"))

This is kinda hard to see, so lets change the fill

f + geom_col(aes(fill = dose)) + 
  scale_fill_manual(values = c("blue", "gold", "red"))

Ok how do we do this with multiple groups

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_stack()) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))
p

Now lets add those labels to the dodged barplot

p + geom_text(
  aes(label = len, group = supp),
  position = position_dodge(0.8),
  vjust = -0.3, size = 3.5
)

Now what if we want to add labels to our stacked barplots? For this we need dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now lets recreate our stacked graphs

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(fill = supp), width = 0.7) +
  geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

Boxplots

Lets look at some boxplots

data("ToothGrowth")

Lets change thr dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw() +
    theme(legend.position = "top")
)

Lets start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") +
  stat_summary(fun.y = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change the scale number of variables included, and their order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

Lets put our x axis in descending order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

We can also change our boxplot colors by groups

tg + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = c("indianred", "blue4", "green4"))

What if we want to display our data subset by oj vs vitamin C?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#999999", "#E69F00"))

tg2

We can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

This is one of my favorite functions so far ^^^

Histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 56), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now lets load dplyr

library(dplyr)

mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") +
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Dotplots

First lets load the required packages

library(ggplot2)

Lets set our theme

theme_set(
  theme_dark() +
    theme(legend.position = "top")
)

First lets initiate a ggplot object called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))

lets create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
  stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets add a boxplot and a dotplot together

tg + geom_boxplot(width = 0.5) + 
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") + 
  stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') +
  scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center',
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lineplots

Now lets change it up and look at some lineplots

We’ll start by mkaing a custom dataframe like the tooth dataset. This way we can see the lines and stuff that we’re modifying

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1,", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC  D1, 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ  D1, 10.0
## 6   OJ   D2 29.5

Now lets again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
)

Now lets do some basic line plots. First we will build a function to display all the different line types.

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch = "", ylim = c(0.,6), xlim = c(0,0.7), axes = FALSE, xlab = "", ylab = "")
  for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty = i, lwd = 3)
  text(rep(0.1,6), 0:6, labels = c("0.'Blank'", "1.'solid'", "2.'dashed'", "3.'dotted'",
                                     "4.'dotdash'", "5.'longdash'", "6,'twodash'"))
  par(mar=oldPar$mar, font = oldPar$font)
  
}

generateRLineTypes()

Now lets build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))

p + geom_line() + geom_point()

Now lets modify the line type and color

p + geom_line(linetype = "dashed", color = "steelblue") +
  geom_point(color = "steelblue")

Now lets try a step graph, which indicates a threshold type progression

p + geom_step() + geom_point()

Now lets move on to mkaing multiple groups. First we’ll make our ggplot object

p <- ggplot(df2, aes(x = dose, y = len, group = supp))

Now lets change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) +
  geom_point(aes(shape = supp, color = supp)) +
  scale_color_manual(values = c("red", "blue"))

Now lets look at line plots with a numeric x axis

df3 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("0.5", "1", "2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df3
##   supp dose  len
## 1   VC  0.5  6.8
## 2   VC    1 15.0
## 3   VC    2 33.0
## 4   OJ  0.5  4.2
## 5   OJ    1 10.0
## 6   OJ    2 29.5

Now lets plot where both axises are treated as continuous labels

df3$dose <- as.numeric(as.vector (df3$dose))
ggplot(data = df3, aes(x = dose, y = len, group = supp, color = supp)) +
  geom_line() + geom_point()

Now lets look at a line graph with having the x axis as dates. We’ll use the built in economics time series for this example.

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data = economics, aes(x = date, y = pop)) +
  geom_line()

Now lets subset the data

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

We can also change the line size, for instance by another varable like unemployment rate

ggplot(data = economics, aes(x = date, y = pop)) +
  geom_line(aes(size = unemploy/pop))

We can also plot multiple time series data

ggplot(economics, aes(x = date)) +
  geom_line(aes(y = psavert), color = "darkred") +
  geom_line(aes(y = uempmed), color = "steelblue", linetype = "twodash")

Lastly, lets make this into an area plot

ggplot(economics, aes(x=date)) +
  geom_area(aes(y = psavert), fill = "#999999",
            color = "#999999", alpha = 0.5) +
  geom_area(aes(y = uempmed), fill = "#E69F00",
            color = "#E69F00", alpha = 0.5)

Ridgeplots

library(ggplot2)
library(ggridges)

Now lets load some sample data

?airquality
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air
## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
  geom_density_ridges_gradient() + 
  scale_fill_viridis(option = "C", name = "Temp")
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all of our data.

library(tidyr)

airquality %>%
gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) +
  geom_density_ridges() + 
  facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

Density Plots

A density plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
  group_by(sex) %>%
summarise(grp.mean = mean(weight))

Now lets load the graphing package

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

d + geom_density() +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now lets change the y axis to count instead of density

a + geom_density(aes(y = stat(count)), fill = "lightgrey") +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
## Warning: `stat(count)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

d + geom_density(aes(color = sex)) +
  scale_color_manual(values = c("darkgrey", "gold"))

Lastly, lets fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") + 
  scale_color_manual(values = c("grey", "gold")) +
  scale_fill_manual(values = c("grey", "gold"))

Plotly

Plotly Line Plots

First lets load our required package

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Lets start with a scatterplot of the Orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age", age, "circ:", circumference)
)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataset

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)

We’ll use the random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or a line and add labels.

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~circumference,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
   add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "Plot or Orange data with switchable trace", 
    updatemenus = list(
      list(
        type = "downdrop",
        y = 0.8,
        buttons = list(
          list(method = "restyle",
               args = list("mode", "markers"),
               label = "Marker"
          ),
          list(method = "restyle",
               args = list("mode", "lines"),
               labels = "Lines"
        )
      )
    )
  )
)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Plotly 3D

First lets load pur required packages

library(plotly)

Now lets create a random 3D matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5),
  y <- seq(1,10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now lets plot our 3D data

plot_ly(d, x = ~x, y = ~y, z = ~z) %>%
  add_surface()

Lets add s=more aspects to it, such as topography

plot_ly(d, x = ~x, y = ~y, z = ~z) %>%
  add_surface(
    contours = list(
      z = list(
        show = TRUE,
        usecolormap = TRUE,
        highlightcolor = "FF0000",
        project = list(z = TRUE)
      )
    )
  )

Now lets look at a 3D scatter plot

plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

Error Bars

First, lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position = 'top')
)

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    len = mean(len),
    stderr = std.error(len, na.rm = TRUE)
  )

df.summary
## # A tibble: 3 × 4
##   dose     sd   len stderr
##   <fct> <dbl> <dbl>  <dbl>
## 1 0.5    4.50  10.6     NA
## 2 1      4.42  19.7     NA
## 3 2      3.77  26.1     NA

Lets now look at some key functions

  • geom_crossbar() for hollow bars with middle indicated by a horizontal line
  • geom_errorbar() for error bars
  • geom_errorbarh() for horizontal error bar
  • geom_linerange() for drawing an interval represented by a vertical line
  • geom_pointrange() for creating an interval represented by a vertical line; with a point in the middle

lets start by creating a ggplot object

tg <- ggplot(
  df.summary,
  aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
)

Now lets look at the most basic error bars

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2) +
  geom_point(size = 1.5)

Now lets create horizontal error bars by manipulating our graph

ggplot(df.summary, aes(x = len, y = dose, xmin = len-sd, xmax = len+sd)) +
  geom_point() +
  geom_errorbar(height = 0.2)
## Warning in geom_errorbar(height = 0.2): Ignoring unknown parameters: `height`

This just gives you an idea of error bars on the horizontal axis

Now lets look at adding jitter points (actual measurements) to our data.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgrey", trim = FALSE) +
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)

Now how about w/ a line graph?

ggplot(df.summary, aes(dose, len)) +
  geom_line(aes(group = 1)) + # always specidy this when you have 1 line
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr), width = 0.2) +
  geom_point(size = 2)

#Where are my error bars???

Now lets make a bar graph with half error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgrey", color = "black") +
  geom_errorbar(aes(ymin = len, ymax = len+stderr), width = 0.2)

You can see that by not specifying wmin = len-stderr, we have in essence cut our error bar in half.

How about we add jitter points to line plots? We need to use the original df for the jitter plot, and the summary df for the geom layers.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_line(aes(group = 1), data = df.summary) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr),
    data = df.summary, width = 0.2) + 
  geom_point(data = df.summary, size = 0.2)

What about adding jitterpoints to a barplot?

ggplot(df, aes(dose, len)) +
  geom_col(data = df.summary, fill = NA, color = "black") +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr),
                data = df.summary, width = 0.2)

What if we wanted to have our error bars per group? (OJ vs VC)

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise(
    sd = sd(len),
    stderr = std.error(len),
    len = mean(len)
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len)) +
  geom_pointrange(
    aes(ymin = len - stderr, ymax = len + stderr, color = supp),
    position = position_dodge(0.3)) +
  scale_color_manual(values = c("indianred", "lightblue"))

How about line plots with multiple error bars

ggplot(df.summary2, aes(dose, len)) +
  geom_line(aes(linetype = supp, group = supp)) +
  geom_point() +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), width = 0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len)) +
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr, group = supp),
    width = 0.2, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color = supp)) +
  geom_jitter(position = position_dodge(0.2)) +
  geom_line(aes(group = supp), data = df.summary2) +
  geom_point() +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), data = df.summary2, width = 0.2)

“Last one” he says

ggplot(df, aes(dose, len, color = supp)) +
  geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white") +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr), data = df.summary2,
    width = 0.2, position = position_dodge(0.8)) +
  scale_color_manual(values = c("indianred", "lightblue")) +
  theme(legend.position = "top")

ECDF Plots

Now lets do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below the threshold.

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58)))

Now lets look at our dataframe

head(wdata, 5)
##   sex   weight
## 1   F 53.79293
## 2   F 55.27743
## 3   F 56.08444
## 4   F 52.65430
## 5   F 55.42912

Now lets load our plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now lets create our ECDF plot

ggplot(wdata, aes(x = weight)) +
  stat_ecdf(aes(color = sex, linetype = sex),
            geom = "step", size = 1.5) +
  scale_color_manual(values = c("#00AFBB", "#E7B900")) +
  labs(y = "weight")

QQ Plots

Now lets look at qqplots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some data

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

create a qq plot of the weight

ggplot(wdata, aes(sample = weight)) +
  stat_qq(aes(color = sex)) +
  scale_color_manual(values = c("indianred", "lightblue")) +
  labs(y = "weight")

#Inidan red and light blue are better than mustard yellow :)
library(ggpubr)

ggqqplot(wdata, x = "weight",
         color = "sex",
         palettes = c("indianred", "lightblue"),
         ggtheme = theme_pubclean())

Now what would a norm distribution look like?

#install.packages(mnonr)

library(mnonr)

data2 <- mnonr::mnonr(n = 1000, p = 2, ms = 3, mk = 61, Sigma = matrix(c(1,0.5,0.5,1), 2,2), initial = NULL)

data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample = V1)) +
  stat_qq()

Now, normal plotted data

ggqqplot(data2, x = "V1",
         palette = "indianred",
         ggtheme = theme_pubclean())

Facet Plots

Lets look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

First lets create a nice boxplot, lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x = dose, y = len)) +
  geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("indianred", "lightblue"))

p

Now lets look at the gvgplot facet function

p + facet_grid(rows = vars(supp))

Now lets do a facet with mult. variables

p + facet_grid(rows = vars(dose), cols = vars(supp))

p

Now lets look at the facet_wrap function. This allows facets to be placed side by side.

p + facet_wrap(vars(dose), ncol = 3)

Now how do we combine multiple plots using ggarange()

lets start by making some basic plots. First we will define a color palette and data

my3cols <- c("indianred", "lightblue", "green")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = my3cols)

Now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
                       binaxis = 'y', stackdir = 'center') +
  scale_color_manual(values = my3cols) +
  scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x = date, y = psavert)) +
  geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure

This looks great, but we can make it look even better

figure2 <- ggarrange(
  lp,
  ggarrange(bxp, dp, ncol = 2, labels = c("B", "C")),
  nrow = 2,
  labels = "A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure2

ok this looks really good, but you’ll notice that there are 2 legends that are the same

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly, we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")
## file saved to facetfigure.pdf

We can also export mult. figures to a pdf

ggexport(box, lp, dp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

Lastly, we can export to pdf w/ mult. pages and mult. columns

ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol = 1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Heatmaps

Lets get started w/ heatmaps

library(heatmap3)

Now lets get our data

data <-  ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heatmap

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

Now lets play with colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now lets apply our color selections

heatmap(data2, ColSideColors = cc)

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
        col = colorRampPalette(brewer.pal(8, "PiYG")) (25))

Theres more that we can customize

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
## 
##     plotCI
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(data2, ColSideColors = cc,
          col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Outlier Detection

Missing Values

Missing Values

If you encounter an unusual value in your dataset, and simply want to move on to the rest of your analysis, you have 2 options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y,3,20))

In this instance, y is the width of the diamond, so anything 3 mm or above 20 mm is exlcuded.

I don’t recommend this option; just because there is one bad measurement doesn’t mean they are all bad.

Instead, I recommend replacing the unusual values with missing values

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values should not pass silently into the night.

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to suppress that warning you can use na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with missing values different to the observation with recorded values. For example, in the NYCflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So, you might want to compare the scheduled departure times for canelled and non-cancelled times.

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes (sched_dep_time)) + 
  geom_freqpoly(mapping = aes(color = cancelled), bindwidth = 1/4)
## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwidth = 1/4):
## Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers

What if we want to know what our outliers are?

First we need to load the required libraries.

library(outliers)
library(ggplot2)
library(readxl)

And reload the dataset because we removed outliers

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Lets create a function using the grubb test to identify all outliers. The grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution.

grubbs.flag <- function(x) {
#lets create a variable called outliers and save nothing in it, and add to the variable as we identify them
  outliers <- NULL
# we'll create a variable called test to identify which univariate we are testing
  test <- x
# now using the outliers package, use grubbs.test to find outliers in our variable
  grubbs.result <- grubbs.test(test)
# lets get the p values of all tested variables
  pv <- grubbs.result$p.value
# now lets search through our p values for ones that are outside of 0.5
  while(pv < 0.05) {
# anything with a p value less than 0.5, we add to our empty outliers vector
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3]))
# now we want to remove those outliers from our test variable 
    test <- x[!x %in% outliers]
# and run the grubbs test again without the outliers
    grubbs.result <- grubbs.test(test)
# and save the new p values
    pv <- grubbs.result$p.value
  }
return(data.frame(x=x, outliers = (x %in% outliers)))
  
}
#identified_outliers <- grubbs.flag(Air_data$set)

Now we can create a histogram where the outliers were

ggplot(grubbs.flag(Air_data$AH), aes(x = Air_data$AH, color = outliers, fill = outliers)) +
  geom_histogram(bindwidth = diff(range(Air_data$AH))) +
  theme_bw()
## Warning in geom_histogram(bindwidth = diff(range(Air_data$AH))): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Covariance

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) + 
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

Its hard to see the difference in distribution because the counts differ so much.

ggplot(diamonds)+
  geom_bar(mapping = aes(x = cut))

To make comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one.

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe thats because the frequency of polygons are a little hard to interpret.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

We see much less info about the distribution, but the boxplots are much more compact, so we can more easily compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average!

Lets look at some car data

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can switch the axis and flip it 90 degrees.

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))+
  coord_flip()

To visualize the correltion between two continuous variables, we can use a scatter plot.

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

Scatterplots become less useful as the size of your dataset grows, because we get overplot. We can fix this by using the alpha aesthetic

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

Exploratory Data Analysis (pt 1-5)

First lets load a required library

library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

College_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(College_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college?

alphabetical <- (College_Data) %>%
  arrange(College_Data$college)

The glimpse package is another way to preview data

glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset with the filter function

Louisiana_Cases <- filter(College_Data, state == "Louisiana")

Lets filter out a smaller amount of states

South_Cases <- filter(College_Data, state == "Louisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")

Lets look at some time series data

First we’ll load required libararies

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

Lets create group_by object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states starting reporting.

Days_since_first_reported <- tally(state_cases)

Lets visualize some data

First lets start with some definitions

Data - obvious - the stuff we want to visualize

Layer - made of geometric elements and requisite statistical info. Include geometric objects which represents the plot.

Scales = used to map values in the data space that is used for creation of values (color, size, shape, ect.)

Coordinate system - describes how the data coordinates are mapped together in relation to the plan on the graphing

Faceting - how to break up data in to subsets to display multiple types or groups of data

theme - controls the finer points of the display, such as font size and background color

options(repr.plot.width = 6, repr.plot.height = 6)

class(College_Data)
## [1] "data.frame"
head(College_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now lets take a look at a different dataset

iris <- as.data.frame(iris)

class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Lets start by creating a scatter plot of the college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  theme_minimal()

Lets color coordinate our college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Lets color coordinate iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  theme_minimal()

Lets run a simple histogram of our Louisiana Case Data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", main = "Total College Covid-19 Infections (Louisiana)")

Lets run a simple histogram for the iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(bindwidth = 100, color = "black", aes(fill = county)) +
  xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid-19 Cases in Louisiana")
## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lets create a ggplot for the iris data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + geom_histogram(bindiwth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")
## Warning in geom_histogram(bindiwth = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindiwth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Maybe a density plot makes more sense for our college data

ggplot(South_Cases) +
  geom_density(aes(x = cases, fill = state), alpha = 0.25)

Lets do it with the iris data

ggplot(iris) +
  geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

Lets look at violin plots for iris

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_violin() +
  theme_classic() +
  theme(legend.position = "none")

Now lets try the south data

ggplot(data = South_Cases, aes(x = state, y = cases, color = state)) +
  geom_violin() +
  theme_gray() +
  theme(legend.position = "none")

Now lets take a look at risidual plots. This is a graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner aroind the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed, a non linear model is more appropriate.

Lets start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
  geom_point(aes(x = .fitted, y = .resid))

Now lets look at the souther state cases

ggplot(lm(cases ~ cases_2021, data = South_Cases)) +
  geom_point(aes(x = .fitted, y = .resid))

A linear model is not a good call for the state cases

Now lets do some correlations

obesity <- read.csv("Obesity_insurance.csv")
library(tidyr)
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Lets look at the structure of the dataset

str(obesity)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Lets look at the column classes

class(obesity)
## [1] "data.frame"

And get a summary of distribution of variables

summary(obesity)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now lets look at the distribution for insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now lets look at correlations. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or 2 data frames. The cov() command on the other hand examines the covariance. The cor.test() command carries out a test as to the significance of the correlation

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

This test uses a spearman Rho correltation, or you can use Kendall’s tau by specifying it

cor(obesity$charges, obesity$bmi, method = "kendall")
## [1] 0.08252397

This correlation measures strength of a correlation between -1 and 1.

Noe lets look at the Tietjen = Moore test. This is used for univariate datasets. The algorithm depicts the detection of the outliers in a univariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n = length(dataSeries)
  # compute the absolute residuals
  r = abs(dataSeries - mean(dataSeries))
  # sort data according to size of residual
  df = data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  #create a subset of the data without the largest values
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  #Compuet the sums of squares.
  ksub = (subdataSeries = mean(subdataSeries))**2
  all = (df$dataSeries - mean(df$dataSeries))**2
  # compute the test statistic.
  sum(ksub)/sum(all)
}

This function helps to compute the absolute residuals and sorts the data according to the size of the residuals. Later, we will focus on the computation of sum of squares.

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  # Compute critical values based on simulation.
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
}

This function helps us to compute critical values based on simulation data. Now lets demonstrate these functions with simple data and obesity dataset for evaluating this algorithm.

The critical region for the Teirtjen-Moore is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and comuting the Tietjen-Moore test statistic. Typically, 10,000 random samples are used. The values of the Tietjen_Moore statistic obtained from the data is compared to this reference distribution. The values of the test stat is between zero and one. If there are no outliers in the data, the test stat is close to 1. Ig there are outliers, the test stat will be closer to 0. Thus, the test is always a lower, one-tailed test regardless of which test stat is used, Lk or Ek.

First we will look at charges

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 100)
## $T
## [1] 0.0005906641
## 
## $Talpha
##          50% 
## 3.166558e-07

Let check out bmi

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 7)
## $T
## [1] 0.01878681
## 
## $Talpha
##          50% 
## 2.634795e-07

Probability Plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probability plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine the probability of events. Or for example, you may wonder “what is the likelihood that a person has a BMI of exactly ___? In this case, you would need to retrieve the density of the BMI distribution at values 140. The BMI distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Lets create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) +
  geom_point()

This gives us the probability of every single point occurring

Now lets use the pnorm function for more info

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) +
  geom_point()

What if we want to find the probability of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)
## [1] "6.29 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.06287869

What about the probability that a bmi is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)),2), "%")
}

pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.9371213

What if we want to find the area in between?

pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Lets use the pnorm function. We need to assume a normal distribution for this.

What bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if you want a random sampling of values within your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-Wilk Test

so now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

You can see here, with a small sample size, we would reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size.

shapiro.test(obesity$charges[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16

Now lets check out age

shapiro.test(obesity$age[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16

And lastly bmi

shapiro.test(obesity$bmi[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

First lets load our packages

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(readxl)

#Air_data <- read_xlsx("AirQualityUCI.xlsx")

Date - date of measurement time - time of measurement CO(GT) - average hourly CO2 PT08,s1(CO) - tin oxide hourly average sensor response NMHC - average hourly non-metallic hydrocarbon concentration C6HC - average benzene concentration PT08.S3 (NMHC)- titania average hourly sensor response NOX - average hourly NOX conc. NO2 - average hourly NO2 conc. T - Temper RH - relative humidity AH - Absolute humidity

#str(Air_data)
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Lets get rid of the date in the time column

#Air_data$Time <- as_hms(Air_data$time)

#glimpse(Air_data)
#plot(Air_data$Ah, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

#t.test(Air_data$RH, Air_data$AH)

Text Mining (pt 1&2)

First we’ll look at the unnest_token function

Lets start by looking at an EMily Dickenson passage

text <- c("Because I could not stop from Death - ",
          "He kindly stopped for me - ",
          "The Carriage held but just Ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop from Death - "
## [2] "He kindly stopped for me - "           
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"

This is a typical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "Because I could not stop from Death - "
## 2     2 "He kindly stopped for me - "           
## 3     3 "The Carriage held but just Ourselves -"
## 4     4 "and Immortality"

Reminder: A tibble is a modern class of data frame within R. Its available in the dplyr and tibble packages, that has a convienent print method, will not convert strongs to factors, and does not use row names. Tibbles are great for use with tidy tools.

Next we will use the ‘unnest_tokens’ functions.

First we have the output column name that will be crerated as the text is unnested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeaustenr package to analysze some Jane Austen texts. There are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  dplyr::mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()

original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

To work with this as a tiday dataset, we need to restructure it n the on-token-per-row format, which as we saw earlier is done with the unnest_tokens() function

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word,text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function uses the tokenziers package to separate each line of text in the original dataframe into tokens.

The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.

Often in text analysis, we will want to remove stop words. Stop words are words that are NOT useful for an analysis. These include words like the, of, to, and, and so forth.

We can remove stop words (kept in the tidytext dataset ’stop_words) with an anti_join().

data(stop_words)

tidy_books <- tidy_books

The stop words dataset in the tidytext package contains stop words from three lexicons. We can use them all together, as we have three, or filter() to only use one set of stop words if thats more appropriate for your analysis.

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 14,520 × 2
##    word      n
##    <chr> <int>
##  1 the   26351
##  2 to    24044
##  3 and   22515
##  4 of    21178
##  5 a     13408
##  6 her   13055
##  7 i     12006
##  8 in    11217
##  9 was   11204
## 10 it    10234
## # ℹ 14,510 more rows

Beacuse we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most common words.

library(ggplot2)

tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL, x = "word count")

The gutenbergr package

This package provides access to the public domain works from the gutenberg project (www.gutenber.org). This package includes tools for both downloading books and a complete dataset of project gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenberg_download().

Word frequencies

Lets look at some biology texts, starting with Darwin

The voyage of the Beagle - 944 On the origin of species by the means of natural selection - 1228 The expression of emotions in man and animals - 1227 The decent of man, and selection in relation to sex - 2300

We can access these works susing the gutenberg_download() and the Project Gutenberg IDnumbers

library(gutenbergr)

darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
## Warning: ! Could not download a book at
##   http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.

Lets break these into tokens

tidy_darwin <- darwin %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

Lets check out what the most common darwin words are.

tidy_darwin %>%
  count(word, sort = TRUE)
## # A tibble: 22,315 × 2
##    word        n
##    <chr>   <int>
##  1 male     1620
##  2 species  1452
##  3 males    1286
##  4 female   1173
##  5 birds    1124
##  6 sexes    1062
##  7 animals  1013
##  8 females  1003
##  9 sexual    749
## 10 vol       712
## # ℹ 22,305 more rows

Now lets get some work from THomas Hunt Morgan, who is credited with discovering chromosomes.

Regeneration - 57198 The genetic and operative evidence relating to secondary sexual characteristics - 57460 Evolution and Adaptation - 63540

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")

Lets tokenize THM

tidy_morgan <- morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

What are THM’s most common words?

tidy_morgan %>%
  count(word, sort = TRUE)
## # A tibble: 13,855 × 2
##    word             n
##    <chr>        <int>
##  1 species        869
##  2 regeneration   814
##  3 piece          702
##  4 cut            669
##  5 male           668
##  6 forms          631
##  7 selection      604
##  8 cells          576
##  9 found          552
## 10 development    546
## # ℹ 13,845 more rows

Lastly, lets look at Thomas Henry Huxley

Evidence as to mans place in nature - 2931 On the reception of the Origin of Species - 2089 Evolution and Ethics, and other essays - 2940 Science and Culture, and other essays - 52344

huxley <- gutenberg_download(c(2931, 2940, 52344), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
## Warning: ! Could not download a book at
##   http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg//2/9/4/2940/2940.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
tidy_huxley <- huxley %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
tidy_huxley %>%
  count(word, sort = TRUE)
## # A tibble: 11,251 × 2
##    word          n
##    <chr>     <int>
##  1 knowledge   184
##  2 de          170
##  3 animals     155
##  4 animal      151
##  5 science     150
##  6 life        149
##  7 time        135
##  8 body        126
##  9 nature      111
## 10 physical    105
## # ℹ 11,241 more rows

Now lets calculate the frequency for each word for the works of Darwin, Morgan, and Huxley by binding the frames together.

library(tidyr)

frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                       mutate(tidy_darwin, author = "Charles Darwin"),
                       mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
   mutate(word = str_extract(word, "[a-z']+")) %>%
   count(author, word) %>%
   group_by(author) %>%
   mutate(proportion = n/ sum(n)) %>%
   select(-n) %>%
   pivot_wider(names_from = author, values_from = proportion) %>%
   pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")
   
frequency
## # A tibble: 87,084 × 3
##    word    author               proportion
##    <chr>   <chr>                     <dbl>
##  1 a       Thomas Hunt Morgan   0.000638  
##  2 a       Thomas Henry Huxley  0.0000163 
##  3 a       Charles Darwin       0.0000489 
##  4 ab      Thomas Hunt Morgan   0.0000512 
##  5 ab      Thomas Henry Huxley  0.0000186 
##  6 ab      Charles Darwin       0.00000466
##  7 abaiss  Thomas Hunt Morgan  NA         
##  8 abaiss  Thomas Henry Huxley NA         
##  9 abaiss  Charles Darwin       0.00000466
## 10 abandon Thomas Hunt Morgan   0.00000233
## # ℹ 87,074 more rows

Now we need to change the table so that each author has its own row

frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)

frequency2
## # A tibble: 29,028 × 4
##    word        `Thomas Hunt Morgan` `Thomas Henry Huxley` `Charles Darwin`
##    <chr>                      <dbl>                 <dbl>            <dbl>
##  1 a                     0.000638              0.0000163        0.0000489 
##  2 ab                    0.0000512             0.0000186        0.00000466
##  3 abaiss               NA                    NA                0.00000466
##  4 abandon               0.00000233           NA                0.00000233
##  5 abandoned             0.00000466            0.00000233       0.00000233
##  6 abashed              NA                    NA                0.00000233
##  7 abatement            NA                     0.00000233       0.00000233
##  8 abbot                NA                     0.00000466       0.00000233
##  9 abbott               NA                    NA                0.00000466
## 10 abbreviated          NA                    NA                0.00000931
## # ℹ 29,018 more rows

Now lets plot

library(tidyr)

frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                       mutate(tidy_darwin, author = "Charles Darwin"),
                       mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
  mutate(word = str_extract(word, "[a-z]+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n/ sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")

frequency
## # A tibble: 86,541 × 3
##    word    author               proportion
##    <chr>   <chr>                     <dbl>
##  1 a       Thomas Hunt Morgan   0.000638  
##  2 a       Thomas Henry Huxley  0.0000163 
##  3 a       Charles Darwin       0.0000489 
##  4 ab      Thomas Hunt Morgan   0.0000512 
##  5 ab      Thomas Henry Huxley  0.0000186 
##  6 ab      Charles Darwin       0.00000466
##  7 abaiss  Thomas Hunt Morgan  NA         
##  8 abaiss  Thomas Henry Huxley NA         
##  9 abaiss  Charles Darwin       0.00000466
## 10 abandon Thomas Hunt Morgan   0.00000233
## # ℹ 86,531 more rows

Sentiment Analysis (pt 1-3)

The sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.

AFFIN bing nrc

bing categorizes words in a binary fashion into positive or negative nrc categorizes into positive, negative, anger, anticpation, disgust, fear, joy, sadness, surprise and trust. AFFIN assigns a score between -5 and 5, with negative indicating negative sentiment, and 5 positive.

The function get_sentiments() allows us to get the specific sentiments with the measures for each one.

installed.packages("textdata")
##      Package LibPath Version Priority Depends Imports LinkingTo Suggests
##      Enhances License License_is_FOSS License_restricts_use OS_type Archs
##      MD5sum NeedsCompilation Built
library(tidytext)
library(textdata)

afinn <- read.csv("afinn.csv")

afinn
##                    word value
## 1               abandon    -2
## 2             abandoned    -2
## 3              abandons    -2
## 4              abducted    -2
## 5             abduction    -2
## 6            abductions    -2
## 7                 abhor    -3
## 8              abhorred    -3
## 9             abhorrent    -3
## 10               abhors    -3
## 11            abilities     2
## 12              ability     2
## 13               aboard     1
## 14             absentee    -1
## 15            absentees    -1
## 16              absolve     2
## 17             absolved     2
## 18             absolves     2
## 19            absolving     2
## 20             absorbed     1
## 21                abuse    -3
## 22               abused    -3
## 23               abuses    -3
## 24              abusive    -3
## 25               accept     1
## 26             accepted     1
## 27            accepting     1
## 28              accepts     1
## 29             accident    -2
## 30           accidental    -2
## 31         accidentally    -2
## 32            accidents    -2
## 33           accomplish     2
## 34         accomplished     2
## 35         accomplishes     2
## 36           accusation    -2
## 37          accusations    -2
## 38               accuse    -2
## 39              accused    -2
## 40              accuses    -2
## 41             accusing    -2
## 42                 ache    -2
## 43           achievable     1
## 44               aching    -2
## 45               acquit     2
## 46              acquits     2
## 47            acquitted     2
## 48           acquitting     2
## 49          acrimonious    -3
## 50               active     1
## 51             adequate     1
## 52               admire     3
## 53              admired     3
## 54              admires     3
## 55             admiring     3
## 56                admit    -1
## 57               admits    -1
## 58             admitted    -1
## 59             admonish    -2
## 60           admonished    -2
## 61                adopt     1
## 62               adopts     1
## 63             adorable     3
## 64                adore     3
## 65               adored     3
## 66               adores     3
## 67             advanced     1
## 68            advantage     2
## 69           advantages     2
## 70            adventure     2
## 71           adventures     2
## 72          adventurous     2
## 73             affected    -1
## 74            affection     3
## 75         affectionate     3
## 76            afflicted    -1
## 77            affronted    -1
## 78               afraid    -2
## 79            aggravate    -2
## 80           aggravated    -2
## 81           aggravates    -2
## 82          aggravating    -2
## 83           aggression    -2
## 84          aggressions    -2
## 85           aggressive    -2
## 86               aghast    -2
## 87                 agog     2
## 88              agonise    -3
## 89             agonised    -3
## 90             agonises    -3
## 91            agonising    -3
## 92              agonize    -3
## 93             agonized    -3
## 94             agonizes    -3
## 95            agonizing    -3
## 96                agree     1
## 97            agreeable     2
## 98               agreed     1
## 99            agreement     1
## 100              agrees     1
## 101               alarm    -2
## 102             alarmed    -2
## 103            alarmist    -2
## 104           alarmists    -2
## 105                alas    -1
## 106               alert    -1
## 107          alienation    -2
## 108               alive     1
## 109            allergic    -2
## 110               allow     1
## 111               alone    -2
## 112               amaze     2
## 113              amazed     2
## 114              amazes     2
## 115             amazing     4
## 116           ambitious     2
## 117          ambivalent    -1
## 118               amuse     3
## 119              amused     3
## 120           amusement     3
## 121          amusements     3
## 122               anger    -3
## 123              angers    -3
## 124               angry    -3
## 125             anguish    -3
## 126           anguished    -3
## 127           animosity    -2
## 128               annoy    -2
## 129           annoyance    -2
## 130             annoyed    -2
## 131            annoying    -2
## 132              annoys    -2
## 133        antagonistic    -2
## 134                anti    -1
## 135        anticipation     1
## 136             anxiety    -2
## 137             anxious    -2
## 138           apathetic    -3
## 139              apathy    -3
## 140             apeshit    -3
## 141         apocalyptic    -2
## 142           apologise    -1
## 143          apologised    -1
## 144          apologises    -1
## 145         apologising    -1
## 146           apologize    -1
## 147          apologized    -1
## 148          apologizes    -1
## 149         apologizing    -1
## 150             apology    -1
## 151            appalled    -2
## 152           appalling    -2
## 153             appease     2
## 154            appeased     2
## 155            appeases     2
## 156           appeasing     2
## 157             applaud     2
## 158           applauded     2
## 159          applauding     2
## 160            applauds     2
## 161            applause     2
## 162          appreciate     2
## 163         appreciated     2
## 164         appreciates     2
## 165        appreciating     2
## 166        appreciation     2
## 167        apprehensive    -2
## 168            approval     2
## 169            approved     2
## 170            approves     2
## 171              ardent     1
## 172              arrest    -2
## 173            arrested    -3
## 174             arrests    -2
## 175            arrogant    -2
## 176              ashame    -2
## 177             ashamed    -2
## 178                 ass    -4
## 179       assassination    -3
## 180      assassinations    -3
## 181               asset     2
## 182              assets     2
## 183          assfucking    -4
## 184             asshole    -4
## 185          astonished     2
## 186             astound     3
## 187           astounded     3
## 188          astounding     3
## 189        astoundingly     3
## 190            astounds     3
## 191              attack    -1
## 192            attacked    -1
## 193           attacking    -1
## 194             attacks    -1
## 195             attract     1
## 196           attracted     1
## 197          attracting     2
## 198          attraction     2
## 199         attractions     2
## 200            attracts     1
## 201           audacious     3
## 202           authority     1
## 203               avert    -1
## 204             averted    -1
## 205              averts    -1
## 206                avid     2
## 207               avoid    -1
## 208             avoided    -1
## 209              avoids    -1
## 210               await    -1
## 211             awaited    -1
## 212              awaits    -1
## 213               award     3
## 214             awarded     3
## 215              awards     3
## 216             awesome     4
## 217               awful    -3
## 218             awkward    -2
## 219                 axe    -1
## 220                axed    -1
## 221              backed     1
## 222             backing     2
## 223               backs     1
## 224                 bad    -3
## 225              badass    -3
## 226               badly    -3
## 227             bailout    -2
## 228           bamboozle    -2
## 229          bamboozled    -2
## 230          bamboozles    -2
## 231                 ban    -2
## 232              banish    -1
## 233            bankrupt    -3
## 234            bankster    -3
## 235              banned    -2
## 236             bargain     2
## 237             barrier    -2
## 238             bastard    -5
## 239            bastards    -5
## 240              battle    -1
## 241             battles    -1
## 242              beaten    -2
## 243            beatific     3
## 244             beating    -1
## 245            beauties     3
## 246           beautiful     3
## 247         beautifully     3
## 248            beautify     3
## 249            belittle    -2
## 250           belittled    -2
## 251             beloved     3
## 252             benefit     2
## 253            benefits     2
## 254          benefitted     2
## 255         benefitting     2
## 256             bereave    -2
## 257            bereaved    -2
## 258            bereaves    -2
## 259           bereaving    -2
## 260                best     3
## 261              betray    -3
## 262            betrayal    -3
## 263            betrayed    -3
## 264           betraying    -3
## 265             betrays    -3
## 266              better     2
## 267                bias    -1
## 268              biased    -2
## 269                 big     1
## 270               bitch    -5
## 271             bitches    -5
## 272              bitter    -2
## 273            bitterly    -2
## 274             bizarre    -2
## 275                blah    -2
## 276               blame    -2
## 277              blamed    -2
## 278              blames    -2
## 279             blaming    -2
## 280               bless     2
## 281             blesses     2
## 282            blessing     3
## 283               blind    -1
## 284               bliss     3
## 285            blissful     3
## 286              blithe     2
## 287               block    -1
## 288         blockbuster     3
## 289             blocked    -1
## 290            blocking    -1
## 291              blocks    -1
## 292              bloody    -3
## 293              blurry    -2
## 294            boastful    -2
## 295                bold     2
## 296              boldly     2
## 297                bomb    -1
## 298               boost     1
## 299             boosted     1
## 300            boosting     1
## 301              boosts     1
## 302                bore    -2
## 303               bored    -2
## 304              boring    -3
## 305              bother    -2
## 306            bothered    -2
## 307             bothers    -2
## 308          bothersome    -2
## 309             boycott    -2
## 310           boycotted    -2
## 311          boycotting    -2
## 312            boycotts    -2
## 313        brainwashing    -3
## 314               brave     2
## 315        breakthrough     3
## 316        breathtaking     5
## 317               bribe    -3
## 318              bright     1
## 319           brightest     2
## 320          brightness     1
## 321           brilliant     4
## 322               brisk     2
## 323               broke    -1
## 324              broken    -1
## 325            brooding    -2
## 326             bullied    -2
## 327            bullshit    -4
## 328               bully    -2
## 329            bullying    -2
## 330              bummer    -2
## 331             buoyant     2
## 332              burden    -2
## 333            burdened    -2
## 334           burdening    -2
## 335             burdens    -2
## 336                calm     2
## 337              calmed     2
## 338             calming     2
## 339               calms     2
## 340         can't stand    -3
## 341              cancel    -1
## 342           cancelled    -1
## 343          cancelling    -1
## 344             cancels    -1
## 345              cancer    -1
## 346             capable     1
## 347          captivated     3
## 348                care     2
## 349            carefree     1
## 350             careful     2
## 351           carefully     2
## 352            careless    -2
## 353               cares     2
## 354          cashing in    -2
## 355            casualty    -2
## 356         catastrophe    -3
## 357        catastrophic    -4
## 358            cautious    -1
## 359           celebrate     3
## 360          celebrated     3
## 361          celebrates     3
## 362         celebrating     3
## 363              censor    -2
## 364            censored    -2
## 365             censors    -2
## 366             certain     1
## 367             chagrin    -2
## 368           chagrined    -2
## 369           challenge    -1
## 370              chance     2
## 371             chances     2
## 372               chaos    -2
## 373             chaotic    -2
## 374             charged    -3
## 375             charges    -2
## 376               charm     3
## 377            charming     3
## 378           charmless    -3
## 379            chastise    -3
## 380           chastised    -3
## 381           chastises    -3
## 382          chastising    -3
## 383               cheat    -3
## 384             cheated    -3
## 385             cheater    -3
## 386            cheaters    -3
## 387              cheats    -3
## 388               cheer     2
## 389             cheered     2
## 390            cheerful     2
## 391            cheering     2
## 392           cheerless    -2
## 393              cheers     2
## 394              cheery     3
## 395             cherish     2
## 396           cherished     2
## 397           cherishes     2
## 398          cherishing     2
## 399                chic     2
## 400            childish    -2
## 401            chilling    -1
## 402               choke    -2
## 403              choked    -2
## 404              chokes    -2
## 405             choking    -2
## 406           clarifies     2
## 407             clarity     2
## 408               clash    -2
## 409              classy     3
## 410               clean     2
## 411             cleaner     2
## 412               clear     1
## 413             cleared     1
## 414             clearly     1
## 415              clears     1
## 416              clever     2
## 417             clouded    -1
## 418            clueless    -2
## 419                cock    -5
## 420          cocksucker    -5
## 421         cocksuckers    -5
## 422               cocky    -2
## 423             coerced    -2
## 424            collapse    -2
## 425           collapsed    -2
## 426           collapses    -2
## 427          collapsing    -2
## 428             collide    -1
## 429            collides    -1
## 430           colliding    -1
## 431           collision    -2
## 432          collisions    -2
## 433           colluding    -3
## 434              combat    -1
## 435             combats    -1
## 436              comedy     1
## 437             comfort     2
## 438         comfortable     2
## 439          comforting     2
## 440            comforts     2
## 441             commend     2
## 442           commended     2
## 443              commit     1
## 444          commitment     2
## 445             commits     1
## 446           committed     1
## 447          committing     1
## 448       compassionate     2
## 449           compelled     1
## 450           competent     2
## 451         competitive     2
## 452          complacent    -2
## 453            complain    -2
## 454          complained    -2
## 455           complains    -2
## 456       comprehensive     2
## 457          conciliate     2
## 458         conciliated     2
## 459         conciliates     2
## 460        conciliating     2
## 461             condemn    -2
## 462        condemnation    -2
## 463           condemned    -2
## 464            condemns    -2
## 465          confidence     2
## 466           confident     2
## 467            conflict    -2
## 468         conflicting    -2
## 469         conflictive    -2
## 470           conflicts    -2
## 471             confuse    -2
## 472            confused    -2
## 473           confusing    -2
## 474            congrats     2
## 475        congratulate     2
## 476      congratulation     2
## 477     congratulations     2
## 478             consent     2
## 479            consents     2
## 480          consolable     2
## 481          conspiracy    -3
## 482         constrained    -2
## 483           contagion    -2
## 484          contagions    -2
## 485          contagious    -1
## 486            contempt    -2
## 487        contemptuous    -2
## 488      contemptuously    -2
## 489             contend    -1
## 490           contender    -1
## 491          contending    -1
## 492         contentious    -2
## 493         contestable    -2
## 494       controversial    -2
## 495     controversially    -2
## 496            convince     1
## 497           convinced     1
## 498           convinces     1
## 499           convivial     2
## 500                cool     1
## 501          cool stuff     3
## 502            cornered    -2
## 503              corpse    -1
## 504              costly    -2
## 505             courage     2
## 506          courageous     2
## 507           courteous     2
## 508            courtesy     2
## 509            cover-up    -3
## 510              coward    -2
## 511            cowardly    -2
## 512            coziness     2
## 513               cramp    -1
## 514                crap    -3
## 515               crash    -2
## 516             crazier    -2
## 517            craziest    -2
## 518               crazy    -2
## 519            creative     2
## 520         crestfallen    -2
## 521               cried    -2
## 522               cries    -2
## 523               crime    -3
## 524            criminal    -3
## 525           criminals    -3
## 526              crisis    -3
## 527              critic    -2
## 528           criticism    -2
## 529           criticize    -2
## 530          criticized    -2
## 531          criticizes    -2
## 532         criticizing    -2
## 533             critics    -2
## 534               cruel    -3
## 535             cruelty    -3
## 536               crush    -1
## 537             crushed    -2
## 538             crushes    -1
## 539            crushing    -1
## 540                 cry    -1
## 541              crying    -2
## 542                cunt    -5
## 543             curious     1
## 544               curse    -1
## 545                 cut    -1
## 546                cute     2
## 547                cuts    -1
## 548             cutting    -1
## 549               cynic    -2
## 550             cynical    -2
## 551            cynicism    -2
## 552              damage    -3
## 553             damages    -3
## 554                damn    -4
## 555              damned    -4
## 556              damnit    -4
## 557              danger    -2
## 558           daredevil     2
## 559              daring     2
## 560             darkest    -2
## 561            darkness    -1
## 562           dauntless     2
## 563                dead    -3
## 564            deadlock    -2
## 565           deafening    -1
## 566                dear     2
## 567              dearly     3
## 568               death    -2
## 569            debonair     2
## 570                debt    -2
## 571              deceit    -3
## 572           deceitful    -3
## 573             deceive    -3
## 574            deceived    -3
## 575            deceives    -3
## 576           deceiving    -3
## 577           deception    -3
## 578            decisive     1
## 579           dedicated     2
## 580            defeated    -2
## 581              defect    -3
## 582             defects    -3
## 583            defender     2
## 584           defenders     2
## 585         defenseless    -2
## 586               defer    -1
## 587           deferring    -1
## 588             defiant    -1
## 589             deficit    -2
## 590             degrade    -2
## 591            degraded    -2
## 592            degrades    -2
## 593          dehumanize    -2
## 594         dehumanized    -2
## 595         dehumanizes    -2
## 596        dehumanizing    -2
## 597              deject    -2
## 598            dejected    -2
## 599           dejecting    -2
## 600             dejects    -2
## 601               delay    -1
## 602             delayed    -1
## 603             delight     3
## 604           delighted     3
## 605          delighting     3
## 606            delights     3
## 607              demand    -1
## 608            demanded    -1
## 609           demanding    -1
## 610             demands    -1
## 611       demonstration    -1
## 612         demoralized    -2
## 613              denied    -2
## 614              denier    -2
## 615             deniers    -2
## 616              denies    -2
## 617            denounce    -2
## 618           denounces    -2
## 619                deny    -2
## 620             denying    -2
## 621           depressed    -2
## 622          depressing    -2
## 623              derail    -2
## 624            derailed    -2
## 625             derails    -2
## 626              deride    -2
## 627             derided    -2
## 628             derides    -2
## 629            deriding    -2
## 630            derision    -2
## 631           desirable     2
## 632              desire     1
## 633             desired     2
## 634            desirous     2
## 635             despair    -3
## 636          despairing    -3
## 637            despairs    -3
## 638           desperate    -3
## 639         desperately    -3
## 640          despondent    -3
## 641             destroy    -3
## 642           destroyed    -3
## 643          destroying    -3
## 644            destroys    -3
## 645         destruction    -3
## 646         destructive    -3
## 647            detached    -1
## 648              detain    -2
## 649            detained    -2
## 650           detention    -2
## 651          determined     2
## 652           devastate    -2
## 653          devastated    -2
## 654         devastating    -2
## 655             devoted     3
## 656             diamond     1
## 657                dick    -4
## 658            dickhead    -4
## 659                 die    -3
## 660                died    -3
## 661           difficult    -1
## 662           diffident    -2
## 663             dilemma    -1
## 664             dipshit    -3
## 665                dire    -3
## 666             direful    -3
## 667                dirt    -2
## 668             dirtier    -2
## 669            dirtiest    -2
## 670               dirty    -2
## 671           disabling    -1
## 672        disadvantage    -2
## 673       disadvantaged    -2
## 674           disappear    -1
## 675         disappeared    -1
## 676          disappears    -1
## 677          disappoint    -2
## 678        disappointed    -2
## 679       disappointing    -2
## 680      disappointment    -2
## 681     disappointments    -2
## 682         disappoints    -2
## 683            disaster    -2
## 684           disasters    -2
## 685          disastrous    -3
## 686          disbelieve    -2
## 687             discard    -1
## 688           discarded    -1
## 689          discarding    -1
## 690            discards    -1
## 691        disconsolate    -2
## 692      disconsolation    -2
## 693        discontented    -2
## 694             discord    -2
## 695          discounted    -1
## 696         discouraged    -2
## 697         discredited    -2
## 698             disdain    -2
## 699            disgrace    -2
## 700           disgraced    -2
## 701            disguise    -1
## 702           disguised    -1
## 703           disguises    -1
## 704          disguising    -1
## 705             disgust    -3
## 706           disgusted    -3
## 707          disgusting    -3
## 708        disheartened    -2
## 709           dishonest    -2
## 710       disillusioned    -2
## 711         disinclined    -2
## 712          disjointed    -2
## 713             dislike    -2
## 714              dismal    -2
## 715            dismayed    -2
## 716            disorder    -2
## 717        disorganized    -2
## 718         disoriented    -2
## 719           disparage    -2
## 720          disparaged    -2
## 721          disparages    -2
## 722         disparaging    -2
## 723          displeased    -2
## 724             dispute    -2
## 725            disputed    -2
## 726            disputes    -2
## 727           disputing    -2
## 728        disqualified    -2
## 729            disquiet    -2
## 730           disregard    -2
## 731         disregarded    -2
## 732        disregarding    -2
## 733          disregards    -2
## 734          disrespect    -2
## 735        disrespected    -2
## 736          disruption    -2
## 737         disruptions    -2
## 738          disruptive    -2
## 739        dissatisfied    -2
## 740             distort    -2
## 741           distorted    -2
## 742          distorting    -2
## 743            distorts    -2
## 744            distract    -2
## 745          distracted    -2
## 746         distraction    -2
## 747           distracts    -2
## 748            distress    -2
## 749          distressed    -2
## 750          distresses    -2
## 751         distressing    -2
## 752            distrust    -3
## 753         distrustful    -3
## 754             disturb    -2
## 755           disturbed    -2
## 756          disturbing    -2
## 757            disturbs    -2
## 758           dithering    -2
## 759               dizzy    -1
## 760             dodging    -2
## 761               dodgy    -2
## 762       does not work    -3
## 763            dolorous    -2
## 764           dont like    -2
## 765                doom    -2
## 766              doomed    -2
## 767               doubt    -1
## 768             doubted    -1
## 769            doubtful    -1
## 770            doubting    -1
## 771              doubts    -1
## 772              douche    -3
## 773           douchebag    -3
## 774            downcast    -2
## 775         downhearted    -2
## 776            downside    -2
## 777                drag    -1
## 778             dragged    -1
## 779               drags    -1
## 780             drained    -2
## 781               dread    -2
## 782             dreaded    -2
## 783            dreadful    -3
## 784            dreading    -2
## 785               dream     1
## 786              dreams     1
## 787              dreary    -2
## 788              droopy    -2
## 789                drop    -1
## 790               drown    -2
## 791             drowned    -2
## 792              drowns    -2
## 793               drunk    -2
## 794             dubious    -2
## 795                 dud    -2
## 796                dull    -2
## 797                dumb    -3
## 798             dumbass    -3
## 799                dump    -1
## 800              dumped    -2
## 801               dumps    -1
## 802                dupe    -2
## 803               duped    -2
## 804         dysfunction    -2
## 805               eager     2
## 806             earnest     2
## 807                ease     2
## 808                easy     1
## 809            ecstatic     4
## 810               eerie    -2
## 811                eery    -2
## 812           effective     2
## 813         effectively     2
## 814              elated     3
## 815             elation     3
## 816             elegant     2
## 817           elegantly     2
## 818           embarrass    -2
## 819         embarrassed    -2
## 820         embarrasses    -2
## 821        embarrassing    -2
## 822       embarrassment    -2
## 823          embittered    -2
## 824             embrace     1
## 825           emergency    -2
## 826          empathetic     2
## 827           emptiness    -1
## 828               empty    -1
## 829           enchanted     2
## 830           encourage     2
## 831          encouraged     2
## 832       encouragement     2
## 833          encourages     2
## 834             endorse     2
## 835            endorsed     2
## 836         endorsement     2
## 837            endorses     2
## 838             enemies    -2
## 839               enemy    -2
## 840           energetic     2
## 841              engage     1
## 842             engages     1
## 843           engrossed     1
## 844               enjoy     2
## 845            enjoying     2
## 846              enjoys     2
## 847           enlighten     2
## 848         enlightened     2
## 849        enlightening     2
## 850          enlightens     2
## 851               ennui    -2
## 852              enrage    -2
## 853             enraged    -2
## 854             enrages    -2
## 855            enraging    -2
## 856           enrapture     3
## 857             enslave    -2
## 858            enslaved    -2
## 859            enslaves    -2
## 860              ensure     1
## 861            ensuring     1
## 862        enterprising     1
## 863        entertaining     2
## 864             enthral     3
## 865        enthusiastic     3
## 866            entitled     1
## 867           entrusted     2
## 868              envies    -1
## 869             envious    -2
## 870                envy    -1
## 871             envying    -1
## 872           erroneous    -2
## 873               error    -2
## 874              errors    -2
## 875              escape    -1
## 876             escapes    -1
## 877            escaping    -1
## 878            esteemed     2
## 879             ethical     2
## 880            euphoria     3
## 881            euphoric     4
## 882            eviction    -1
## 883                evil    -3
## 884          exaggerate    -2
## 885         exaggerated    -2
## 886         exaggerates    -2
## 887        exaggerating    -2
## 888         exasperated     2
## 889          excellence     3
## 890           excellent     3
## 891              excite     3
## 892             excited     3
## 893          excitement     3
## 894            exciting     3
## 895             exclude    -1
## 896            excluded    -2
## 897           exclusion    -1
## 898           exclusive     2
## 899              excuse    -1
## 900              exempt    -1
## 901           exhausted    -2
## 902         exhilarated     3
## 903         exhilarates     3
## 904        exhilarating     3
## 905           exonerate     2
## 906          exonerated     2
## 907          exonerates     2
## 908         exonerating     2
## 909              expand     1
## 910             expands     1
## 911               expel    -2
## 912            expelled    -2
## 913           expelling    -2
## 914              expels    -2
## 915             exploit    -2
## 916           exploited    -2
## 917          exploiting    -2
## 918            exploits    -2
## 919         exploration     1
## 920        explorations     1
## 921              expose    -1
## 922             exposed    -1
## 923             exposes    -1
## 924            exposing    -1
## 925              extend     1
## 926             extends     1
## 927           exuberant     4
## 928            exultant     3
## 929          exultantly     3
## 930            fabulous     4
## 931                 fad    -2
## 932                 fag    -3
## 933              faggot    -3
## 934             faggots    -3
## 935                fail    -2
## 936              failed    -2
## 937             failing    -2
## 938               fails    -2
## 939             failure    -2
## 940            failures    -2
## 941        fainthearted    -2
## 942                fair     2
## 943               faith     1
## 944            faithful     3
## 945                fake    -3
## 946               fakes    -3
## 947              faking    -3
## 948              fallen    -2
## 949             falling    -1
## 950           falsified    -3
## 951             falsify    -3
## 952                fame     1
## 953                 fan     3
## 954           fantastic     4
## 955               farce    -1
## 956           fascinate     3
## 957          fascinated     3
## 958          fascinates     3
## 959         fascinating     3
## 960             fascist    -2
## 961            fascists    -2
## 962          fatalities    -3
## 963            fatality    -3
## 964             fatigue    -2
## 965            fatigued    -2
## 966            fatigues    -2
## 967           fatiguing    -2
## 968               favor     2
## 969             favored     2
## 970            favorite     2
## 971           favorited     2
## 972           favorites     2
## 973              favors     2
## 974                fear    -2
## 975             fearful    -2
## 976             fearing    -2
## 977            fearless     2
## 978            fearsome    -2
## 979              fed up    -3
## 980              feeble    -2
## 981             feeling     1
## 982            felonies    -3
## 983              felony    -3
## 984             fervent     2
## 985              fervid     2
## 986             festive     2
## 987              fiasco    -3
## 988             fidgety    -2
## 989               fight    -1
## 990                fine     2
## 991                fire    -2
## 992               fired    -2
## 993              firing    -2
## 994                 fit     1
## 995             fitness     1
## 996            flagship     2
## 997               flees    -1
## 998                flop    -2
## 999               flops    -2
## 1000                flu    -2
## 1001          flustered    -2
## 1002            focused     2
## 1003               fond     2
## 1004           fondness     2
## 1005               fool    -2
## 1006            foolish    -2
## 1007              fools    -2
## 1008             forced    -1
## 1009        foreclosure    -2
## 1010       foreclosures    -2
## 1011             forget    -1
## 1012          forgetful    -2
## 1013            forgive     1
## 1014          forgiving     1
## 1015          forgotten    -1
## 1016          fortunate     2
## 1017            frantic    -1
## 1018              fraud    -4
## 1019             frauds    -4
## 1020          fraudster    -4
## 1021         fraudsters    -4
## 1022        fraudulence    -4
## 1023         fraudulent    -4
## 1024               free     1
## 1025            freedom     2
## 1026             frenzy    -3
## 1027              fresh     1
## 1028           friendly     2
## 1029             fright    -2
## 1030         frightened    -2
## 1031        frightening    -3
## 1032             frikin    -2
## 1033             frisky     2
## 1034           frowning    -1
## 1035          frustrate    -2
## 1036         frustrated    -2
## 1037         frustrates    -2
## 1038        frustrating    -2
## 1039        frustration    -2
## 1040                ftw     3
## 1041               fuck    -4
## 1042             fucked    -4
## 1043             fucker    -4
## 1044            fuckers    -4
## 1045           fuckface    -4
## 1046           fuckhead    -4
## 1047            fucking    -4
## 1048           fucktard    -4
## 1049                fud    -3
## 1050              fuked    -4
## 1051             fuking    -4
## 1052            fulfill     2
## 1053          fulfilled     2
## 1054           fulfills     2
## 1055             fuming    -2
## 1056                fun     4
## 1057            funeral    -1
## 1058           funerals    -1
## 1059              funky     2
## 1060            funnier     4
## 1061              funny     4
## 1062            furious    -3
## 1063             futile     2
## 1064                gag    -2
## 1065             gagged    -2
## 1066               gain     2
## 1067             gained     2
## 1068            gaining     2
## 1069              gains     2
## 1070            gallant     3
## 1071          gallantly     3
## 1072          gallantry     3
## 1073           generous     2
## 1074             genial     3
## 1075              ghost    -1
## 1076              giddy    -2
## 1077               gift     2
## 1078               glad     3
## 1079          glamorous     3
## 1080         glamourous     3
## 1081               glee     3
## 1082            gleeful     3
## 1083              gloom    -1
## 1084             gloomy    -2
## 1085           glorious     2
## 1086              glory     2
## 1087               glum    -2
## 1088                god     1
## 1089            goddamn    -3
## 1090            godsend     4
## 1091               good     3
## 1092           goodness     3
## 1093              grace     1
## 1094           gracious     3
## 1095              grand     3
## 1096              grant     1
## 1097            granted     1
## 1098           granting     1
## 1099             grants     1
## 1100           grateful     3
## 1101      gratification     2
## 1102              grave    -2
## 1103               gray    -1
## 1104              great     3
## 1105            greater     3
## 1106           greatest     3
## 1107              greed    -3
## 1108             greedy    -2
## 1109         green wash    -3
## 1110      green washing    -3
## 1111          greenwash    -3
## 1112        greenwasher    -3
## 1113       greenwashers    -3
## 1114       greenwashing    -3
## 1115              greet     1
## 1116            greeted     1
## 1117           greeting     1
## 1118          greetings     2
## 1119             greets     1
## 1120               grey    -1
## 1121              grief    -2
## 1122            grieved    -2
## 1123              gross    -2
## 1124            growing     1
## 1125             growth     2
## 1126          guarantee     1
## 1127              guilt    -3
## 1128             guilty    -3
## 1129        gullibility    -2
## 1130           gullible    -2
## 1131                gun    -1
## 1132                 ha     2
## 1133             hacked    -1
## 1134               haha     3
## 1135             hahaha     3
## 1136            hahahah     3
## 1137               hail     2
## 1138             hailed     2
## 1139            hapless    -2
## 1140        haplessness    -2
## 1141          happiness     3
## 1142              happy     3
## 1143               hard    -1
## 1144            hardier     2
## 1145           hardship    -2
## 1146              hardy     2
## 1147               harm    -2
## 1148             harmed    -2
## 1149            harmful    -2
## 1150            harming    -2
## 1151              harms    -2
## 1152            harried    -2
## 1153              harsh    -2
## 1154            harsher    -2
## 1155           harshest    -2
## 1156               hate    -3
## 1157              hated    -3
## 1158             haters    -3
## 1159              hates    -3
## 1160             hating    -3
## 1161              haunt    -1
## 1162            haunted    -2
## 1163           haunting     1
## 1164             haunts    -1
## 1165              havoc    -2
## 1166            healthy     2
## 1167      heartbreaking    -3
## 1168        heartbroken    -3
## 1169          heartfelt     3
## 1170             heaven     2
## 1171           heavenly     4
## 1172       heavyhearted    -2
## 1173               hell    -4
## 1174               help     2
## 1175            helpful     2
## 1176            helping     2
## 1177           helpless    -2
## 1178              helps     2
## 1179               hero     2
## 1180             heroes     2
## 1181             heroic     3
## 1182           hesitant    -2
## 1183           hesitate    -2
## 1184                hid    -1
## 1185               hide    -1
## 1186              hides    -1
## 1187             hiding    -1
## 1188          highlight     2
## 1189          hilarious     2
## 1190          hindrance    -2
## 1191               hoax    -2
## 1192           homesick    -2
## 1193             honest     2
## 1194              honor     2
## 1195            honored     2
## 1196           honoring     2
## 1197             honour     2
## 1198           honoured     2
## 1199          honouring     2
## 1200           hooligan    -2
## 1201        hooliganism    -2
## 1202          hooligans    -2
## 1203               hope     2
## 1204            hopeful     2
## 1205          hopefully     2
## 1206           hopeless    -2
## 1207       hopelessness    -2
## 1208              hopes     2
## 1209             hoping     2
## 1210         horrendous    -3
## 1211           horrible    -3
## 1212           horrific    -3
## 1213          horrified    -3
## 1214            hostile    -2
## 1215           huckster    -2
## 1216                hug     2
## 1217               huge     1
## 1218               hugs     2
## 1219           humerous     3
## 1220         humiliated    -3
## 1221        humiliation    -3
## 1222              humor     2
## 1223           humorous     2
## 1224             humour     2
## 1225          humourous     2
## 1226             hunger    -2
## 1227             hurrah     5
## 1228               hurt    -2
## 1229            hurting    -2
## 1230              hurts    -2
## 1231       hypocritical    -2
## 1232           hysteria    -3
## 1233         hysterical    -3
## 1234          hysterics    -3
## 1235              idiot    -3
## 1236            idiotic    -3
## 1237          ignorance    -2
## 1238           ignorant    -2
## 1239             ignore    -1
## 1240            ignored    -2
## 1241            ignores    -1
## 1242                ill    -2
## 1243            illegal    -3
## 1244         illiteracy    -2
## 1245            illness    -2
## 1246          illnesses    -2
## 1247           imbecile    -3
## 1248        immobilized    -1
## 1249           immortal     2
## 1250             immune     1
## 1251          impatient    -2
## 1252          imperfect    -2
## 1253         importance     2
## 1254          important     2
## 1255             impose    -1
## 1256            imposed    -1
## 1257            imposes    -1
## 1258           imposing    -1
## 1259           impotent    -2
## 1260            impress     3
## 1261          impressed     3
## 1262          impresses     3
## 1263         impressive     3
## 1264         imprisoned    -2
## 1265            improve     2
## 1266           improved     2
## 1267        improvement     2
## 1268           improves     2
## 1269          improving     2
## 1270          inability    -2
## 1271           inaction    -2
## 1272         inadequate    -2
## 1273          incapable    -2
## 1274      incapacitated    -2
## 1275           incensed    -2
## 1276       incompetence    -2
## 1277        incompetent    -2
## 1278      inconsiderate    -2
## 1279      inconvenience    -2
## 1280       inconvenient    -2
## 1281           increase     1
## 1282          increased     1
## 1283         indecisive    -2
## 1284     indestructible     2
## 1285       indifference    -2
## 1286        indifferent    -2
## 1287          indignant    -2
## 1288        indignation    -2
## 1289       indoctrinate    -2
## 1290      indoctrinated    -2
## 1291      indoctrinates    -2
## 1292     indoctrinating    -2
## 1293        ineffective    -2
## 1294      ineffectively    -2
## 1295         infatuated     2
## 1296        infatuation     2
## 1297           infected    -2
## 1298           inferior    -2
## 1299           inflamed    -2
## 1300        influential     2
## 1301       infringement    -2
## 1302          infuriate    -2
## 1303         infuriated    -2
## 1304         infuriates    -2
## 1305        infuriating    -2
## 1306            inhibit    -1
## 1307            injured    -2
## 1308             injury    -2
## 1309          injustice    -2
## 1310           innovate     1
## 1311          innovates     1
## 1312         innovation     1
## 1313         innovative     2
## 1314        inquisition    -2
## 1315        inquisitive     2
## 1316             insane    -2
## 1317           insanity    -2
## 1318           insecure    -2
## 1319        insensitive    -2
## 1320      insensitivity    -2
## 1321      insignificant    -2
## 1322            insipid    -2
## 1323        inspiration     2
## 1324      inspirational     2
## 1325            inspire     2
## 1326           inspired     2
## 1327           inspires     2
## 1328          inspiring     3
## 1329             insult    -2
## 1330           insulted    -2
## 1331          insulting    -2
## 1332            insults    -2
## 1333             intact     2
## 1334          integrity     2
## 1335        intelligent     2
## 1336            intense     1
## 1337           interest     1
## 1338         interested     2
## 1339        interesting     2
## 1340          interests     1
## 1341       interrogated    -2
## 1342          interrupt    -2
## 1343        interrupted    -2
## 1344       interrupting    -2
## 1345       interruption    -2
## 1346         interrupts    -2
## 1347         intimidate    -2
## 1348        intimidated    -2
## 1349        intimidates    -2
## 1350       intimidating    -2
## 1351       intimidation    -2
## 1352          intricate     2
## 1353          intrigues     1
## 1354         invincible     2
## 1355             invite     1
## 1356           inviting     1
## 1357       invulnerable     2
## 1358              irate    -3
## 1359             ironic    -1
## 1360              irony    -1
## 1361         irrational    -1
## 1362       irresistible     2
## 1363         irresolute    -2
## 1364      irresponsible     2
## 1365       irreversible    -1
## 1366           irritate    -3
## 1367          irritated    -3
## 1368         irritating    -3
## 1369           isolated    -1
## 1370              itchy    -2
## 1371            jackass    -4
## 1372          jackasses    -4
## 1373             jailed    -2
## 1374             jaunty     2
## 1375            jealous    -2
## 1376           jeopardy    -2
## 1377               jerk    -3
## 1378              jesus     1
## 1379              jewel     1
## 1380             jewels     1
## 1381            jocular     2
## 1382               join     1
## 1383               joke     2
## 1384              jokes     2
## 1385              jolly     2
## 1386             jovial     2
## 1387                joy     3
## 1388             joyful     3
## 1389           joyfully     3
## 1390            joyless    -2
## 1391             joyous     3
## 1392           jubilant     3
## 1393              jumpy    -1
## 1394            justice     2
## 1395        justifiably     2
## 1396          justified     2
## 1397               keen     1
## 1398               kill    -3
## 1399             killed    -3
## 1400            killing    -3
## 1401              kills    -3
## 1402               kind     2
## 1403             kinder     2
## 1404               kiss     2
## 1405              kudos     3
## 1406               lack    -2
## 1407      lackadaisical    -2
## 1408                lag    -1
## 1409             lagged    -2
## 1410            lagging    -2
## 1411               lags    -2
## 1412               lame    -2
## 1413           landmark     2
## 1414              laugh     1
## 1415            laughed     1
## 1416           laughing     1
## 1417             laughs     1
## 1418          laughting     1
## 1419           launched     1
## 1420               lawl     3
## 1421            lawsuit    -2
## 1422           lawsuits    -2
## 1423               lazy    -1
## 1424               leak    -1
## 1425             leaked    -1
## 1426              leave    -1
## 1427              legal     1
## 1428            legally     1
## 1429            lenient     1
## 1430          lethargic    -2
## 1431           lethargy    -2
## 1432               liar    -3
## 1433              liars    -3
## 1434           libelous    -2
## 1435               lied    -2
## 1436          lifesaver     4
## 1437       lighthearted     1
## 1438               like     2
## 1439              liked     2
## 1440              likes     2
## 1441         limitation    -1
## 1442            limited    -1
## 1443             limits    -1
## 1444         litigation    -1
## 1445          litigious    -2
## 1446             lively     2
## 1447              livid    -2
## 1448               lmao     4
## 1449              lmfao     4
## 1450             loathe    -3
## 1451            loathed    -3
## 1452            loathes    -3
## 1453           loathing    -3
## 1454              lobby    -2
## 1455           lobbying    -2
## 1456                lol     3
## 1457             lonely    -2
## 1458           lonesome    -2
## 1459            longing    -1
## 1460               loom    -1
## 1461             loomed    -1
## 1462            looming    -1
## 1463              looms    -1
## 1464              loose    -3
## 1465             looses    -3
## 1466              loser    -3
## 1467             losing    -3
## 1468               loss    -3
## 1469               lost    -3
## 1470            lovable     3
## 1471               love     3
## 1472              loved     3
## 1473           lovelies     3
## 1474             lovely     3
## 1475             loving     2
## 1476             lowest    -1
## 1477              loyal     3
## 1478            loyalty     3
## 1479               luck     3
## 1480            luckily     3
## 1481              lucky     3
## 1482         lugubrious    -2
## 1483            lunatic    -3
## 1484           lunatics    -3
## 1485               lurk    -1
## 1486            lurking    -1
## 1487              lurks    -1
## 1488                mad    -3
## 1489          maddening    -3
## 1490            made-up    -1
## 1491              madly    -3
## 1492            madness    -3
## 1493          mandatory    -1
## 1494        manipulated    -1
## 1495       manipulating    -1
## 1496       manipulation    -1
## 1497             marvel     3
## 1498          marvelous     3
## 1499            marvels     3
## 1500        masterpiece     4
## 1501       masterpieces     4
## 1502             matter     1
## 1503            matters     1
## 1504             mature     2
## 1505         meaningful     2
## 1506        meaningless    -2
## 1507              medal     3
## 1508         mediocrity    -3
## 1509         meditative     1
## 1510         melancholy    -2
## 1511             menace    -2
## 1512            menaced    -2
## 1513              mercy     2
## 1514              merry     3
## 1515               mess    -2
## 1516             messed    -2
## 1517         messing up    -2
## 1518         methodical     2
## 1519           mindless    -2
## 1520            miracle     4
## 1521              mirth     3
## 1522           mirthful     3
## 1523         mirthfully     3
## 1524          misbehave    -2
## 1525         misbehaved    -2
## 1526         misbehaves    -2
## 1527        misbehaving    -2
## 1528           mischief    -1
## 1529          mischiefs    -1
## 1530          miserable    -3
## 1531             misery    -2
## 1532          misgiving    -2
## 1533     misinformation    -2
## 1534        misinformed    -2
## 1535     misinterpreted    -2
## 1536         misleading    -3
## 1537            misread    -1
## 1538       misreporting    -2
## 1539  misrepresentation    -2
## 1540               miss    -2
## 1541             missed    -2
## 1542            missing    -2
## 1543            mistake    -2
## 1544           mistaken    -2
## 1545           mistakes    -2
## 1546          mistaking    -2
## 1547      misunderstand    -2
## 1548   misunderstanding    -2
## 1549     misunderstands    -2
## 1550      misunderstood    -2
## 1551               moan    -2
## 1552             moaned    -2
## 1553            moaning    -2
## 1554              moans    -2
## 1555               mock    -2
## 1556             mocked    -2
## 1557            mocking    -2
## 1558              mocks    -2
## 1559          mongering    -2
## 1560         monopolize    -2
## 1561        monopolized    -2
## 1562        monopolizes    -2
## 1563       monopolizing    -2
## 1564              moody    -1
## 1565               mope    -1
## 1566             moping    -1
## 1567              moron    -3
## 1568       motherfucker    -5
## 1569      motherfucking    -5
## 1570           motivate     1
## 1571          motivated     2
## 1572         motivating     2
## 1573         motivation     1
## 1574              mourn    -2
## 1575            mourned    -2
## 1576           mournful    -2
## 1577           mourning    -2
## 1578             mourns    -2
## 1579            mumpish    -2
## 1580             murder    -2
## 1581           murderer    -2
## 1582          murdering    -3
## 1583          murderous    -3
## 1584            murders    -2
## 1585               myth    -1
## 1586               n00b    -2
## 1587              naive    -2
## 1588              nasty    -3
## 1589            natural     1
## 1590           na\xefve    -2
## 1591              needy    -2
## 1592           negative    -2
## 1593         negativity    -2
## 1594            neglect    -2
## 1595          neglected    -2
## 1596         neglecting    -2
## 1597           neglects    -2
## 1598             nerves    -1
## 1599            nervous    -2
## 1600          nervously    -2
## 1601               nice     3
## 1602              nifty     2
## 1603             niggas    -5
## 1604             nigger    -5
## 1605                 no    -1
## 1606             no fun    -3
## 1607              noble     2
## 1608              noisy    -1
## 1609           nonsense    -2
## 1610               noob    -2
## 1611              nosey    -2
## 1612           not good    -2
## 1613        not working    -3
## 1614          notorious    -2
## 1615              novel     2
## 1616               numb    -1
## 1617               nuts    -3
## 1618         obliterate    -2
## 1619        obliterated    -2
## 1620          obnoxious    -3
## 1621            obscene    -2
## 1622           obsessed     2
## 1623           obsolete    -2
## 1624           obstacle    -2
## 1625          obstacles    -2
## 1626          obstinate    -2
## 1627                odd    -2
## 1628             offend    -2
## 1629           offended    -2
## 1630           offender    -2
## 1631          offending    -2
## 1632            offends    -2
## 1633            offline    -1
## 1634                oks     2
## 1635            ominous     3
## 1636 once-in-a-lifetime     3
## 1637      opportunities     2
## 1638        opportunity     2
## 1639          oppressed    -2
## 1640         oppressive    -2
## 1641           optimism     2
## 1642         optimistic     2
## 1643         optionless    -2
## 1644             outcry    -2
## 1645      outmaneuvered    -2
## 1646            outrage    -3
## 1647           outraged    -3
## 1648           outreach     2
## 1649        outstanding     5
## 1650          overjoyed     4
## 1651           overload    -1
## 1652         overlooked    -1
## 1653          overreact    -2
## 1654        overreacted    -2
## 1655       overreaction    -2
## 1656         overreacts    -2
## 1657           oversell    -2
## 1658        overselling    -2
## 1659          oversells    -2
## 1660 oversimplification    -2
## 1661     oversimplified    -2
## 1662     oversimplifies    -2
## 1663       oversimplify    -2
## 1664      overstatement    -2
## 1665     overstatements    -2
## 1666         overweight    -1
## 1667           oxymoron    -1
## 1668               pain    -2
## 1669             pained    -2
## 1670              panic    -3
## 1671           panicked    -3
## 1672             panics    -3
## 1673           paradise     3
## 1674            paradox    -1
## 1675             pardon     2
## 1676           pardoned     2
## 1677          pardoning     2
## 1678            pardons     2
## 1679             parley    -1
## 1680         passionate     2
## 1681            passive    -1
## 1682          passively    -1
## 1683           pathetic    -2
## 1684                pay    -1
## 1685              peace     2
## 1686           peaceful     2
## 1687         peacefully     2
## 1688            penalty    -2
## 1689            pensive    -1
## 1690            perfect     3
## 1691          perfected     2
## 1692          perfectly     3
## 1693           perfects     2
## 1694              peril    -2
## 1695            perjury    -3
## 1696        perpetrator    -2
## 1697       perpetrators    -2
## 1698          perplexed    -2
## 1699          persecute    -2
## 1700         persecuted    -2
## 1701         persecutes    -2
## 1702        persecuting    -2
## 1703          perturbed    -2
## 1704              pesky    -2
## 1705          pessimism    -2
## 1706        pessimistic    -2
## 1707          petrified    -2
## 1708             phobic    -2
## 1709        picturesque     2
## 1710             pileup    -1
## 1711              pique    -2
## 1712             piqued    -2
## 1713               piss    -4
## 1714             pissed    -4
## 1715            pissing    -3
## 1716            piteous    -2
## 1717             pitied    -1
## 1718               pity    -2
## 1719            playful     2
## 1720           pleasant     3
## 1721             please     1
## 1722            pleased     3
## 1723           pleasure     3
## 1724             poised    -2
## 1725             poison    -2
## 1726           poisoned    -2
## 1727            poisons    -2
## 1728            pollute    -2
## 1729           polluted    -2
## 1730           polluter    -2
## 1731          polluters    -2
## 1732           pollutes    -2
## 1733               poor    -2
## 1734             poorer    -2
## 1735            poorest    -2
## 1736            popular     3
## 1737           positive     2
## 1738         positively     2
## 1739         possessive    -2
## 1740           postpone    -1
## 1741          postponed    -1
## 1742          postpones    -1
## 1743         postponing    -1
## 1744            poverty    -1
## 1745           powerful     2
## 1746          powerless    -2
## 1747             praise     3
## 1748            praised     3
## 1749            praises     3
## 1750           praising     3
## 1751               pray     1
## 1752            praying     1
## 1753              prays     1
## 1754              prblm    -2
## 1755             prblms    -2
## 1756           prepared     1
## 1757           pressure    -1
## 1758          pressured    -2
## 1759            pretend    -1
## 1760         pretending    -1
## 1761           pretends    -1
## 1762             pretty     1
## 1763            prevent    -1
## 1764          prevented    -1
## 1765         preventing    -1
## 1766           prevents    -1
## 1767              prick    -5
## 1768             prison    -2
## 1769           prisoner    -2
## 1770          prisoners    -2
## 1771         privileged     2
## 1772          proactive     2
## 1773            problem    -2
## 1774           problems    -2
## 1775          profiteer    -2
## 1776           progress     2
## 1777          prominent     2
## 1778            promise     1
## 1779           promised     1
## 1780           promises     1
## 1781            promote     1
## 1782           promoted     1
## 1783           promotes     1
## 1784          promoting     1
## 1785         propaganda    -2
## 1786          prosecute    -1
## 1787         prosecuted    -2
## 1788         prosecutes    -1
## 1789        prosecution    -1
## 1790           prospect     1
## 1791          prospects     1
## 1792         prosperous     3
## 1793            protect     1
## 1794          protected     1
## 1795           protects     1
## 1796            protest    -2
## 1797         protesters    -2
## 1798         protesting    -2
## 1799           protests    -2
## 1800              proud     2
## 1801            proudly     2
## 1802            provoke    -1
## 1803           provoked    -1
## 1804           provokes    -1
## 1805          provoking    -1
## 1806      pseudoscience    -3
## 1807             punish    -2
## 1808           punished    -2
## 1809           punishes    -2
## 1810           punitive    -2
## 1811              pushy    -1
## 1812            puzzled    -2
## 1813            quaking    -2
## 1814       questionable    -2
## 1815         questioned    -1
## 1816        questioning    -1
## 1817             racism    -3
## 1818             racist    -3
## 1819            racists    -3
## 1820               rage    -2
## 1821            rageful    -2
## 1822              rainy    -1
## 1823               rant    -3
## 1824             ranter    -3
## 1825            ranters    -3
## 1826              rants    -3
## 1827               rape    -4
## 1828             rapist    -4
## 1829            rapture     2
## 1830           raptured     2
## 1831           raptures     2
## 1832          rapturous     4
## 1833               rash    -2
## 1834           ratified     2
## 1835              reach     1
## 1836            reached     1
## 1837            reaches     1
## 1838           reaching     1
## 1839           reassure     1
## 1840          reassured     1
## 1841          reassures     1
## 1842         reassuring     2
## 1843          rebellion    -2
## 1844          recession    -2
## 1845           reckless    -2
## 1846          recommend     2
## 1847        recommended     2
## 1848         recommends     2
## 1849           redeemed     2
## 1850             refuse    -2
## 1851            refused    -2
## 1852           refusing    -2
## 1853             regret    -2
## 1854          regretful    -2
## 1855            regrets    -2
## 1856          regretted    -2
## 1857         regretting    -2
## 1858             reject    -1
## 1859           rejected    -1
## 1860          rejecting    -1
## 1861            rejects    -1
## 1862            rejoice     4
## 1863           rejoiced     4
## 1864           rejoices     4
## 1865          rejoicing     4
## 1866            relaxed     2
## 1867         relentless    -1
## 1868            reliant     2
## 1869            relieve     1
## 1870           relieved     2
## 1871           relieves     1
## 1872          relieving     2
## 1873          relishing     2
## 1874         remarkable     2
## 1875            remorse    -2
## 1876            repulse    -1
## 1877           repulsed    -2
## 1878             rescue     2
## 1879            rescued     2
## 1880            rescues     2
## 1881          resentful    -2
## 1882             resign    -1
## 1883           resigned    -1
## 1884          resigning    -1
## 1885            resigns    -1
## 1886           resolute     2
## 1887            resolve     2
## 1888           resolved     2
## 1889           resolves     2
## 1890          resolving     2
## 1891          respected     2
## 1892        responsible     2
## 1893         responsive     2
## 1894            restful     2
## 1895           restless    -2
## 1896            restore     1
## 1897           restored     1
## 1898           restores     1
## 1899          restoring     1
## 1900           restrict    -2
## 1901         restricted    -2
## 1902        restricting    -2
## 1903        restriction    -2
## 1904          restricts    -2
## 1905           retained    -1
## 1906             retard    -2
## 1907           retarded    -2
## 1908            retreat    -1
## 1909            revenge    -2
## 1910         revengeful    -2
## 1911            revered     2
## 1912             revive     2
## 1913            revives     2
## 1914             reward     2
## 1915           rewarded     2
## 1916          rewarding     2
## 1917            rewards     2
## 1918               rich     2
## 1919         ridiculous    -3
## 1920                rig    -1
## 1921             rigged    -1
## 1922    right direction     3
## 1923           rigorous     3
## 1924         rigorously     3
## 1925               riot    -2
## 1926              riots    -2
## 1927               risk    -2
## 1928              risks    -2
## 1929                rob    -2
## 1930             robber    -2
## 1931              robed    -2
## 1932             robing    -2
## 1933               robs    -2
## 1934             robust     2
## 1935               rofl     4
## 1936         roflcopter     4
## 1937            roflmao     4
## 1938            romance     2
## 1939              rotfl     4
## 1940          rotflmfao     4
## 1941            rotflol     4
## 1942               ruin    -2
## 1943             ruined    -2
## 1944            ruining    -2
## 1945              ruins    -2
## 1946           sabotage    -2
## 1947                sad    -2
## 1948             sadden    -2
## 1949           saddened    -2
## 1950              sadly    -2
## 1951               safe     1
## 1952             safely     1
## 1953             safety     1
## 1954            salient     1
## 1955              sappy    -1
## 1956          sarcastic    -2
## 1957          satisfied     2
## 1958               save     2
## 1959              saved     2
## 1960               scam    -2
## 1961              scams    -2
## 1962            scandal    -3
## 1963         scandalous    -3
## 1964           scandals    -3
## 1965          scapegoat    -2
## 1966         scapegoats    -2
## 1967              scare    -2
## 1968             scared    -2
## 1969              scary    -2
## 1970          sceptical    -2
## 1971              scold    -2
## 1972              scoop     3
## 1973              scorn    -2
## 1974           scornful    -2
## 1975             scream    -2
## 1976           screamed    -2
## 1977          screaming    -2
## 1978            screams    -2
## 1979            screwed    -2
## 1980         screwed up    -3
## 1981            scumbag    -4
## 1982             secure     2
## 1983            secured     2
## 1984            secures     2
## 1985           sedition    -2
## 1986          seditious    -2
## 1987            seduced    -1
## 1988     self-confident     2
## 1989       self-deluded    -2
## 1990            selfish    -3
## 1991        selfishness    -3
## 1992           sentence    -2
## 1993          sentenced    -2
## 1994          sentences    -2
## 1995         sentencing    -2
## 1996             serene     2
## 1997             severe    -2
## 1998               sexy     3
## 1999              shaky    -2
## 2000              shame    -2
## 2001             shamed    -2
## 2002           shameful    -2
## 2003              share     1
## 2004             shared     1
## 2005             shares     1
## 2006          shattered    -2
## 2007               shit    -4
## 2008           shithead    -4
## 2009             shitty    -3
## 2010              shock    -2
## 2011            shocked    -2
## 2012           shocking    -2
## 2013             shocks    -2
## 2014              shoot    -1
## 2015      short-sighted    -2
## 2016  short-sightedness    -2
## 2017           shortage    -2
## 2018          shortages    -2
## 2019              shrew    -4
## 2020                shy    -1
## 2021               sick    -2
## 2022               sigh    -2
## 2023       significance     1
## 2024        significant     1
## 2025          silencing    -1
## 2026              silly    -1
## 2027            sincere     2
## 2028          sincerely     2
## 2029          sincerest     2
## 2030          sincerity     2
## 2031             sinful    -3
## 2032       singleminded    -2
## 2033            skeptic    -2
## 2034          skeptical    -2
## 2035         skepticism    -2
## 2036           skeptics    -2
## 2037               slam    -2
## 2038              slash    -2
## 2039            slashed    -2
## 2040            slashes    -2
## 2041           slashing    -2
## 2042            slavery    -3
## 2043      sleeplessness    -2
## 2044              slick     2
## 2045            slicker     2
## 2046           slickest     2
## 2047           sluggish    -2
## 2048               slut    -5
## 2049              smart     1
## 2050            smarter     2
## 2051           smartest     2
## 2052              smear    -2
## 2053              smile     2
## 2054             smiled     2
## 2055             smiles     2
## 2056            smiling     2
## 2057               smog    -2
## 2058             sneaky    -1
## 2059               snub    -2
## 2060            snubbed    -2
## 2061           snubbing    -2
## 2062              snubs    -2
## 2063           sobering     1
## 2064             solemn    -1
## 2065              solid     2
## 2066         solidarity     2
## 2067           solution     1
## 2068          solutions     1
## 2069              solve     1
## 2070             solved     1
## 2071             solves     1
## 2072            solving     1
## 2073             somber    -2
## 2074          some kind     0
## 2075     son-of-a-bitch    -5
## 2076             soothe     3
## 2077            soothed     3
## 2078           soothing     3
## 2079      sophisticated     2
## 2080               sore    -1
## 2081             sorrow    -2
## 2082          sorrowful    -2
## 2083              sorry    -1
## 2084               spam    -2
## 2085            spammer    -3
## 2086           spammers    -3
## 2087           spamming    -2
## 2088              spark     1
## 2089            sparkle     3
## 2090           sparkles     3
## 2091          sparkling     3
## 2092        speculative    -2
## 2093             spirit     1
## 2094           spirited     2
## 2095         spiritless    -2
## 2096           spiteful    -2
## 2097           splendid     3
## 2098          sprightly     2
## 2099          squelched    -1
## 2100               stab    -2
## 2101            stabbed    -2
## 2102             stable     2
## 2103              stabs    -2
## 2104              stall    -2
## 2105            stalled    -2
## 2106           stalling    -2
## 2107            stamina     2
## 2108           stampede    -2
## 2109           startled    -2
## 2110             starve    -2
## 2111            starved    -2
## 2112            starves    -2
## 2113           starving    -2
## 2114          steadfast     2
## 2115              steal    -2
## 2116             steals    -2
## 2117         stereotype    -2
## 2118        stereotyped    -2
## 2119            stifled    -1
## 2120          stimulate     1
## 2121         stimulated     1
## 2122         stimulates     1
## 2123        stimulating     2
## 2124             stingy    -2
## 2125             stolen    -2
## 2126               stop    -1
## 2127            stopped    -1
## 2128           stopping    -1
## 2129              stops    -1
## 2130              stout     2
## 2131           straight     1
## 2132            strange    -1
## 2133          strangely    -1
## 2134          strangled    -2
## 2135           strength     2
## 2136         strengthen     2
## 2137       strengthened     2
## 2138      strengthening     2
## 2139        strengthens     2
## 2140           stressed    -2
## 2141           stressor    -2
## 2142          stressors    -2
## 2143           stricken    -2
## 2144             strike    -1
## 2145           strikers    -2
## 2146            strikes    -1
## 2147             strong     2
## 2148           stronger     2
## 2149          strongest     2
## 2150             struck    -1
## 2151           struggle    -2
## 2152          struggled    -2
## 2153          struggles    -2
## 2154         struggling    -2
## 2155           stubborn    -2
## 2156              stuck    -2
## 2157            stunned    -2
## 2158           stunning     4
## 2159             stupid    -2
## 2160           stupidly    -2
## 2161              suave     2
## 2162        substantial     1
## 2163      substantially     1
## 2164         subversive    -2
## 2165            success     2
## 2166         successful     3
## 2167               suck    -3
## 2168              sucks    -3
## 2169             suffer    -2
## 2170          suffering    -2
## 2171            suffers    -2
## 2172           suicidal    -2
## 2173            suicide    -2
## 2174              suing    -2
## 2175            sulking    -2
## 2176              sulky    -2
## 2177             sullen    -2
## 2178           sunshine     2
## 2179              super     3
## 2180             superb     5
## 2181           superior     2
## 2182            support     2
## 2183          supported     2
## 2184          supporter     1
## 2185         supporters     1
## 2186         supporting     1
## 2187         supportive     2
## 2188           supports     2
## 2189           survived     2
## 2190          surviving     2
## 2191           survivor     2
## 2192            suspect    -1
## 2193          suspected    -1
## 2194         suspecting    -1
## 2195           suspects    -1
## 2196            suspend    -1
## 2197          suspended    -1
## 2198         suspicious    -2
## 2199              swear    -2
## 2200           swearing    -2
## 2201             swears    -2
## 2202              sweet     2
## 2203              swift     2
## 2204            swiftly     2
## 2205            swindle    -3
## 2206           swindles    -3
## 2207          swindling    -3
## 2208        sympathetic     2
## 2209           sympathy     2
## 2210               tard    -2
## 2211              tears    -2
## 2212             tender     2
## 2213              tense    -2
## 2214            tension    -1
## 2215           terrible    -3
## 2216           terribly    -3
## 2217           terrific     4
## 2218          terrified    -3
## 2219             terror    -3
## 2220          terrorize    -3
## 2221         terrorized    -3
## 2222         terrorizes    -3
## 2223              thank     2
## 2224           thankful     2
## 2225             thanks     2
## 2226             thorny    -2
## 2227         thoughtful     2
## 2228        thoughtless    -2
## 2229             threat    -2
## 2230           threaten    -2
## 2231         threatened    -2
## 2232        threatening    -2
## 2233          threatens    -2
## 2234            threats    -2
## 2235           thrilled     5
## 2236             thwart    -2
## 2237           thwarted    -2
## 2238          thwarting    -2
## 2239            thwarts    -2
## 2240              timid    -2
## 2241           timorous    -2
## 2242              tired    -2
## 2243               tits    -2
## 2244           tolerant     2
## 2245          toothless    -2
## 2246                top     2
## 2247               tops     2
## 2248               torn    -2
## 2249            torture    -4
## 2250           tortured    -4
## 2251           tortures    -4
## 2252          torturing    -4
## 2253       totalitarian    -2
## 2254    totalitarianism    -2
## 2255               tout    -2
## 2256             touted    -2
## 2257            touting    -2
## 2258              touts    -2
## 2259            tragedy    -2
## 2260             tragic    -2
## 2261           tranquil     2
## 2262               trap    -1
## 2263            trapped    -2
## 2264             trauma    -3
## 2265          traumatic    -3
## 2266           travesty    -2
## 2267            treason    -3
## 2268         treasonous    -3
## 2269           treasure     2
## 2270          treasures     2
## 2271          trembling    -2
## 2272          tremulous    -2
## 2273            tricked    -2
## 2274           trickery    -2
## 2275            triumph     4
## 2276         triumphant     4
## 2277            trouble    -2
## 2278           troubled    -2
## 2279           troubles    -2
## 2280               TRUE     2
## 2281              trust     1
## 2282            trusted     2
## 2283              tumor    -2
## 2284               twat    -5
## 2285               ugly    -3
## 2286       unacceptable    -2
## 2287      unappreciated    -2
## 2288         unapproved    -2
## 2289            unaware    -2
## 2290       unbelievable    -1
## 2291        unbelieving    -1
## 2292           unbiased     2
## 2293          uncertain    -1
## 2294            unclear    -1
## 2295      uncomfortable    -2
## 2296        unconcerned    -2
## 2297        unconfirmed    -1
## 2298        unconvinced    -1
## 2299         uncredited    -1
## 2300          undecided    -1
## 2301      underestimate    -1
## 2302     underestimated    -1
## 2303     underestimates    -1
## 2304    underestimating    -1
## 2305          undermine    -2
## 2306         undermined    -2
## 2307         undermines    -2
## 2308        undermining    -2
## 2309        undeserving    -2
## 2310        undesirable    -2
## 2311             uneasy    -2
## 2312       unemployment    -2
## 2313            unequal    -1
## 2314          unequaled     2
## 2315          unethical    -2
## 2316             unfair    -2
## 2317          unfocused    -2
## 2318        unfulfilled    -2
## 2319            unhappy    -2
## 2320          unhealthy    -2
## 2321            unified     1
## 2322        unimpressed    -2
## 2323      unintelligent    -2
## 2324             united     1
## 2325             unjust    -2
## 2326          unlovable    -2
## 2327            unloved    -2
## 2328          unmatched     1
## 2329        unmotivated    -2
## 2330     unprofessional    -2
## 2331       unresearched    -2
## 2332        unsatisfied    -2
## 2333          unsecured    -2
## 2334          unsettled    -1
## 2335    unsophisticated    -2
## 2336           unstable    -2
## 2337        unstoppable     2
## 2338        unsupported    -2
## 2339             unsure    -1
## 2340        untarnished     2
## 2341           unwanted    -2
## 2342           unworthy    -2
## 2343              upset    -2
## 2344             upsets    -2
## 2345          upsetting    -2
## 2346            uptight    -2
## 2347             urgent    -1
## 2348             useful     2
## 2349         usefulness     2
## 2350            useless    -2
## 2351        uselessness    -2
## 2352              vague    -2
## 2353           validate     1
## 2354          validated     1
## 2355          validates     1
## 2356         validating     1
## 2357            verdict    -1
## 2358           verdicts    -1
## 2359             vested     1
## 2360           vexation    -2
## 2361             vexing    -2
## 2362            vibrant     3
## 2363            vicious    -2
## 2364             victim    -3
## 2365          victimize    -3
## 2366         victimized    -3
## 2367         victimizes    -3
## 2368        victimizing    -3
## 2369            victims    -3
## 2370           vigilant     3
## 2371               vile    -3
## 2372          vindicate     2
## 2373         vindicated     2
## 2374         vindicates     2
## 2375        vindicating     2
## 2376            violate    -2
## 2377           violated    -2
## 2378           violates    -2
## 2379          violating    -2
## 2380           violence    -3
## 2381            violent    -3
## 2382           virtuous     2
## 2383           virulent    -2
## 2384             vision     1
## 2385          visionary     3
## 2386          visioning     1
## 2387            visions     1
## 2388           vitality     3
## 2389            vitamin     1
## 2390          vitriolic    -3
## 2391          vivacious     3
## 2392         vociferous    -1
## 2393      vulnerability    -2
## 2394         vulnerable    -2
## 2395            walkout    -2
## 2396           walkouts    -2
## 2397             wanker    -3
## 2398               want     1
## 2399                war    -2
## 2400            warfare    -2
## 2401               warm     1
## 2402             warmth     2
## 2403               warn    -2
## 2404             warned    -2
## 2405            warning    -3
## 2406           warnings    -3
## 2407              warns    -2
## 2408              waste    -1
## 2409             wasted    -2
## 2410            wasting    -2
## 2411           wavering    -1
## 2412               weak    -2
## 2413           weakness    -2
## 2414             wealth     3
## 2415            wealthy     2
## 2416              weary    -2
## 2417               weep    -2
## 2418            weeping    -2
## 2419              weird    -2
## 2420            welcome     2
## 2421           welcomed     2
## 2422           welcomes     2
## 2423          whimsical     1
## 2424          whitewash    -3
## 2425              whore    -4
## 2426             wicked    -2
## 2427            widowed    -1
## 2428        willingness     2
## 2429                win     4
## 2430             winner     4
## 2431            winning     4
## 2432               wins     4
## 2433             winwin     3
## 2434               wish     1
## 2435             wishes     1
## 2436            wishing     1
## 2437         withdrawal    -3
## 2438          woebegone    -2
## 2439             woeful    -3
## 2440                won     3
## 2441          wonderful     4
## 2442                woo     3
## 2443             woohoo     3
## 2444               wooo     4
## 2445               woow     4
## 2446               worn    -1
## 2447            worried    -3
## 2448              worry    -3
## 2449           worrying    -3
## 2450              worse    -3
## 2451             worsen    -3
## 2452           worsened    -3
## 2453          worsening    -3
## 2454            worsens    -3
## 2455          worshiped     3
## 2456              worst    -3
## 2457              worth     2
## 2458          worthless    -2
## 2459             worthy     2
## 2460                wow     4
## 2461              wowow     4
## 2462              wowww     4
## 2463           wrathful    -3
## 2464              wreck    -2
## 2465              wrong    -2
## 2466            wronged    -2
## 2467                wtf    -4
## 2468               yeah     1
## 2469           yearning     1
## 2470              yeees     2
## 2471                yes     1
## 2472           youthful     2
## 2473              yucky    -2
## 2474              yummy     3
## 2475             zealot    -2
## 2476            zealots    -2
## 2477            zealous     2

Lets look at bing

bing <- get_sentiments("bing")

bing
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

And lastly nrc

nrc <- get_sentiments("nrc")

nrc
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created either using crowdsourcing or cloud computing/AI like Amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing.

Lets look at the words with a joy score from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  dplyr::mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 629,573 × 4
##    gutenberg_id linenumber chapter word   
##           <int>      <int>   <int> <chr>  
##  1          944          1       0 the    
##  2          944          1       0 voyage 
##  3          944          1       0 of     
##  4          944          1       0 the    
##  5          944          1       0 beagle 
##  6          944          1       0 by     
##  7          944          2       0 charles
##  8          944          2       0 darwin 
##  9          944          8       0 about  
## 10          944          8       0 the    
## # ℹ 629,563 more rows

Lets add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

tidy_books
## # A tibble: 629,573 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 629,563 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 277 × 2
##    word           n
##    <chr>      <int>
##  1 found        301
##  2 good         161
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          86
##  7 present       85
##  8 food          78
##  9 beautiful     61
## 10 elevation     60
## # ℹ 267 more rows

We can alos examine how sentiment changes throughout a work.

library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`

Now lets plot it

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Lets compare the three sentiment directions

There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all 3 of our dictationaries and examine how the sentiment changes across the arc of TVOTB.

library(tidyr)

voyage <- tidy_books %>%
  filter(book == "The Voyage of the Beagle")

voyage
## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows

Lets again use integar division (‘%/%’) to define larger sections of text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.

affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  voyage %>%
    inner_join(get_sentiments("nrc")%>%
                 filter(sentiment %in% c("positive", "negative"))
               ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1154 of `x` matches multiple rows in `y`.
## ℹ Row 4245 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

I DO NOT KNOW WHY “POSITIVE” IS NOT FOUND?!?!?!

We can now estimate the net sentiment (positive - negative) in each chunk of the novel for each lexion (dictionary). Lets bind them all together and visualize with ggplot

bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
## Warning: Removed 2477 rows containing missing values (`position_stack()`).

Lets look at the counts based on each dictionary

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 2,421 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive    964
##  2 like       positive    705
##  3 well       positive    701
##  4 good       positive    384
##  5 doubt      negative    306
##  6 respect    positive    266
##  7 wild       negative    266
##  8 bright     positive    252
##  9 remarkable positive    247
## 10 savages    negative    217
## # ℹ 2,411 more rows

This can be shown visually, and we can pipe straight into ggplot2

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scale = "free_y") +
  labs(x = "Contribution to Sentiment", y = NULL)

Lets spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)

custom_stop_words
## # A tibble: 1,153 × 2
##    word      lexicon
##    <chr>     <chr>  
##  1 wild      custom 
##  2 dark      custom 
##  3 great     custom 
##  4 like      custom 
##  5 a         SMART  
##  6 a's       SMART  
##  7 able      SMART  
##  8 about     SMART  
##  9 above     SMART  
## 10 according SMART  
## # ℹ 1,143 more rows

Word Clouds!

We can see that tidy text mining and sentiment analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques.

Lets use the wordcloud package!

library(wordcloud)
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
## 
##     textplot
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`
## Warning in wordcloud(word, n, max.words = 100): male could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): female could not be fit on
## page. It will not be plotted.

Lets also look at a comparison.cloud(), which may require turning the dataframe into a matrix.

We can change to matrix ising the acast() function.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tigerstats':
## 
##     tips
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining with `by = join_by(word)`

Looking at units beyond words

Lots of useful work can be done by tokening at the word level, but sometimes its nice to look at different units of text. For example, we can look beyoind just unigrams.

Ex.. I am not having a good day.

bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  dplyr::summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  dplyr::summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  dplyr::mutate(ratio = negativewords/words) %>%
  filter(chapter !=0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 5
##   book                                        chapter negativewords words  ratio
##   <chr>                                         <int>         <int> <int>  <dbl>
## 1 The Descent of Man, and Selection in Relat…      20             4    87 0.0460
## 2 The Expression of the Emotions in Man and …      10           249  4220 0.0590
## 3 The Voyage of the Beagle                         10           375 11202 0.0335

N-Grams (pt 1-3)

So far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.

Lets look at some methods of tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n=2)

darwin_bigrams
## # A tibble: 580,636 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle the voyage    
##  2 The Voyage of the Beagle voyage of     
##  3 The Voyage of the Beagle of the        
##  4 The Voyage of the Beagle the beagle    
##  5 The Voyage of the Beagle beagle by     
##  6 The Voyage of the Beagle charles darwin
##  7 The Voyage of the Beagle <NA>          
##  8 The Voyage of the Beagle <NA>          
##  9 The Voyage of the Beagle <NA>          
## 10 The Voyage of the Beagle <NA>          
## # ℹ 580,626 more rows

This data is still tidytext format, and is structured as one-token-per-row. Each token is a bigram.

Counting and filtering n-gram

#darwin_bigrams %>%
#  count(bigram, sort = TRUE)

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s separate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

New bigram counts

bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_counts
## # A tibble: 79,272 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle charles darwin
##  2 The Voyage of the Beagle NA NA         
##  3 The Voyage of the Beagle NA NA         
##  4 The Voyage of the Beagle NA NA         
##  5 The Voyage of the Beagle NA NA         
##  6 The Voyage of the Beagle NA NA         
##  7 The Voyage of the Beagle online edition
##  8 The Voyage of the Beagle NA NA         
##  9 The Voyage of the Beagle degree symbol 
## 10 The Voyage of the Beagle degs italics  
## # ℹ 79,262 more rows

We may also be interested in trigrams, which are 3 word combos

trigrams <- darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

trigrams
## # A tibble: 17,771 × 4
##    word1         word2  word3           n
##    <chr>         <chr>  <chr>       <int>
##  1 <NA>          <NA>   <NA>         7965
##  2 tierra        del    fuego          84
##  3 secondary     sexual characters     79
##  4 captain       fitz   roy            45
##  5 de            la     physionomie    30
##  6 domestication vol    ii             26
##  7 vol           ii     pp             22
##  8 vertebrates   vol    iii            21
##  9 proc          zoolog soc            18
## 10 proc          zool   soc            17
## # ℹ 17,761 more rows

Lets analyze some bigrams

bigrams_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 24 × 3
##    book                                                 word1           n
##    <chr>                                                <chr>       <int>
##  1 The Descent of Man, and Selection in Relation to Sex sexual        254
##  2 The Descent of Man, and Selection in Relation to Sex natural       156
##  3 The Descent of Man, and Selection in Relation to Sex unconscious     6
##  4 The Descent of Man, and Selection in Relation to Sex continued       5
##  5 The Expression of the Emotions in Man and Animals    natural         4
##  6 The Descent of Man, and Selection in Relation to Sex artificial      2
##  7 The Descent of Man, and Selection in Relation to Sex ordinary        2
##  8 The Descent of Man, and Selection in Relation to Sex careful         1
##  9 The Descent of Man, and Selection in Relation to Sex consequent      1
## 10 The Descent of Man, and Selection in Relation to Sex la              1
## # ℹ 14 more rows

Lets again look at tf-idf across bigrams across Darwins works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 51,213 × 6
##    book                                       bigram     n      tf   idf  tf_idf
##    <chr>                                      <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and… nerve…    47 0.00350 1.10  0.00384
##  2 The Descent of Man, and Selection in Rela… sexua…   138 0.00315 1.10  0.00346
##  3 The Expression of the Emotions in Man and… la ph…    35 0.00260 1.10  0.00286
##  4 The Voyage of the Beagle                   bueno…    54 0.00245 1.10  0.00269
##  5 The Voyage of the Beagle                   capta…    53 0.00240 1.10  0.00264
##  6 The Descent of Man, and Selection in Rela… secon…   101 0.00231 1.10  0.00254
##  7 The Voyage of the Beagle                   fitz …    50 0.00227 1.10  0.00249
##  8 The Expression of the Emotions in Man and… muscl…    30 0.00223 1.10  0.00245
##  9 The Expression of the Emotions in Man and… orbic…    29 0.00216 1.10  0.00237
## 10 The Descent of Man, and Selection in Rela… sexua…   254 0.00580 0.405 0.00235
## # ℹ 51,203 more rows
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf_idf of bigrams", y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 740 × 3
##    word1 word2      n
##    <chr> <chr>  <int>
##  1 not   be        90
##  2 not   only      89
##  3 not   have      80
##  4 not   a         78
##  5 not   to        74
##  6 not   the       63
##  7 not   at        56
##  8 not   been      54
##  9 not   appear    51
## 10 not   know      47
## # ℹ 730 more rows

By doing sentiment analysis on bigrams, we can examine how often sentiment associated words are preceded bby a modifier like “not” or other negating words.

AFINN <- get_sentiments("afinn")

AFINN
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

We can examine the most frequent words that were preceded by “not” and associated with sentiment.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
## # A tibble: 97 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 like          2    10
##  2 doubt        -1     9
##  3 wish          1     7
##  4 admit        -1     5
##  5 easy          1     5
##  6 pretend      -1     5
##  7 reach         1     5
##  8 difficult    -1     4
##  9 extend        1     4
## 10 help          2     4
## # ℹ 87 more rows

Lets visualize

library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number of occurences", y = "words preceded by \"not\"")

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)

negated_words
## # A tibble: 182 × 4
##    word1   word2   value     n
##    <chr>   <chr>   <dbl> <int>
##  1 no      doubt      -1   173
##  2 no      great       3    14
##  3 not     like        2    10
##  4 not     doubt      -1     9
##  5 not     wish        1     7
##  6 without doubt      -1     7
##  7 not     admit      -1     5
##  8 not     easy        1     5
##  9 not     pretend    -1     5
## 10 not     reach       1     5
## # ℹ 172 more rows

Lets visualize the negated words

negated_words %>%
  mutate(contribution = n * value,
         word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment value * # of occurences") +
  coord_flip()

Lets visualize a network of bigrams with ggraph

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:mosaic':
## 
##     compare
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following object is masked from 'package:plotly':
## 
##     groups
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
bigram_graph
## IGRAPH aba6dda DN-- 158 106 -- 
## + attr: name (v/c), n (e/n)
## + edges from aba6dda (vertex names):
##  [1] NA        ->NA          sexual    ->selection   vol       ->ii         
##  [4] natural   ->selection   lower     ->animals     sexual    ->differences
##  [7] breeding  ->season      secondary ->sexual      south     ->america    
## [10] sexual    ->characters  del       ->fuego       tierra    ->del        
## [13] vol       ->iii         de        ->la          bright    ->colours    
## [16] sexual    ->difference  distinct  ->species     tail      ->feathers   
## [19] closely   ->allied      vocal     ->organs      zoological->gardens    
## [22] natural   ->history     buenos    ->ayres       captain   ->fitz       
## + ... omitted several edges
library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also add directionality to this netwoek

set.seed(1234)

a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

Word Frequencies

A central question in text mining is how to quantify what a document is about. We can do this by looking at words that make up the documents, and measuring term frequency.

There are a lot of words that may not be important, these are the stop words.

One way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly use words and increases the weight for words that are not used very much.

Term frequency in Darwin’s works

library(dplyr)
library(tidytext)

book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(book_words)[1] <- "book"

book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

Now lets dissect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

book_words
## # A tibble: 35,579 × 3
##    book                                                 word      n
##    <chr>                                                <chr> <int>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490
##  2 The Voyage of the Beagle                             the   16930
##  3 The Descent of Man, and Selection in Relation to Sex of    16762
##  4 The Voyage of the Beagle                             of     9438
##  5 The Descent of Man, and Selection in Relation to Sex in     8882
##  6 The Expression of the Emotions in Man and Animals    the    8045
##  7 The Descent of Man, and Selection in Relation to Sex and    7854
##  8 The Descent of Man, and Selection in Relation to Sex to     5901
##  9 The Voyage of the Beagle                             and    5768
## 10 The Voyage of the Beagle                             a      5328
## # ℹ 35,569 more rows
book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  dplyr::summarize(total = sum(n))

book_words
## # A tibble: 35,579 × 3
##    book                                                 word      n
##    <chr>                                                <chr> <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490
##  2 The Voyage of the Beagle                             the   16930
##  3 The Descent of Man, and Selection in Relation to Sex of    16762
##  4 The Voyage of the Beagle                             of     9438
##  5 The Descent of Man, and Selection in Relation to Sex in     8882
##  6 The Expression of the Emotions in Man and Animals    the    8045
##  7 The Descent of Man, and Selection in Relation to Sex and    7854
##  8 The Descent of Man, and Selection in Relation to Sex to     5901
##  9 The Voyage of the Beagle                             and    5768
## 10 The Voyage of the Beagle                             a      5328
## # ℹ 35,569 more rows
book_words <- dplyr::left_join(book_words, total_words)
## Joining with `by = join_by(book)`
book_words
## # A tibble: 35,579 × 4
##    book                                                 word      n  total
##    <chr>                                                <chr> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490 311041
##  2 The Voyage of the Beagle                             the   16930 208118
##  3 The Descent of Man, and Selection in Relation to Sex of    16762 311041
##  4 The Voyage of the Beagle                             of     9438 208118
##  5 The Descent of Man, and Selection in Relation to Sex in     8882 311041
##  6 The Expression of the Emotions in Man and Animals    the    8045 110414
##  7 The Descent of Man, and Selection in Relation to Sex and    7854 311041
##  8 The Descent of Man, and Selection in Relation to Sex to     5901 311041
##  9 The Voyage of the Beagle                             and    5768 208118
## 10 The Voyage of the Beagle                             a      5328 208118
## # ℹ 35,569 more rows

You can see that the usual suspects are the most common words, but don’t tell us anything about what the book topic is.

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 370 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 3 rows containing missing values (`geom_bar()`).

Zipf’s Law

The frequency that a word appears is inversely proportional to its rank when predicting a topic.

Lets apply Zipf’s law to Darwin’s work

freq_by_rank <- book_words %>%
  group_by(book) %>%
  dplyr::mutate(rank = row_number(),
        'term frequency' = n/total) %>%
  ungroup()

freq_by_rank
## # A tibble: 35,579 × 6
##    book                                word      n  total  rank `term frequency`
##    <chr>                               <chr> <dbl>  <dbl> <int>            <dbl>
##  1 The Descent of Man, and Selection … the   25490 311041     1           0.0820
##  2 The Voyage of the Beagle            the   16930 208118     1           0.0813
##  3 The Descent of Man, and Selection … of    16762 311041     2           0.0539
##  4 The Voyage of the Beagle            of     9438 208118     2           0.0453
##  5 The Descent of Man, and Selection … in     8882 311041     3           0.0286
##  6 The Expression of the Emotions in … the    8045 110414     1           0.0729
##  7 The Descent of Man, and Selection … and    7854 311041     4           0.0253
##  8 The Descent of Man, and Selection … to     5901 311041     5           0.0190
##  9 The Voyage of the Beagle            and    5768 208118     3           0.0277
## 10 The Voyage of the Beagle            a      5328 208118     4           0.0256
## # ℹ 35,569 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

Lets use TF-IDF to find words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents.

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf
## # A tibble: 35,579 × 7
##    book                                   word      n  total     tf   idf tf_idf
##    <chr>                                  <chr> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in … the   25490 311041 0.0820     0      0
##  2 The Voyage of the Beagle               the   16930 208118 0.0813     0      0
##  3 The Descent of Man, and Selection in … of    16762 311041 0.0539     0      0
##  4 The Voyage of the Beagle               of     9438 208118 0.0453     0      0
##  5 The Descent of Man, and Selection in … in     8882 311041 0.0286     0      0
##  6 The Expression of the Emotions in Man… the    8045 110414 0.0729     0      0
##  7 The Descent of Man, and Selection in … and    7854 311041 0.0253     0      0
##  8 The Descent of Man, and Selection in … to     5901 311041 0.0190     0      0
##  9 The Voyage of the Beagle               and    5768 208118 0.0277     0      0
## 10 The Voyage of the Beagle               a      5328 208118 0.0256     0      0
## # ℹ 35,569 more rows

Lets look at some terms with high tf-idf in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 35,579 × 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <dbl>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and … tears   126 1.14e-3 1.10  1.25e-3
##  2 The Expression of the Emotions in Man and … blush   114 1.03e-3 1.10  1.13e-3
##  3 The Descent of Man, and Selection in Relat… sexu…   745 2.40e-3 0.405 9.71e-4
##  4 The Descent of Man, and Selection in Relat… sele…   621 2.00e-3 0.405 8.10e-4
##  5 The Voyage of the Beagle                    sea     351 1.69e-3 0.405 6.84e-4
##  6 The Voyage of the Beagle                    degs    117 5.62e-4 1.10  6.18e-4
##  7 The Expression of the Emotions in Man and … eyeb…   149 1.35e-3 0.405 5.47e-4
##  8 The Voyage of the Beagle                    isla…   271 1.30e-3 0.405 5.28e-4
##  9 The Descent of Man, and Selection in Relat… shewn   143 4.60e-4 1.10  5.05e-4
## 10 The Expression of the Emotions in Man and … frown    46 4.17e-4 1.10  4.58e-4
## # ℹ 35,569 more rows

Lets look at a visualization for these high tf-idf words

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)