Final_Project

Now lets take a look at some gglot2 barplots

We’ll start with making a dataframe based on the tooth data.

df<- data.frame(dose = c("D0.5", "D1", "D2"), 
                len = c(4.2, 10, 29.5))

df

##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

And now lets make a second dataframe

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                           dose = rep(c("D0.5", "D1", "D2"), 2), 
                           len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2

##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Lets load up ggplot2

library(ggplot2)

Lets set our parameters for ggplot

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = -0.3)

Now lets add the labels inside the bars

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white") +
  scale_color_manual(values = c("blue", "gold", "red"))

This is kinda hard to see, so lets change the fill.

f + geom_col(aes(fill = dose)) + 
  scale_fill_manual(values = c("blue", "gold", "red"))

Ok how do we do this with multiple groups

ggplot(df2, aes(x = dose, y = len)) + 
  geom_col(aes(color = supp, fill = supp), position = position_stack()) + 
  scale_color_manual(values = c("blue", "gold")) + 
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) + 
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) + 
  scale_color_manual(values = c("blue", "gold")) + 
  scale_fill_manual(values = c("blue", "gold"))

Now lets add those labels to the dodged barplot

p + geom_text(
  aes(label = len, group = supp), 
  position = position_dodge(0.8), 
  vjust = -0.3, size = 3.5
)

Now what if we need to add labels to our stacked barplots? For this we need dplyr

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) - 0.5 * len)

df2

## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now lets recreate our stacked graphs

ggplot(df2, aes(x=dose, y = len)) +
  geom_col(aes(fill = supp), width = 0.7) + 
  geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
  scale_color_manual(values = c("blue", "gold")) + 
  scale_fill_manual(values =c("blue", "gold"))

Lets look at some boxplots

data("ToothGrowth")

Lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw() +
    theme(legend.position = "top")
)

Lets startt with a very basic boxplot with dose v length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") + 
  stat_summary(fun = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")

We can also change the scale number of variables included, and their order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))

## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

Lets put our x axis in descending order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

We can also change boxplot colors by groups

tg + geom_boxplot(aes(color = dose)) + 
  scale_color_manual(values = c("indianred", "blue1", "green2"))

What is we want to display our data subset by oj vs vitamin c?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) + 
  scale_fill_manual(values = c("#999999", "#E69F00"))

tg2

We can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 50), rnorm(200, 58))
)

head(wdata, 4)

##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430

Now lets load dplyr

library(dplyr)

mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set( 
  theme_classic() + 
    theme(legend.position = "bottom")
  )

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") + 
  geom_vline(aes(xintercept = mean(weight)), 
             linetype = "dashed", linewidth = 0.6)

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") + 
  scale_color_manual(values = c("#00AFBB", "#E7B800"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") + 
  scale_color_manual(values = c("#00AFBB", "#E7B800")) + 
  scale_fill_manual(values = c("indianred", "lightblue1"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to combine density plots and histograms?

a + geom_histogram(aes(y = after_stat(density)), 
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes( y = stat(density), color = sex), 
                   fill = "white", position = "identity") + 
  geom_density(aes(color = sex), size = 1) + 
  scale_color_manual(values = c("indianred", "lightblue1"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

First lets load the required packages

library(ggplot2)

Lets set our theme

theme_set(
  theme_dark() + 
    theme(legend.position = "top")
)

First lets initate a ggplot called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes (x = dose, y = len))

lets crete a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") + 
  stat_summary(fun = mean, fun.args = list(mult = 1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets add a boxplot and dotplot together

tg + geom_boxplot(width = 0.5) + 
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) + 
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") + 
  stat_summary(fun = mean, fun.args = list(mult = 1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) + 
  geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') + 
  scale_fill_manual(values = c("indianred", "lightblue1"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) + 
  geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center', 
                dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  scale_color_manual(values = c( "#00AFBB", "#E7B800"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Now lets change it up and look at some line plots

We’ll start by making a custom dataframe kinda like the tooth dataset. This way we can see the lines and stuff that we are modifying.

df <- data.frame(dose = c("D0.5", "D1", "D2"), 
                 len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3), 
                  dose = rep(c("D0.5,", "D1", "D2"), 2), 
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2

##   supp  dose  len
## 1   VC D0.5,  6.8
## 2   VC    D1 15.0
## 3   VC    D2 33.0
## 4   OJ D0.5,  4.2
## 5   OJ    D1 10.0
## 6   OJ    D2 29.5

Now lets again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
  
)

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim = c(0,6), xlim=c(0,0.7), axes = FALSE, xlab = "", ylab = "")
  for(i in 0:6) lines(c(0.3, 0.7), c(i,i), lty = i, lwd = 3)
text(rep(0.1,6), 0:6, labels = c("0.'Blank'", "1.'solid'", "2.'dashed'", "3.'dotted'", 
                                 "4.'dotdash'", "5.'longdash'", "6.'twodash'"))
par(mar=oldPar$mar, font=oldPar$font)
}

generateRLineTypes()

Now lets build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))

p + geom_line() + geom_point()

Now lets modify the line type and color

p + geom_line(linetype = "dashed", color = "steelblue") + 
  geom_point(color = "steelblue")

Now lets try a step graph, which indicates a threshold type progression

p + geom_step() + geom_point()

Now lets move on to making multiple groups. First we will create out ggplot object

p <- ggplot(df2, aes(x=dose, y=len, group = supp))

Now lets change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) + 
  geom_point(aes(shape = supp, color = supp)) +
  scale_color_manual(values = c("red", "blue"))

Now lets look at line plots with a numeric x axis

df3 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("0.5", "1", "2"), 2), 
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df3

##   supp dose  len
## 1   VC  0.5  6.8
## 2   VC    1 15.0
## 3   VC    2 33.0
## 4   OJ  0.5  4.2
## 5   OJ    1 10.0
## 6   OJ    2 29.5

Now lets plot where both axises are treated as continous labels

df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group= supp, color = supp)) + 
  geom_line() + geom_point()

Now lets look at a line graph with having the x axis as dates. We will use the built in economics time series for this example

head(economics)

## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018

ggplot(data = economics, aes(x = date, y = pop)) + 
  geom_line()

Now lets subset the data

ss <- subset(economics, date > as.Date("2006-1-1")) 
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

We can also change the line size, for instance, by another variable like unemployment

ggplot(data = economics, aes(x=date, y= pop)) +
  geom_line(aes(linewidth = unemploy/pop))

We can also plot multiple time-series data

ggplot(economics, aes(x = date)) + 
  geom_line(aes(y = psavert), color = "darkred") + 
  geom_line(aes(y=uempmed), color = "steelblue", linetype = "twodash")

Lastly, lets make this into an area plot

ggplot(economics, aes(x=date)) +
  geom_area(aes(y = psavert), fill = "#999999", 
       color = "#999999", alpha = 0.5) +
  geom_area(aes(y = uempmed), fill = "#e69F00", 
            color = "#E69F00", alpha = 0.5)

First lets load the required packages

library(ggplot2)
library(ggridges)

#BiocManager::install("ggridges")

Now lets load some sample data

?airquality

air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air

## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)

## Loading required package: viridisLite

ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) + 
  geom_density_ridges_gradient() +
  scale_fill_viridis(option = "C", name = "Temp")

## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data

library(tidyr)

airquality %>%
  gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) + 
  geom_density_ridges() + 
  facet_wrap(~ Measurement, scales = "free")

## Picking joint bandwidth of 11

## Picking joint bandwidth of 40.1

## Picking joint bandwidth of 2.65

## Picking joint bandwidth of 1.44

## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

A density plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 55), rnorm(200, 58))
  )

library(dplyr)
mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now lets load the graphing packages

library(ggplot2)
theme_set(
  theme_classic() + 
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object.

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

d + geom_density() + 
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now lets change the y axis to count instead of density

d + geom_density(aes(y = after_stat(count)), fill = "lightgray") + 
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

d + geom_density(aes(color = sex)) + 
  scale_color_manual(values = c("darkgray", "gold"))

Lastly, lets fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
  scale_color_manual(values = c("grey", "gold")) + 
  scale_fill_manual(values = c("grey", "gold"))

First lets load out required package

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Lets start with a scatter plot of the Orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference, 
      color = ~Tree, size = ~age, 
      text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference)
)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)

We’ll use the random numbers as lines on the graph

plot_ly(data = new_data,  x = ~age, y = ~circumference, color = ~Tree, size = ~age, 
      text = ~paste("Tree ID:", Tree, "<br>Circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line, and add labels.

plot_ly(data = Orange, x = ~age, y = ~circumference, 
      color = ~Tree, size = ~age, 
      text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference)) %>%
  add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "Plot with switchable trace", 
    updatemenus = list(
      list(
        type = 'downdrop',
        y = 0.8,
        buttons = list(
          list(method = 'restyle', 
               args = list('mode', 'markers'), 
               label = "Marker"
        ), 
        list(method = "restyle", 
             args = list('mode', 'lines'),
             label = "Lines"
        )
      )
    )
  )
)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

First lets load our required packages

library(plotly)

Now lets create a random 3D matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5), 
  y <- seq (1,10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now lets plot our 3D data

plot_ly(d, x= ~x, y = ~y, z = ~z) %>%
  add_surface()

Lets add some more aspects to it, such as topogrophy

plot_ly(d, x= ~x, y = ~y, z = ~z) %>%
  add_surface(
    contours = list(
      z = list( 
        show = TRUE,
        usecolormap = TRUE,
        highlightcolor = "FF0000", 
        project = list(z = TRUE)
        )
    )
)

Now lets look at a 3D scatterplot

plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

First lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() + 
    theme(legend.position = 'top')
)

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <- df %>%
  group_by(dose) %>%
  summarise( 
    sd = sd(len, na.rm = TRUE), 
    stderr = std.error(len, na.rm = TRUE),
    len = mean(len), 
    )

df.summary

## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Lets now look at some key functions

geom_corssbar() for hollow bars with middle indicated by a horizontal line
geom_errorbar() for error bars
geom_errorbarh() for horizontal error bars
geom_linerange() for drawing an interval represented by a vertical line
geom_pointrange() for creating an interval represented by a vertical line; with a point in the middle

lets start by creating a ggplot object

tg <- ggplot( 
  df.summary, 
  aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
  )

Now lets look at the most basic error bars

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2) + 
  geom_point(size = 1.5)

Now lets create horizontal error bars by manipulating our graph

ggplot(df.summary, aes(x=len, y=dose, xmin = len-sd, xmax = len + sd)) + 
  geom_point() +
  geom_errorbarh(height = 0.2)

This just gives you an idea of error bars on the horizontal axis

Now lets look at adding jitter points (actual measurements) to our data.

ggplot(df, aes(dose, len)) + 
  geom_jitter(position = position_jitter(0.2), color = "darkgray") + 
  geom_pointrange(aes(ymin = len - sd, ymax = len + sd), data = df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgray", trim = FALSE) + 
  geom_pointrange(aes(ymin = len - sd, ymax = len +sd), data = df.summary)

Now how about with a line graph?

ggplot(df.summary, aes(dose, len)) + 
  geom_line(aes(group = 1)) + # always specify this when you have 1 line 
  geom_errorbar(aes(ymin = len-stderr, ymax = len + stderr), width = 0.2) + 
  geom_point(size = 2)

Now lets make a bar graph with halve error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgrey", color = "black") + 
  geom_errorbar(aes(ymin = len, ymax = len + stderr), width = 0.2)

You can see that by not specifying wmin = len-stderr, we have in essence cut our error bar in half.

How about we add jitter points to line plots? We need to use the original dataframe for the jitter plot, and the summary df for the geom layers.

ggplot(df, aes(dose, len)) + 
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") + 
  geom_line(aes(group = 1), data = df.summary) + 
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr), 
    data = df.summary, width = 0.2) + 
  geom_point(data = df.summary, size = 0.2)

What about adding jitterpoints to a barplot?

ggplot(df, aes(dose, len)) + 
  geom_col(data = df.summary, fill = NA, color = "black") + 
  geom_jitter(position = position_jitter(0.3), color = "blue") + 
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr), 
                data = df.summary, width = 0.2)

What if we wanted to have our error bars per group? (OJ vs VC)

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise( 
    sd = sd(len), 
    stderr = std.error(len), 
    len = mean(len)
  )

## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.

df.summary2

## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len)) + 
  geom_pointrange(
    aes(ymin = len - stderr, ymax = len + stderr, color = supp), 
    position = position_dodge(0.3)) + 
  scale_color_manual(values = c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len)) + 
  geom_line(aes(linetype = supp, group = supp)) +
  geom_point() + 
  geom_errorbar(aes(ymin = len-stderr, ymax = len + stderr, group = supp), width = 0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len)) + 
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = len - sd, ymax = len + sd, group = supp), width = 0.2, position = position_dodge(0.8)) + 
  scale_fill_manual(values = c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color = supp)) + 
  geom_jitter(position = position_dodge(0.2)) +
  geom_line(aes(group = supp), data = df.summary2) +
  geom_point() + 
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), data = df.summary2, width = 0.2)

ggplot(df, aes(dose, len, color = supp)) + 
  geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white") + 
  geom_jitter(
    position =  position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr), data = df.summary2, 
    width = 0.2, position = position_dodge(0.8)) + 
  scale_color_manual(values = c("indianred", "lightblue")) + 
  theme(legend.position = "top")

Now lets do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below that threshold.

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 50), rnorm(200, 58)))

Now lets look at our dataframe

head(wdata, 5)

##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912

Now lets load our plotting package

library(ggplot2)

theme_set(
  theme_classic() + 
    theme(legend.position = "bottom")
)

Now lets create our ECDF Plot

ggplot(wdata, aes(x = weight)) + 
  stat_ecdf(aes(color = sex, linetype = sex), 
            geom = "step", linewidth = 1.5) + 
  scale_color_manual(values = c("#00AFBB", "#E7B900")) + 
  labs(y = "weight")

Now lets take a look at qq plots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some data

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 50), rnorm(200, 58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
  theme_classic() + 
    theme(legend.position = "top")
)

Create a qq plot of the weight

ggplot(wdata, aes(sample = weight)) + 
  stat_qq(aes(color = sex)) + 
  scale_color_manual(values = c("#0073C2FF", "#FC4E07")) + 
  labs(y = "weight")

#install.packages(ggpubr)
library(ggpubr)

ggqqplot(wdata, x = "weight", 
         color = "sex" ,
         palettes = c("#0073C2FF", "#FC4E07"), 
         ggtheme = theme_pubclean())

Now what would a non-normal distribution look like?

#install.packages(mnonr)

library(mnonr)

data2 <- mnonr::mnonr(n = 1000, p = 2, ms = 3, mk = 61, Sigma = matrix(c(1, 0.5, 0.5, 1), 2, 2), initial = NULL)

data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample = V1)) + 
  stat_qq()

ggqqplot(data2, x = "V1", 
         palette = "#0073C2FF", 
         ggtheme = theme_pubclean())

Lets look at how to put multiple plots togeyher into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw() + 
    theme(legend.position = "top")
  
)

First lets create a nice boxplot

lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x = dose, y = len)) + 
  geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) + 
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))

p

Now lets look at the gvgplot facet function

p + facet_grid(rows = vars(supp))

Now lets do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols = vars(supp))

Now lets look at the facet_wrap function. This allows facets to be places side-by-side

p + facet_wrap(vars(dose), ncol = 2)

Now how do we combine multiple plots using ggarrange()

Lets start by making some basic plots. First we will defind a color palette and data.

my3cols <- c("#E7B800","#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) + 
  scale_color_manual(values = my3cols)

Ok now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose), 
                       binaxis = 'y', stackdir = 'center') + 
  scale_color_manual(values = my3cols) + 
  scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x = date, y = psavert)) + 
  geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

figure

This looks great, but we can make it look even better

figure2 <- ggarrange( 
  lp, 
  ggarrange(bxp, dp, ncol = 2, labels =c("B", "C")), 
  nrow = 2, 
  labels = "A")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

figure2

ok this looks really good, but you’ll notice that there are two legends that are the same.

ggarrange(
  bxp, dp, labels = c("A", "B"), 
  common.legend = TRUE, legend = "bottom")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly, we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")

## file saved to facetfigure.pdf

We can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

lastly, we can export to a pdf with multiple pages and multiple columns

ggexport(bxp, dp, lp, filename = "test2.pdf", nrow = 2, ncol = 1)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Lets get started with heatmaps

#install.packages(heatmap3)

library(heatmap3)

Now lets get our data.

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

Now lets play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now lets apply our color selections

heatmap(data2, ColSideColors = cc)

library(RColorBrewer)
heatmap(data2, ColSideColors = cc, 
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Theres more that we can customize

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:plotrix':
## 
##     plotCI

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(data2, ColSideColors = cc, 
          col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Missing Values

If you encounter an unusual value in your dataset, and simply want to move on to the rest of your anaylsis, you have two options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

In this instance, y is the width of the diamond, so anything under 3 mm or above 20 is excluded

I dont recommend this option, just because there is one bad measurement doesn’t mean they are all bad

Instead, I recommend replacing the unusual values with missing values

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values shouldn’t pass silently into the night.

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) + 
  geom_point()

## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to supress that warning you can use na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) + 
  geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with missing values different to the observation with recorded values. For example, in the NYCflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled adn non-cancelled times.

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time), 
    sched_hour = sched_dep_time %/% 100, 
    sched_min = sched_dep_time %% 100, 
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) + 
  geom_freqpoly(mapping = aes(color = cancelled), bindwith = 1/4)

## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwith = 1/4):
## Ignoring unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to know what our outliers are?

First we need to load the required libraries.

library(outliers)
library(ggplot2)

download.file(
  url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip",
  destfile = "AirQualityUCI.zip"
)

And reload the dataset because we removed outliters.

library(readxl)

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Lets create a function using the grubb test to identify all outliers. The grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribition.

grubbs.flag <- function(x) {
  #lets create a variable called outliers and save nothing in it, we'll add to the variable 
  #as we identify them 
  outliers <- NULL
  # We'll create a variable called test to identify which univariate we are testing 
  test <- x
  # now using the outliers package, use grubbs.test to find outliers in our variable 
  grubbs.result <- grubbs.test(test)
  # lets get the p-values of all tested variables 
  pv <- grubbs.result$p.value
  # now lets search through our p-values for ones that are outside of 0.5
  while(pv < 0.05) {
    # anything with a pvalues greated than p = 0.05, we add to our empty outliers vector 
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ") [[1]][3]))
    # now we want to remove those outliers from our test vairable 
    test <- x[!x %in% outliers]
    # and run the grubbs test again without the outliers 
    grubbs.result <- grubbs.test(test)
    # and save the new p values
    pv <- grubbs.result$p.value 
  }
  return(data.frame(x = x, outliers = (x %in% outliers)))

}

CATEGORICAL VARIABLES

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) + 
  geom_freqpoly(mapping = aes(color = cut), bindwith = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwith = 500): Ignoring
## unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Its hard to see the difference in distribution because the counts differ so much.

ggplot(diamonds) + 
  geom_bar(mapping = aes(x = cut))

To make the comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + 
  geom_freqpoly(mapping = aes(color = cut), bindwith = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwith = 500): Ignoring
## unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe that because frequency polygons are a little hard to interpret.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribition of values.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + 
  geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, so we can more easily compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average!

Lets look at some car data

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can switch the axis and flip it 90 degrees.

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + 
  coord_flip()

To visualize the correlation between two continuous variables, we can use a scatter plot.

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price))

Scatterolots become less useful as the size of your dataset grown, because we get overplot. We can fix this using the alpha aesthetic.

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

First lets load a required library

library(RCurl)

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

College_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(College_Data)

## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabettically by college?

alphabetical <- College_Data %>%
  arrange(College_Data$college)

The glimpse package is another way to preview data

glimpse(College_Data)

## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset with the filter function

Louisiana_Cases <- filter(College_Data, state == "Louisiana")

Lets filter out a smaller amount of states

South_Cases <- filter(College_Data, state == "Louisana" | state == "Texas" | state == "Arkansas" | state ==  "Mississippi")

Lets look at some time series data

First we’ll load the required libraries

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)
library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

Lets create group_bu object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)

## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting

Days_since_first_reported <- tally(state_cases)

Lets visualize some data

First lets start off with some definitions

Data - obvious - the stuff we want to visualize

Layer - made of gemetric elements and requisite statistical information. Includes geometric objects with represents the plot

Scales = used to map values in teh data space that is used for the creation of values (color, size, shape, etc)

Coordinate system - descrines how the data coordinates are mapped together in relation to the plan on the graphic

Faceting - how to break up the data into subsets to display multiple types or groups of data

theme - controls the finer points of the display, such as the font size and background color

options(repr.plot.width = 6, rep.plot.height = 6)

class(College_Data)

## [1] "data.frame"

head(College_Data)

##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA

summary(College_Data)

##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Now lets take a look at a different dataset.

iris <- as.data.frame(iris)

class(iris)

## [1] "data.frame"

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

Lets start by creating a scatter plot of the College Data

ggplot(data=College_Data, aes(x = cases, y = cases_2021)) + 
  geom_point() + 
  theme_minimal()

## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) + 
  geom_point() +
  theme_minimal()

Lets color coordinate our college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) + 
  geom_point() +
  theme_minimal()

## Warning: Removed 337 rows containing missing values (`geom_point()`).

Lets color coordinate the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) + 
  geom_point() +
  theme_minimal()

Lets run a simple histogram of our Louisiana Case Data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", 
     main = "Total College Covid-19 Infections (Louisiana)")

Lets run a simple histogram for the Iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width",
     ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(bindwith = 100, color = "black", aes(fill = county)) + 
  xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 Cases in Louisiana")

## Warning in geom_histogram(bindwith = 100, color = "black", aes(fill = county)):
## Ignoring unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lets create a ggplot for the IRIS data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + geom_histogram(bindwith = 0.2, color = "black", aes(fill = Species)) + 
  xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")

## Warning in geom_histogram(bindwith = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Maybe a density plot makes more sense for our college data

ggplot(South_Cases) + 
  geom_density(aes(x = cases, fill = state), alpha = 0.25)

Lets do it with the iris data

ggplot(iris) + 
  geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

Lets look at violin plots for iris

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) + 
  geom_violin() +
  theme_classic() +
  theme(legend.position = "none")

Now lets try the south data

ggplot(data = South_Cases, aes (x = state, y = cases, color = state)) + 
  geom_violin() + 
  theme_gray() +
  theme(legend.position = "none")

Now lets take a look at residual plots. This is a graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner around the horizontal axis, it is appropriate to use a linear regression. If they are not randomly disperesed, a non linear model is more appropriate.

Lets start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) + 
  geom_point(aes(x =.fitted, y = .resid))

Now lets look at the southern state cases

ggplot(lm(cases ~ cases_2021, data = South_Cases)) + 
  geom_point(aes(x = .fitted, y = .resid))

A linear model is not a good call for the state cases

Now lets do some correlations 9:44

obesity <- read.csv("obesity_insurance.csv")

library(tidyr)
library(dplyr)

Lets look at the structure of the dataset

str(obesity)

## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

lets look at the column classes

class(obesity)

## [1] "data.frame"

and get a summary of distribution of the variables

summary(obesity)

##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now lets look at the distribution for insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now lets look at correlations. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or two data frames. The cov() command, on the otherhand examines the covariance. The cor.test() command carries out a test as to the significance of the correlation.

cor(obesity$charges, obesity$bmi)

## [1] 0.198341

This test uses a spearman Rho correlation, or you can use Kendall’s tau by specifying it

cor(obesity$charges, obesity$bmi, method = 'kendall')

## [1] 0.08252397

This correlation measures strength of a correlation between -1 and 1.

Now lets look at the Tietjen-Moore test. This is used for univariate datasets. The algorithm depicts the dectection of the outliers in a univariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n = length(dataSeries)
  ## compute the absolute residuals 
  r = abs(dataSeries - mean(dataSeries))
  ## sort data according to size of residual 
  df = data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  ## create a subset of the data without the largest values 
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  ## compute the sums of squares 
  ksub = (subdataSeries - mean(subdataSeries)) **2
  all = (df$dataSeries - mean(df$dataSeries)) **2
  ## compute the test statistic 
  sum(ksub)/sum(all)
  
}

This function helps to compute the absolute residuals and sorts data according to the size of the residuals. Later, we will focus on the computation of sum of squares

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  # Compute critical values based on simulation 
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obesity dataset for evaluating this algorithm.

The critical region for the Tietjen-Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and computing the Tietjen-Moore test statistic. Typically, 10,000 random samples are used. The values of the Tietjen-Moore statistic obtained from the data is compared to this reference distribution. The values of the test statistic is between zero and one. If there are no outliers in the data, the test statistic is close to one. If there are outliers the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regardless of which test statistic is used, Lk or Ek.

First we will look at charges

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 100)

## $T
## [1] 0.4556033
## 
## $Talpha
##       50% 
## 0.6350056

Lets check out bmi

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 7)

## $T
## [1] 0.9475628
## 
## $Talpha
##      50% 
## 0.950515

Probability Plots

library(ggplot2)
library(tigerstats)

## Loading required package: abd

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## Loading required package: lattice

## Loading required package: grid

## Loading required package: mosaic

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:scales':
## 
##     rescale

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:plotly':
## 
##     do

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probability plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine the probability of events. Or for example, you may wonder “what is the likelihood that a person has a BMI of exactly __? In this case, you would need to retrieve the density of the BMI distribution at values 140. The BMI distribution can be modeled with a mean of 100 and a standard deivation of 15. The corresponding densit is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Lets create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) + 
  geom_point()

This gives us the probability of every single point occuring.

Now lets use the pnorm funciton for more info.

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) + 
  geom_point()

What if we want to find the probability of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x, , mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)

## [1] "6.29 %"

pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.06287869

What about the probability that a BMI is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)), 2), "%")
}

pp_less(40)

## [1] "93.71 %"

pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.9371213

What if we want to find the area in between?

pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Lets use the qnorm function. We need to assume a normal distribution for this

What bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)

## [1] 16.4769

What if you want a random sampling of values within your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(50000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-Wilk Test

So now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])

## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

You can see here, with a small sample size, we would reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the rest by increasing the sample size

shapiro.test(obesity$charges[1:1000])

## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16

NOw lets check out age

shapiro.test(obesity$age[1:1000])

## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16

And lastly bmi

shapiro.test(obesity$bmi[1:1000])

## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

First lets load our packages (had to download air_data)

library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

library(readxl)

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Date - date of measurement time - time of measurement CO(GT) - average hourly CO2 PT08, s1(CO) - tin oxide hourly average sensor response NMHC - average hourly non-metallix hydrocaarbon concentration C6HC - average benzene concentration PT08.S3(NMHC) - titania average hourly sensor response NOx - average hourly NOx concentration NO2 - Average hourly NO2 concentration T - temperature RH = realtive humidity AH - Absolute humidity

str(Air_data)

## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...

library(tidyr)
library(dplyr)
library(lubridate)
library(hms)

## 
## Attaching package: 'hms'

## The following object is masked from 'package:lubridate':
## 
##     hms

library(ggplot2)

Lets get rid of the date in the time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)

## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…

plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

t.test(Air_data$RH, Air_data$AH)

## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

The Sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.

AFFIN bing nrc

bing categorizes words in a binary fashion into positive or negative nrc categorizes into positive, negative, anger, anticipation, digust, fear, joy, sadness, surproise and trust. AFFIN assigns a score between =-5 and 5, with negative indicating negative setiment, and 5 postive.

The function get_sentiments() allows us to get the specific sentiments lexicon with the measures for each one

# This line removes the prompt to choose 1 or 2 (yes or no) which will stop it from knitting. 
Sys.setenv(TEXTDATA_WARNING = "false")

install.packages("textdata")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(tidytext)
library(textdata)

afinn <- get_sentiments("afinn")

afinn

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

Still have to type 1

Lets look at bing

bing <- get_sentiments("bing")

bing

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

and lastly nrc

nrc <- get_sentiments("nrc")

nrc

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created either using crowdsourcing or cloud computing/ai like Amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing.

Lets look at the words with a joy source from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(c(3704, 24923, 2009, 2300), mirror = "https://www.gutenberg.org/dirs/")
# These are the Darwin Books 
## The voyage of the Beagle - 3704
## On the origin of species by the means of natural selection - 24923 
## The Variation of Animals and Plants Under Domestication, Vol. I - 2009 
## The descent of man, and selection in reltion to sex - 2300

tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
 dplyr:: mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books

## # A tibble: 922,891 × 4
##    gutenberg_id linenumber chapter word   
##           <int>      <int>   <int> <chr>  
##  1         2009          1       0 1228   
##  2         2009          1       0 1859   
##  3         2009          1       0 first  
##  4         2009          1       0 edition
##  5         2009          2       0 22764  
##  6         2009          2       0 1860   
##  7         2009          2       0 second 
##  8         2009          2       0 edition
##  9         2009          3       0 2009   
## 10         2009          3       0 1872   
## # ℹ 922,881 more rows

Lets add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 3704] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 24923] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2009] <- "The Variation of Animals and Plants Under Domestication"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"


tidy_books

## # A tibble: 922,891 × 4
##    book                                                 linenumber chapter word 
##    <chr>                                                     <int>   <int> <chr>
##  1 The Variation of Animals and Plants Under Domestica…          1       0 1228 
##  2 The Variation of Animals and Plants Under Domestica…          1       0 1859 
##  3 The Variation of Animals and Plants Under Domestica…          1       0 first
##  4 The Variation of Animals and Plants Under Domestica…          1       0 edit…
##  5 The Variation of Animals and Plants Under Domestica…          2       0 22764
##  6 The Variation of Animals and Plants Under Domestica…          2       0 1860 
##  7 The Variation of Animals and Plants Under Domestica…          2       0 seco…
##  8 The Variation of Animals and Plants Under Domestica…          2       0 edit…
##  9 The Variation of Animals and Plants Under Domestica…          3       0 2009 
## 10 The Variation of Animals and Plants Under Domestica…          3       0 1872 
## # ℹ 922,881 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)

## Joining with `by = join_by(word)`

## # A tibble: 276 × 2
##    word           n
##    <chr>      <int>
##  1 found        305
##  2 good         165
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          91
##  7 present       85
##  8 food          81
##  9 elevation     71
## 10 beautiful     62
## # ℹ 266 more rows

We can also examine how sentiment changes throughout a work.

library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

Now lets plot it

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 2, scales = "free_x")

Lets compare the three sentiment dictions

There are several options for setiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all three for our dictionaries and examine how the sentiment changes across the arc of TVOTB.

library(tidyr)

voyage <- tidy_books %>%
  filter(book == "The Voyage of the Beagle")

voyage

## # A tibble: 215,982 × 4
##    book                     linenumber chapter word        
##    <chr>                         <int>   <int> <chr>       
##  1 The Voyage of the Beagle          1       0 a           
##  2 The Voyage of the Beagle          1       0 naturalist’s
##  3 The Voyage of the Beagle          1       0 voyage      
##  4 The Voyage of the Beagle          2       0 round       
##  5 The Voyage of the Beagle          2       0 the         
##  6 The Voyage of the Beagle          2       0 world       
##  7 The Voyage of the Beagle          4       0 first       
##  8 The Voyage of the Beagle          4       0 edition     
##  9 The Voyage of the Beagle          4       0 may         
## 10 The Voyage of the Beagle          4       0 1860        
## # ℹ 215,972 more rows

Lets again use integer division (‘%/%’) to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to dind the net sentiment in each of these secitions of text.

affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>% 
    mutate(method = "Bing et al."), 
  voyage %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "netagive"))
               ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, 
              values_from = n, 
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexicon (dictionary). Lets bind them all together and visualize with ggplot.

bind_rows(affin, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~method, ncol = 1, scales = "free_y")

Lets look at the counts based on each dictionary

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

get_sentiments("bing") %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

## Joining with `by = join_by(word)`

bing_word_counts

## # A tibble: 2,391 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive   1452
##  2 well       positive   1057
##  3 like       positive   1005
##  4 wild       negative    939
##  5 good       positive    554
##  6 doubt      negative    450
##  7 remarkable positive    436
##  8 respect    positive    425
##  9 variety    positive    387
## 10 important  positive    367
## # ℹ 2,381 more rows

This can be shown visually, adn we can pipe straight into ggplot2

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~sentiment, scale = "free_y") + 
  labs(x = "Contribution to Sentiment", y = NULL)

Lets spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)

custom_stop_words

## # A tibble: 1,153 × 2
##    word      lexicon
##    <chr>     <chr>  
##  1 wild      custom 
##  2 dark      custom 
##  3 great     custom 
##  4 like      custom 
##  5 a         SMART  
##  6 a's       SMART  
##  7 able      SMART  
##  8 about     SMART  
##  9 above     SMART  
## 10 according SMART  
## # ℹ 1,143 more rows

So far we’ve only looked at single words, but manu interesting (more accurate) analyses are based on the realtionship between words.

Lets look at some methods of tidytext for calculating and visualizing relationships.

library(dplyr)
library(tidytext)
library(gutenbergr)
library(ggplot2)

darwin_books <- darwin <- gutenberg_download(c(3704, 24923, 2009, 2300), mirror = "https://www.gutenberg.org/dirs/")

colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 3704] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 24923] <- "On the Origin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2009] <- "The Variation of Animals and Plants Under Domestication"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"


darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) 

darwin_bigrams

## # A tibble: 849,935 × 2
##    book                                                    bigram            
##    <chr>                                                   <chr>             
##  1 The Variation of Animals and Plants Under Domestication 1228 1859         
##  2 The Variation of Animals and Plants Under Domestication 1859 first        
##  3 The Variation of Animals and Plants Under Domestication first edition     
##  4 The Variation of Animals and Plants Under Domestication 22764 1860        
##  5 The Variation of Animals and Plants Under Domestication 1860 second       
##  6 The Variation of Animals and Plants Under Domestication second edition    
##  7 The Variation of Animals and Plants Under Domestication 2009 1872         
##  8 The Variation of Animals and Plants Under Domestication 1872 sixth        
##  9 The Variation of Animals and Plants Under Domestication sixth edition     
## 10 The Variation of Animals and Plants Under Domestication edition considered
## # ℹ 849,925 more rows

This data is still in tidytext format, and its structured as one-token-per-row. Each token in a bigram.

Counting and filtering n-gram

darwin_bigrams %>%
  count(bigram, sort = TRUE)

## # A tibble: 268,627 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the   13322
##  2 <NA>     11160
##  3 in the    6661
##  4 on the    4563
##  5 to the    3288
##  6 the same  2405
##  7 that the  2252
##  8 it is     2100
##  9 from the  1887
## 10 and the   1852
## # ℹ 268,617 more rows

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s seperate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

New bigram count

bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_counts

## # A tibble: 122,329 × 2
##    book                                                    bigram            
##    <chr>                                                   <chr>             
##  1 The Variation of Animals and Plants Under Domestication 1228 1859         
##  2 The Variation of Animals and Plants Under Domestication 22764 1860        
##  3 The Variation of Animals and Plants Under Domestication 2009 1872         
##  4 The Variation of Animals and Plants Under Domestication 1872 sixth        
##  5 The Variation of Animals and Plants Under Domestication sixth edition     
##  6 The Variation of Animals and Plants Under Domestication edition considered
##  7 The Variation of Animals and Plants Under Domestication definitive edition
##  8 The Variation of Animals and Plants Under Domestication NA NA             
##  9 The Variation of Animals and Plants Under Domestication NA NA             
## 10 The Variation of Animals and Plants Under Domestication NA NA             
## # ℹ 122,319 more rows

We may also be interested in trigrams, which are three word combos

trigrams <- darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,  
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

trigrams

## # A tibble: 27,658 × 4
##    word1     word2  word3          n
##    <chr>     <chr>  <chr>      <int>
##  1 <NA>      <NA>   <NA>       12514
##  2 tierra    del    fuego        108
##  3 secondary sexual characters    93
##  4 captain   fitz   roy           46
##  5 hort      soc    vol           43
##  6 proc      zoolog soc           43
##  7 closely   allied species       40
##  8 wild      rock   pigeon        36
##  9 alph      de     candolle      31
## 10 lop       eared  rabbits       31
## # ℹ 27,648 more rows

Lets anaylze some biograms

bigrams_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)

## # A tibble: 60 × 3
##    book                                                    word1           n
##    <chr>                                                   <chr>       <int>
##  1 The Variation of Animals and Plants Under Domestication natural       342
##  2 The Descent of Man, and Selection in Relation to Sex    sexual        254
##  3 The Descent of Man, and Selection in Relation to Sex    natural       156
##  4 On the Origin of Species By Means of Natural Selection  natural        21
##  5 On the Origin of Species By Means of Natural Selection  continued      19
##  6 The Variation of Animals and Plants Under Domestication sexual         15
##  7 On the Origin of Species By Means of Natural Selection  unconscious    13
##  8 The Variation of Animals and Plants Under Domestication unconscious     8
##  9 On the Origin of Species By Means of Natural Selection  methodical      7
## 10 The Variation of Animals and Plants Under Domestication continued       7
## # ℹ 50 more rows

Lets again look at tf-idf across bigrams across Darwins works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf

## # A tibble: 76,358 × 6
##    book                                       bigram     n      tf   idf  tf_idf
##    <chr>                                      <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 On the Origin of Species By Means of Natu… rock …   201 0.00659 0.693 0.00457
##  2 The Variation of Animals and Plants Under… natur…   342 0.0153  0.288 0.00440
##  3 The Descent of Man, and Selection in Rela… sexua…   254 0.00580 0.693 0.00402
##  4 On the Origin of Species By Means of Natu… bud v…    76 0.00249 1.39  0.00345
##  5 The Voyage of the Beagle                   capta…    56 0.00218 1.39  0.00302
##  6 On the Origin of Species By Means of Natu… _g ba…    61 0.00200 1.39  0.00277
##  7 On the Origin of Species By Means of Natu… lop e…    61 0.00200 1.39  0.00277
##  8 The Voyage of the Beagle                   bahia…    47 0.00183 1.39  0.00253
##  9 On the Origin of Species By Means of Natu… hort …    55 0.00180 1.39  0.00250
## 10 The Variation of Animals and Plants Under… glaci…    39 0.00175 1.39  0.00242
## # ℹ 76,348 more rows

bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 2, scales = "free") + 
  labs(x = "tf-idf of bigrams", y = NULL)

using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

## # A tibble: 902 × 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      140
##  2 not   only    122
##  3 not   have    121
##  4 not   to      118
##  5 not   been    112
##  6 not   a       108
##  7 not   the     104
##  8 not   in       69
##  9 not   at       68
## 10 not   so       68
## # ℹ 892 more rows

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.

# This line removes the prompt to choose 1 or 2 (yes or no) which will stop it from knitting. 
Sys.setenv(TEXTDATA_WARNING = "false")

install.packages("textdata")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(textdata)

AFINN <- get_sentiments("afinn")


AFINN

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

We can examine the most frequent words that were preceded by “not”, and associate with sentiment.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE) 

not_words

## # A tibble: 103 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 like          2    13
##  2 doubt        -1    12
##  3 pretend      -1    11
##  4 wish          1    10
##  5 reach         1     9
##  6 increased     1     8
##  7 admit        -1     7
##  8 forget       -1     7
##  9 perfectly     3     7
## 10 deny         -2     6
## # ℹ 93 more rows

Lets visualize

library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0 )) + 
  geom_col(show.legend = FALSE) + 
  labs(x = "Sentiment value * number or occurences", y = "words preceded by \"not\"")

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE) 

negated_words

## # A tibble: 195 × 4
##    word1   word2     value     n
##    <chr>   <chr>     <dbl> <int>
##  1 no      doubt        -1   239
##  2 no      great         3    22
##  3 not     like          2    13
##  4 not     doubt        -1    12
##  5 not     pretend      -1    11
##  6 not     wish          1    10
##  7 no      good          3     9
##  8 not     reach         1     9
##  9 not     increased     1     8
## 10 without doubt        -1     8
## # ℹ 185 more rows

Lets visualize the negation words

negated_words %>%
  mutate(contribution = n * value, 
         word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value >0)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~ word1, scales = "free") + 
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("words preceded by negation term") +
  ylab("Sentiment value * $ of occurences") + 
  coord_flip()

Visualize a network of bigrams with ggraph

library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:mosaic':
## 
##     compare

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following object is masked from 'package:plotly':
## 
##     groups

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()

## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"

bigram_graph

## IGRAPH d3dba10 DN-- 291 219 -- 
## + attr: name (v/c), n (e/n)
## + edges from d3dba10 (vertex names):
##  [1] NA       ->NA          natural  ->selection   sexual   ->selection  
##  [4] vol      ->ii          rock     ->pigeon      distinct ->species    
##  [7] closely  ->allied      south    ->america     sexual   ->differences
## [10] lower    ->animals     secondary->sexual      tail     ->feathers   
## [13] breeding ->season      del      ->fuego       sexual   ->characters 
## [16] tierra   ->del         vol      ->iii         north    ->america    
## [19] fresh    ->water       allied   ->species     natural  ->history    
## [22] strongly ->marked      wing     ->feathers    bud      ->variation  
## + ... omitted several edges

library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") + 
  geom_edge_link() + 
  geom_node_point() + 
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also add directionality to this network

set.seed(1234)

a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))

ggraph(bigram_graph, layout = "fr") + 
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, 
                 arrow = a, end_cap = circle(0.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 3) + 
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) + 
  theme_void()

A central question in text mining is how to quantify what a document is about. We can do this by looking at words that make up the document, and measuring term frequency.

There are a lot of works that may not be important, these are the stop words.

One way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly used words adn increases teh weight for words that are not used very much.

Term frequency in Darwins works.

library(dplyr)
library(tidytext)
library(gutenbergr)


book_words <- gutenberg_download(c(3704, 24923, 2009, 2300), mirror = "https://www.gutenberg.org/dirs/")

colnames(book_words)[1] <- "book"

book_words$book[book_words$book == 3704] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 24923] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2009] <- "The Variation of Animals and Plants Under Domestication"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

Now lets disect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

book_words <- book_words %>%
  dplyr::rename(word_count = n) 

book_words <- book_words %>%
  group_by(book) %>%
  dplyr::mutate(total = base::sum(word_count)) %>%
  ungroup()

book_words <- book_words %>%
  dplyr::mutate(tf = word_count / total)

head(book_words)

## # A tibble: 6 × 5
##   book                                            word  word_count  total     tf
##   <chr>                                           <chr>      <int>  <int>  <dbl>
## 1 The Descent of Man, and Selection in Relation … the        25490 311041 0.0820
## 2 The Voyage of the Beagle                        the        17080 215982 0.0791
## 3 The Descent of Man, and Selection in Relation … of         16762 311041 0.0539
## 4 The Variation of Animals and Plants Under Dome… the        14570 210294 0.0693
## 5 On the Origin of Species By Means of Natural S… the        13518 185574 0.0728
## 6 The Variation of Animals and Plants Under Dome… of         10440 210294 0.0496

head(book_words)

## # A tibble: 6 × 5
##   book                                            word  word_count  total     tf
##   <chr>                                           <chr>      <int>  <int>  <dbl>
## 1 The Descent of Man, and Selection in Relation … the        25490 311041 0.0820
## 2 The Voyage of the Beagle                        the        17080 215982 0.0791
## 3 The Descent of Man, and Selection in Relation … of         16762 311041 0.0539
## 4 The Variation of Animals and Plants Under Dome… the        14570 210294 0.0693
## 5 On the Origin of Species By Means of Natural S… the        13518 185574 0.0728
## 6 The Variation of Animals and Plants Under Dome… of         10440 210294 0.0496

book_words$word_count <- as.numeric(book_words$word_count)

total_words <- book_words %>%
  group_by(book) %>%
  mutate(total = sum(word_count)) %>%
  ungroup()

book_words

## # A tibble: 48,604 × 5
##    book                                           word  word_count  total     tf
##    <chr>                                          <chr>      <dbl>  <int>  <dbl>
##  1 The Descent of Man, and Selection in Relation… the        25490 311041 0.0820
##  2 The Voyage of the Beagle                       the        17080 215982 0.0791
##  3 The Descent of Man, and Selection in Relation… of         16762 311041 0.0539
##  4 The Variation of Animals and Plants Under Dom… the        14570 210294 0.0693
##  5 On the Origin of Species By Means of Natural … the        13518 185574 0.0728
##  6 The Variation of Animals and Plants Under Dom… of         10440 210294 0.0496
##  7 The Voyage of the Beagle                       of         10041 215982 0.0465
##  8 The Descent of Man, and Selection in Relation… in          8882 311041 0.0286
##  9 The Descent of Man, and Selection in Relation… and         7854 311041 0.0253
## 10 On the Origin of Species By Means of Natural … of          7769 185574 0.0419
## # ℹ 48,594 more rows

total_words <- book_words %>%
  ungroup() %>%
  group_by(book) %>%
  summarize(total = sum(word_count), .groups = "drop")

colnames(total_words)

## [1] "book"  "total"

book_words

## # A tibble: 48,604 × 5
##    book                                           word  word_count  total     tf
##    <chr>                                          <chr>      <dbl>  <int>  <dbl>
##  1 The Descent of Man, and Selection in Relation… the        25490 311041 0.0820
##  2 The Voyage of the Beagle                       the        17080 215982 0.0791
##  3 The Descent of Man, and Selection in Relation… of         16762 311041 0.0539
##  4 The Variation of Animals and Plants Under Dom… the        14570 210294 0.0693
##  5 On the Origin of Species By Means of Natural … the        13518 185574 0.0728
##  6 The Variation of Animals and Plants Under Dom… of         10440 210294 0.0496
##  7 The Voyage of the Beagle                       of         10041 215982 0.0465
##  8 The Descent of Man, and Selection in Relation… in          8882 311041 0.0286
##  9 The Descent of Man, and Selection in Relation… and         7854 311041 0.0253
## 10 On the Origin of Species By Means of Natural … of          7769 185574 0.0419
## # ℹ 48,594 more rows

You can see that the usual suspects are the most common words, but don’t tell us anything about what the books topic is.

library(ggplot2)

ggplot(book_words, aes(word_count/total, fill = book)) + 
  geom_histogram(show.legend = FALSE) + 
  xlim(NA, 0.0009) + 
  facet_wrap(~book, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 522 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 4 rows containing missing values (`geom_bar()`).

Zipf’s Law

The frequency that a word appears is inversely proportional to its rank when predicting a topic.

Lets apply Zipf’s Law to Darwin’s work.

freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(), 
         'term frequency' = word_count/total) %>%
  ungroup()

freq_by_rank

## # A tibble: 48,604 × 7
##    book                    word  word_count  total     tf  rank `term frequency`
##    <chr>                   <chr>      <dbl>  <int>  <dbl> <int>            <dbl>
##  1 The Descent of Man, an… the        25490 311041 0.0820     1           0.0820
##  2 The Voyage of the Beag… the        17080 215982 0.0791     1           0.0791
##  3 The Descent of Man, an… of         16762 311041 0.0539     2           0.0539
##  4 The Variation of Anima… the        14570 210294 0.0693     1           0.0693
##  5 On the Origin of Speci… the        13518 185574 0.0728     1           0.0728
##  6 The Variation of Anima… of         10440 210294 0.0496     2           0.0496
##  7 The Voyage of the Beag… of         10041 215982 0.0465     2           0.0465
##  8 The Descent of Man, an… in          8882 311041 0.0286     3           0.0286
##  9 The Descent of Man, an… and         7854 311041 0.0253     4           0.0253
## 10 On the Origin of Speci… of          7769 185574 0.0419     2           0.0419
## # ℹ 48,594 more rows

freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() + 
  scale_y_log10()

Let is TF - IDF to find words for each document by decreasing teh weight for commonly used words adn increasing the weight for words that are very much in a collection of documents.

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, word_count)

book_tf_idf

## # A tibble: 48,604 × 7
##    book                              word  word_count  total     tf   idf tf_idf
##    <chr>                             <chr>      <dbl>  <int>  <dbl> <dbl>  <dbl>
##  1 The Descent of Man, and Selectio… the        25490 311041 0.0820     0      0
##  2 The Voyage of the Beagle          the        17080 215982 0.0791     0      0
##  3 The Descent of Man, and Selectio… of         16762 311041 0.0539     0      0
##  4 The Variation of Animals and Pla… the        14570 210294 0.0693     0      0
##  5 On the Origin of Species By Mean… the        13518 185574 0.0728     0      0
##  6 The Variation of Animals and Pla… of         10440 210294 0.0496     0      0
##  7 The Voyage of the Beagle          of         10041 215982 0.0465     0      0
##  8 The Descent of Man, and Selectio… in          8882 311041 0.0286     0      0
##  9 The Descent of Man, and Selectio… and         7854 311041 0.0253     0      0
## 10 On the Origin of Species By Mean… of          7769 185574 0.0419     0      0
## # ℹ 48,594 more rows

Lets look at terms with high tf-idf in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 48,604 × 6
##    book                                   word  word_count      tf   idf  tf_idf
##    <chr>                                  <chr>      <dbl>   <dbl> <dbl>   <dbl>
##  1 On the Origin of Species By Means of … _c           129 6.95e-4 1.39  9.64e-4
##  2 On the Origin of Species By Means of … gard…        103 5.55e-4 1.39  7.69e-4
##  3 The Variation of Animals and Plants U… sele…        561 2.67e-3 0.288 7.67e-4
##  4 On the Origin of Species By Means of … _see_        202 1.09e-3 0.693 7.55e-4
##  5 The Descent of Man, and Selection in … sexu…        745 2.40e-3 0.288 6.89e-4
##  6 The Descent of Man, and Selection in … shewn        143 4.60e-4 1.39  6.37e-4
##  7 On the Origin of Species By Means of … _g            85 4.58e-4 1.39  6.35e-4
##  8 On the Origin of Species By Means of … bank…         81 4.36e-4 1.39  6.05e-4
##  9 The Descent of Man, and Selection in … sele…        621 2.00e-3 0.288 5.74e-4
## 10 The Descent of Man, and Selection in … 1869         128 4.12e-4 1.39  5.70e-4
## # ℹ 48,594 more rows

Lets look at a visulazation for these high tf-idf words

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ igraph::%--%()          masks lubridate::%--%()
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ nlme::collapse()        masks dplyr::collapse()
## ✖ gridExtra::combine()    masks dplyr::combine()
## ✖ RCurl::complete()       masks tidyr::complete()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ mosaic::count()         masks dplyr::count()
## ✖ purrr::cross()          masks mosaic::cross()
## ✖ igraph::crossing()      masks tidyr::crossing()
## ✖ purrr::discard()        masks scales::discard()
## ✖ mosaic::do()            masks plotly::do(), dplyr::do()
## ✖ Matrix::expand()        masks tidyr::expand()
## ✖ plotly::filter()        masks dplyr::filter(), stats::filter()
## ✖ hms::hms()              masks lubridate::hms()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ Matrix::pack()          masks tidyr::pack()
## ✖ purrr::simplify()       masks igraph::simplify()
## ✖ mosaic::stat()          masks ggplot2::stat()
## ✖ mosaic::tally()         masks dplyr::tally()
## ✖ Matrix::unpack()        masks tidyr::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) + 
  geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = "free") + 
  labs(x = "tf-idf", y = NULL)

Final_Project

Molly Brown

2026-02-27