Final Project Data Products

Now let’s take a look at some ggplot2 barplots

We’ll start with making a data frame based on the tooth data

df <- data.frame (dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

And now let’s make a second dataframe

df2 <- data.frame(supp= rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Let’s load up ggplot2

library(ggplot2)

Let’s set our theme parameters for ggplot

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Let’s start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now let’s change the fill and add labels to the top

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = -0.3)

Now let’s add labels inside the bars

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now let’s change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white") +
  scale_color_manual(values = c("blue", "gold", "red" ))

This is kind of hard to see, so let’s change the fill

f + geom_col(aes(fill = dose)) +
  scale_fill_manual(values = c("blue", "gold", "red"))

Now how do we do this with multiple groups?

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_stack()) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))
p

Now let’s add those labels to the dodged barplot

p + geom_text(
  aes(label = len, group = supp),
  position = position_dodge(0.8),
  vjust = -0.3, size = 3.5
)

Now what if we want to add labels to our stacked barplots? For this we need dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now let us recreate our stacked graphs

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(fill = supp), width = 0.7) +
  geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

Let’s look at some boxplots

data("ToothGrowth")

Let’s change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Let’s load ggplot

library(ggplot2)

Let’s set the theme for our plots to theme_bw

theme_set(
  theme_bw() +
    theme(legend.position = "top")
)

Let’s start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now let’s look at a boxplot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") +
  stat_summary(fun.y = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change which categories are included on the x-axis scale and their order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

Let’s put our x-axis in descending order

tg +geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

We can also change boxplot colors by group

tg + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = c("indianred", "blue1", "green2"))

What if we want to display our data subset by OJ vs Vitamin C?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#999999", "#E69F00"))

tg2

We can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

Histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 50), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430

Now let’s load dplyr

library(dplyr)

mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now let’s load the plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now let’s create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") +
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Now let’s change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("indianred", "lightblue"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to combine density plots and histograms?

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")
## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex), 
                   fill = "white", position = "identity") +
  geom_density(aes(color = sex), size = 1) +
  scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

First let’s load the required packages

library(ggplot2)

Let’s set the theme

theme_set(
  theme_dark() +
    theme(legend.position = "top")
)

First let’s initiate a ggplot object called tg

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))

Let’s create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
  stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
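
The warning above appears because stat_summary()'s default geom is a pointrange, but fun = mean only supplies the midpoint, not ymin/ymax. A sketch that supplies the range explicitly (here, the mean plus or minus one standard deviation; the color is just for visibility) avoids the warning:

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
  stat_summary(fun = mean,
               fun.min = function(x) mean(x) - sd(x),
               fun.max = function(x) mean(x) + sd(x),
               color = "indianred")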

Let’s add a boxplot and a dotplot together

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) +
  geom_dotplot(binaxis= "y", stackdir = "center", fill = "#999999") +
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Let’s create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') +
  scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center',
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  scale_color_manual(values = c("#00afbb", "#e7b800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Now let’s change it up and look at line plots

We will start by making a custom data frame modeled on the tooth data set. This way we can clearly see the lines and points that we are modifying

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

Now let’s create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now let’s again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
  
)

Now let’s do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim =c(0,6), xlim=c(0,0.7), axes = FALSE, xlab = "", ylab = "")
  for(i in 0:6) lines(c(0.3, 0.7), c(i,i), lty=i, lwd = 3)
  text(rep(0.1, 7), 0:6, labels = c("0.'blank'", "1.'solid'", "2.'dashed'", "3.'dotted'", 
                                    "4.'dotdash'", "5.'longdash'", "6.'twodash'"))
  par(mar=oldPar$mar, font=oldPar$font)
}

generateRLineTypes()

Now let’s build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))

p + geom_line() + geom_point()

Now let’s modify the line type and color

p + geom_line(linetype = "dashed", color = "steelblue") +
  geom_point(color = "steelblue")

Now let’s try a step graph, which indicates a threshold type progression

p + geom_step() + geom_point()

Now let’s move on to making multiple groups. First we’ll create our ggplot object

p <- ggplot(df2, aes(x=dose, y=len, group=supp))

Now let’s change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) +
  geom_point(aes(shape = supp, color = supp)) +
  scale_color_manual(values = c("red", "blue"))

Now let’s look at line plots with a numeric x axis

df3 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("0.5", "1", "2"), 2), 
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))

df3
##   supp dose  len
## 1   VC  0.5  6.8
## 2   VC    1 15.0
## 3   VC    2 33.0
## 4   OJ  0.5  4.2
## 5   OJ    1 10.0
## 6   OJ    2 29.5

Now let’s plot where both axes are treated as continuous variables

df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group = supp, color = supp)) +
  geom_line() + geom_point()

Now let’s look at a line graph with dates on the x axis. We will use the built-in economics time series for this example.

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data = economics, aes(x = date, y = pop)) +
  geom_line()

Now let’s subset the data

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

We can also change the line size, for instance by another variable like unemployment

ggplot(data = economics, aes(x=date, y= pop)) +
  geom_line(aes(size = unemploy/pop))

We can also plot multiple time-series data

ggplot(economics, aes(x = date)) +
  geom_line(aes(y=psavert), color = "darkred") +
  geom_line(aes(y = uempmed), color = "steelblue", linetype = "twodash")

Lastly, let’s make this into an area plot

ggplot(economics, aes(x=date)) +
  geom_area(aes(y=psavert), fill = "#999999",
            color= "#999999", alpha = 0.5) +
  geom_area(aes(y = uempmed), fill = "#E69f00", 
            color= "#E69f00", alpha = 0.5)

First let’s load the required packages

library(ggplot2)
library(ggridges)

#BiocManager::install("ggridges")

Now let’s load some sample data

?airquality 
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air 
## Picking joint bandwidth of 2.65

Now let us add some extra things to our graph

library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = after_stat(x)) +
  geom_density_ridges_gradient() +
  scale_fill_viridis(option = "C", name = "Temp")
## Picking joint bandwidth of 2.65

The last thing we will do is create a facet plot for all our data

library(tidyr)

airquality %>%
  gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) +
  geom_density_ridges() +
  facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

A density plot is an alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now let’s load the graphing packages

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "right")
)

Now let’s do the basic plot functions. First, we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now let’s do a basic density plot

d + geom_density() +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now let’s change the y axis to count instead of density

d + geom_density(aes(y = stat(count)), fill = "lightgrey") +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

d + geom_density(aes(color = sex)) +
  scale_color_manual(values = c("darkgrey", "gold"))

Lastly, let’s fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
  scale_color_manual(values = c("grey", "gold")) +
  scale_fill_manual(values = c("grey", "gold"))

First let’s load our required package

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Let’s start with a scatter plot of the Orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now let’s add some more information

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~age,
        text = ~paste("Tree ID", Tree, "<br>Age:", age, "Circ:", circumference)
)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now let’s create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)

We’ll use the random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now let’s create a graph with a dropdown menu for switching the trace between markers and lines, and add labels

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~circumference,
        text = ~paste("TreeID:", Tree, "<br>Age:", age, "Circ:", circumference)) %>%
  add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "Plot or Orange data with switchable trace",
    updatemenus = list(
      list(
        type = 'dropdown',
        y = 0.8,
        buttons = list(
          list(method = 'restyle',
               args = list('mode', 'markers'),
               label = "Marker"
        ),
        list(method = "restyle",
             args = list('mode', 'lines'),
             label = "Lines"
         )
       )
     )
   )
 )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

First let’s load the required packages

library(plotly)

Now let’s create a random 3D matrix

d <- data.frame(
  x = seq(1, 10, by = 0.5),
  y = seq(1, 10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now let’s plot our 3D data

plot_ly(d, x=~x, y = ~y, z = ~z) %>%
  add_surface()

Let’s add some more aspects to it, such as contour lines that give it a topographic look

plot_ly(d, x = ~x, y = ~y, z = ~z) %>%
  add_surface(
    contours = list(
      z = list(
        show = TRUE,
        usecolormap = TRUE,
        highlightcolor = "FF0000",
        project = list(z = TRUE)
      )
    )
  )

Now let’s look at a 3D scatter plot

plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

First let’s load our required packages

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position = 'top')
)

Let’s again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now let’s use dplyr for manipulation purposes

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = std.error(len, na.rm = TRUE),
    len = mean(len),
  )

df.summary
## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Let’s now look at some key functions

Let’s start by creating a ggplot object

tg <- ggplot(
  df.summary, 
  aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
)

Now let’s look at the most basic error bars

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2) +
  geom_point(size = 1.5)

Now let’s create horizontal error bars by manipulating our graph

ggplot(df.summary, aes(x=len, y=dose, xmin= len-sd, xmax = len+sd)) +
  geom_point() +
  geom_errorbarh(height = 0.2)

This gives an idea of error bars on the horizontal axis

Now let’s look at adding jitter points (actual measurements) to our data

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)

Now let’s try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgrey", trim = FALSE) +
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)

Now what about a line graph?

ggplot(df.summary, aes(dose, len)) +
  geom_line(aes(group = 1)) + # always specify this when you have 1 line
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr), width = 0.2) +
  geom_point(size = 2)

Now let’s make a bar graph with half error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgrey", color = "black") +
  geom_errorbar(aes(ymin = len, ymax= len+stderr), width = 0.2)

You can see that by not specifying ymin = len - stderr, we have, in essence, cut our error bars in half

How about we add jitter points to line plots? We need to use the original data frame for the jitter layer, and the summary df for the other geom layers

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_line(aes(group = 1), data = df.summary) +
  geom_errorbar(
    aes(ymin= len- stderr, ymax = len + stderr), 
    data = df.summary, width = 0.2) +
  geom_point(data = df.summary, size = 0.2)

What about adding jitter points to a barplot?

ggplot(df, aes(dose, len)) +
  geom_col(data = df.summary, fill = NA, color = "black") +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr),
                data = df.summary, width = 0.2)

What if we wanted to have our error bars per group?

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise(
    sd = sd(len),
    stderr = std.error(len),
    len = mean(len)
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have a mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len)) +
  geom_pointrange(
    aes(ymin = len-stderr, ymax = len + stderr, color = supp),
    position = position_dodge(0.3)) +
  scale_color_manual(values = c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len)) +
  geom_line(aes(linetype = supp, group = supp)) +
  geom_point() +
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr, group = supp), width = 0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len)) + 
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = len-sd, ymax = len+sd, group = supp),
    width = 0.2, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("indianred", "lightblue"))

Now let’s add some jitter points

ggplot(df, aes(dose, len, color = supp)) +
  geom_jitter(position = position_dodge(0.2)) +
  geom_line(aes(group = supp), data = df.summary2) +
  geom_point() +
  geom_errorbar(aes(ymin = len-stderr, ymax = len+ stderr, group =supp), data = df.summary2, width=0.2)

ggplot(df, aes(dose, len, color = supp)) +
  geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white") +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr), data = df.summary2, 
    width = 0.2, position = position_dodge(0.8)) +
  scale_color_manual(values = c("indianred", "lightblue")) +
  theme(legend.position = "top")

Now let’s do an empirical cumulative distribution function (ECDF). For any given value, this reports the percentage of individuals at or below that threshold.

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 50), rnorm(200, 58)))

Now let’s look at our dataframe

head(wdata, 5)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912
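
As a quick numeric illustration of the idea, here is a small sketch using base R's ecdf() (55 is an arbitrary example threshold):

F_weight <- ecdf(wdata$weight)
F_weight(55)   # proportion of simulated individuals with weight at or below 55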

Now let’s load our plotting packages

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now let’s create our ECDF Plot

ggplot(wdata, aes(x=weight)) +
  stat_ecdf(aes(color = sex, linetype = sex),
            geom = "step", linewidth = 1.5) +
  scale_color_manual(values = c("#00AFBB", "#E7B900")) +
  labs(y = "weight")

Now let’s take a look at qq plots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now let’s randomly generate some data

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)

Let’s set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Create a qq plot of the weight

ggplot(wdata, aes(sample=weight)) +
  stat_qq(aes(color = sex)) +
  scale_color_manual(values = c("#0073c2ff", "#fc4e07")) +
  labs(y = "weight")

#install.packages("ggpubr")
library(ggpubr)

ggqqplot(wdata, x = "weight",
         color = "sex",
         palette = c("#0073c2ff", "#fc4e07"),
         ggtheme = theme_pubclean())

Now what would a non-normal distribution look like?

#install.packages("mnonr")

library(mnonr)

data2 <- mnonr::mnonr(n = 1000, p=2, ms=3, mk=61, Sigma = matrix(c(1,0.5,0.5,1), 2,2), initial = NULL)

data2 <- as.data.frame(data2)

Now let’s plot the non-normal data

ggplot(data2, aes(sample=V1)) +
  stat_qq()

ggqqplot(data2, x = "V1",
         palette = "#0073c2ff",
         ggtheme = theme_pubclean())

Let’s look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw() +
    theme(legend.position = "top")
  
)

First let’s create a nice boxplot

Let’s load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x=dose, y=len)) +
  geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#00afbb", "#e7b800"))

p

Now let’s look at the ggplot facet_grid function

p + facet_grid(rows = vars(supp))

Now let’s do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols = vars(supp))

p

Now let’s look at the facet_wrap function that allows facets to be placed side-by-side

p + facet_wrap(vars(dose), ncol = 2)

Now, how do we combine multiple plots? We can use ggarrange()

Let’s start by making some basic plots. First, we will define a color palette and data

my3cols <- c("#e7b800", "#2e9FDF", "#fc4e07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now let’s make some basic plots

p <- ggplot(ToothGrowth, aes(x =dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = my3cols)

Now, let’s do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
                       binaxis = 'y', stackdir = 'center') +
  scale_color_manual(values = my3cols) +
  scale_fill_manual(values = my3cols)

Now let’s create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert)) +
  geom_line(color = "indianred")

Now, we can make the figure

figure <- ggarrange(bxp, dp, lp, labels= c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure 

We can make this look even better

figure2 <- ggarrange(
  lp,
  ggarrange(bxp, dp, ncol =2, labels = c("B", "C")),
  nrow=2,
  labels = "A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure2

This looks good too, but two of the legends are the same

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly, we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")
## file saved to facetfigure.pdf

We can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

Lastly, we can export to pdf with multiple pages and columns

ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow =2, ncol=1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Let’s get started with heatmaps

#install.packages("heatmap3")
library(heatmap3)

Now, let’s get our data

data <- ldeaths 
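
# Note: the next two lines reshape the monthly ldeaths series into a year-by-month
# matrix (rows = years, columns = months), copying the year/month labels from the
# printed form of the time series so the heatmap axes are labeled.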

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now let’s generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

Now, let’s play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now let’s apply our color selections

heatmap(data2, ColSideColors = cc)

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

There are more things to customize

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
## 
##     plotCI
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(data2, ColSideColors = cc,
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Missing Values

If you encounter an unusual value in your dataset, and simply want to move on to the rest of your analysis, you have two options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

In this instance, y is the width of the diamond in mm, so anything under 3 mm or above 20 mm is excluded

I do not recommend this option. Just because there is one bad measurement does not mean they are all bad

Instead, I recommend replacing the unusual values with missing values

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values should not pass silently into the night

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to suppress that warning, you can use na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

Other times, you want to understand what makes observations with missing values different from the observations with recorded values. For example, in the nycflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So, you might want to compare the scheduled departure times for cancelled and non-cancelled flights

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour +sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) + 
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1/4)

What if we want to identify outliers?

First we need to load the required libraries

library(outliers)
library(ggplot2)

And reload the dataset because we removed outliers

Air_data <- readxl::read_excel("AirQualityUCI.xlsx")

Let’s create a function using the Grubbs test to identify all outliers. The Grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution

grubbs.flag <- function(x) {
  #lets create a variable called outliers and save nothing in it. We'll add to the variable 
  #as we identify them 
  outliers <- NULL
  #We'll create a variable called test to identify which unvariate we are testing 
  test <- x
  #Now using the outliers package, use grubbs.test to find outliers in our variable 
  grubbs.result <- grubbs.test(test)
  #Let's get the p-values of all tested variables 
  pv <- grubbs.result$p.value
  #Now loop while the Grubbs test still flags an outlier (p-value below 0.05)
  while(pv < 0.05) {
    #Anything flagged at p below 0.05 gets added to our outliers vector 
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3]))
  #Now we want to remove those outliers from our test variable
  test <- x[!x %in% outliers]
  #And run the grubbs test again without outliers 
  grubbs.result <- grubbs.test(test)
  #And save the new p values
  pv <- grubbs.result$p.value 
}
return(data.frame(x=x, Outliers = (x %in% outliers)))
}
identified_outliers <- grubbs.flag(Air_data$AH)

Now we can create a histogram showing where the outliers were

ggplot(identified_outliers, aes(x = x, color = Outliers, fill = Outliers)) +
  geom_histogram(binwidth = diff(range(Air_data$AH))/30) +
  theme_bw()

CATEGORICAL VARIABLES

First, let’s load the required libraries and plot some data

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

It’s hard to see the difference in distribution because the counts differ so much

ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))

To make the comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under each curve is one.

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe that’s because frequency polygons are a little hard to interpret.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.

ggplot(data = diamonds, mapping = aes(x=cut, y=price)) +
  geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, so we can more easily compare them. This supports the counterintuitive finding that better quality diamonds are cheaper on average!

Let’s look at some car data

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can swap the axes by flipping the plot 90 degrees with coord_flip()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  coord_flip()

To visualize the correlation between two continuous variables, we can use a scatter plot

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

Scatterplots become less useful as the size of your dataset grows, because points begin to overplot one another. We can mitigate this using the alpha aesthetic.

ggplot(data = diamonds) +
  geom_point(mapping = aes(x= carat, y = price), alpha = 1/100)

First, let’s load a required library

library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)

Now let’s get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

College_Data <- read.csv(site)

First, let’s use the str function. This shows the structure of the object.

str(College_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college?

alphabetical <- College_Data %>%
  arrange(college)

The glimpse function from dplyr is another way to preview data

glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset with the filter function

Louisiana_Cases <- filter(College_Data, state == "Louisiana")

Let’s filter down to a smaller set of states

South_Cases <- filter(College_Data, state == "Louisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")

Let’s look at some time series data

First, we’ll load the required libraries

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now, let’s load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

Let’s create a group_by object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting

Days_since_first_reported <- tally(state_cases)
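
tally() collapses the grouped data frame to one row per state with a count column (named n by default). A quick peek confirms the shape:

head(Days_since_first_reported)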

Let’s visualize some data

First, let’s start off with some definitions

Data- the stuff we want to visualize

Layer- made up of geometric elements and the requisite statistical information; includes the geometric objects (geoms) that represent the data on the plot

Scales- used to map values in data space to aesthetic values such as color, size, and shape

Coordinate system- describes how the data coordinates are mapped onto the plane of the graphic

Faceting- how to break up data into subsets to display multiple types or groups of data

Theme- controls the finer points of the display, such as font size and background color
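
To see how these pieces fit together, here is a minimal sketch that names each component explicitly (it uses the built-in iris data, which we also explore below; the colors are arbitrary):

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +            # data + aesthetic mapping
  geom_point(aes(color = Species)) +                                     # layer (geom + stat)
  scale_color_manual(values = c("indianred", "steelblue", "gold")) +     # scale
  coord_cartesian() +                                                    # coordinate system
  facet_wrap(vars(Species)) +                                            # faceting
  theme_minimal()                                                        # theme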

options(repr.plot.width = 6, repr.plot.height = 6)

class(College_Data)
## [1] "data.frame"
head(College_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now let’s take a look at a different dataset

iris <- as.data.frame(iris)

class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Let’s start by creating a scatter plot of the College Data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now let’s do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  theme_minimal()

Let’s color-code our college data by state

ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Let’s color-code the iris data by species

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  theme_minimal() 

Let’s run a simple histogram of our Louisiana Case Data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency",
     main = "Total College Covid-19 Infections (Louisiana)")

Let’s run a simple histogram for the Iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", 
     ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(binwidth = 100, color = "black", aes(fill = county)) +
  xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 cases in Louisiana")

Let’s create a ggplot for the iris data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")

Maybe a density plot makes more sense for college data

ggplot(South_Cases) +
  geom_density(aes(x = cases, fill = state), alpha = 0.25)

Let’s do it with Iris data

ggplot(iris) + 
  geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

Let’s look at violin plots for iris

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_violin() +
  theme_classic() +
  theme(legend.position = "none")

Now, let’s try the south data

ggplot(data = South_Cases, aes(x=state, y=cases, color=state)) +
  geom_violin () +
  theme_gray () +
  theme(legend.position = "none")

Now let’s take a look at residual plots. A residual plot displays the residuals on the vertical axis and the fitted values (or independent variable) on the horizontal axis. If the points in a residual plot are dispersed randomly around the horizontal axis, a linear regression is appropriate. If they are not randomly dispersed, a non-linear model is more appropriate.

Let’s start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
  geom_point(aes(x = .fitted, y = .resid))

Now look at the southern state’s cases

ggplot(lm(cases ~ cases_2021, data = South_Cases)) +
  geom_point(aes(x= .fitted, y = .resid))

A linear model is not a good call for the state cases
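
One option, sketched here, is to let ggplot2 fit a non-linear smoother (loess) rather than a straight line, and judge the fit visually:

ggplot(South_Cases, aes(x = cases_2021, y = cases)) +
  geom_point() +
  geom_smooth(method = "loess")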

Now let’s do some correlations

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
obesity <- read_csv("Obesity_insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyr)
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Let’s look at the structure of the dataset

str(obesity)
## spc_tbl_ [1,338 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr [1:1338] "female" "male" "male" "male" ...
##  $ bmi     : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
##  $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr [1:1338] "yes" "no" "no" "no" ...
##  $ region  : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_character(),
##   ..   bmi = col_double(),
##   ..   children = col_double(),
##   ..   smoker = col_character(),
##   ..   region = col_character(),
##   ..   charges = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Now let’s look at the class of the dataset object

class(obesity)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

And get a summary of distribution of the variables

summary(obesity)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now let’s look at the distribution for insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now let’s look at correlations. The cor() command is used to determine the correlation between two vectors, all of the columns of a data frame, or two data frames. The cov() command, on the other hand, examines the covariance. The cor.test() command carries out a test of the significance of the correlation

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

By default, cor() uses the Pearson correlation; you can use Spearman’s rho or Kendall’s tau by specifying the method, as below

cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397

The correlation coefficient measures the strength of the association on a scale from -1 to 1.
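
cov() and cor.test() were mentioned above but not demonstrated; a minimal sketch on the same two columns looks like this (output not shown):

# covariance between charges and bmi
cov(obesity$charges, obesity$bmi)

# test whether the Pearson correlation differs significantly from zero
cor.test(obesity$charges, obesity$bmi)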

Now let’s look at the Tietjen-Moore test. This is used for univariate datasets; the algorithm detects multiple outliers in a univariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n = length(dataSeries)
  ## Compute the absolute residuals 
  r = abs(dataSeries - mean(dataSeries))
  ## Sort data according to size of residual 
  df = data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  ## Create a subset of the data without the largest values 
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  ## Compute the sums of squares
  ksub = (subdataSeries - mean(subdataSeries)) **2
  all = (df$dataSeries - mean(df$dataSeries)) **2
  ## Compute the test statistic 
  sum(ksub)/sum(all)
  
}

This function computes the absolute residuals, sorts the data according to the size of the residuals, and then computes the sums of squares that form the test statistic.

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  #Compute critical values based on simulation
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
  }

This function computes the critical values based on simulated data. Now let’s demonstrate these functions on the obesity dataset to evaluate the algorithm.

The critical region for the Tietjen-Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of the same size and computing the Tietjen-Moore test statistic; typically, 10,000 random samples are used. The value of the Tietjen-Moore statistic obtained from the data is compared to this reference distribution. The values of the test statistic fall between zero and one. If there are no outliers in the data, the test statistic is close to 1; if there are outliers, the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regardless of which test statistic is used, Lk or Ek.

First we will look at charges

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 100)
## $T
## [1] 0.4556033
## 
## $Talpha
##       50% 
## 0.6350056

Let’s check out bmi

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 7)
## $T
## [1] 0.9475628
## 
## $Talpha
##      50% 
## 0.950515

Probability Plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probability functions of the normal distribution and their outputs. dnorm is the density function: given a value, it returns the height of the normal curve at that point, which tells us the relative likelihood of observing it. For example, you may wonder, "What is the likelihood that a person has an IQ of exactly 140?" The IQ distribution can be modeled with a mean of 100 and a standard deviation of 15, and the corresponding density answers that question.
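As a quick illustration (the IQ numbers come from the example above, not from our dataset), the density of that model at 140 is small but nonzero:

# Density of a normal(mean = 100, sd = 15) model evaluated at IQ = 140
dnorm(140, mean = 100, sd = 15)  # roughly 0.00076

Now let's apply the same idea to bmi, starting from its sample mean and standard deviation.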

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Let’s create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) +
  geom_point()

This gives us the density (the relative likelihood) at every single point, rather than a true probability

Now let’s use the pnorm function for more information

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Probability" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Probability)) +
  geom_point()

What if we want to find the probability of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)
## [1] "6.29 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.06287869

What about the probability that a bmi is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)),2), "%")
}

pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.9371213

What if we want to find the area in between?

pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Let's use the qnorm function. We need to assume a normal distribution for this.

What bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if we want a random sampling of values within our distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-Wilk Test

So now we know how to generate a normal distribution. How do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

You can see here that with such a small sample size the p-value is large (0.17), so we fail to reject the null hypothesis that the sample came from a normal distribution, even though charges are clearly skewed. We can increase the power of the test by increasing the sample size.

shapiro.test(obesity$charges[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16
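For contrast with the charges result above, a sample actually drawn from a normal distribution typically yields a large p-value. This is just an illustrative check; the exact number varies from draw to draw.

shapiro.test(rnorm(1000))  # for a truly normal sample, the p-value is usually well above 0.05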

Now, let’s check out age

shapiro.test(obesity$age[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16

And lastly, bmi

shapiro.test(obesity$bmi[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

First, let’s load our packages

library(readr)
library(readxl)

Air_data <- read_excel("AirQualityUCI.xlsx")

Date- date of measurement
Time- time of measurement
CO(GT)- average hourly CO concentration
PT08.S1(CO)- tin oxide hourly average sensor response
NMHC(GT)- average hourly non-methanic hydrocarbon concentration
C6H6(GT)- average hourly benzene concentration
PT08.S2(NMHC)- hourly average sensor response (NMHC-targeted)
NOx(GT)- average hourly NOx concentration
PT08.S3(NOx)- hourly average sensor response (NOx-targeted)
NO2(GT)- average hourly NO2 concentration
PT08.S4(NO2)- hourly average sensor response (NO2-targeted)
PT08.S5(O3)- hourly average sensor response (O3-targeted)
T- temperature
RH- relative humidity
AH- absolute humidity

str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Let’s get rid of our date in the time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab= "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

t.test(Air_data$RH, Air_data$AH)
## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

First, let’s load a required library

library(RCurl)
library(dplyr)

Now let’s get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

College_Data <- read.csv(site)

First, let’s use the str function. This shows the structure of the object.

str(College_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college?

alphabetical <- College_Data %>%
  arrange(college)

The glimpse() function is another way to preview data

glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset with the filter function

Louisiana_Cases <- filter(College_Data, state == "Louisiana")

Let’s filter out smaller amount of states

South_Cases <- filter(College_Data, state == "Louisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")

Let’s look at some time series data

First, we’ll load the required libraries

library(lubridate)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(scales)

Now, let’s load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

Let’s create a group_by object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting

Days_since_first_reported <- tally(state_cases)
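We can peek at the first few tallies; the n column holds the number of daily reports recorded for each state (an illustrative check, since no output was printed above).

head(Days_since_first_reported)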

Let’s visualize some data

First, let’s start off with some definitions

Data- the stuff we want to visualize

Layer- made of geometric elements and the statistical transformations applied to them; the geometric objects (points, bars, lines) are what actually appears on the plot

Scales- map values in the data space to values in the aesthetic space (color, size, shape, position, etc.)

Coordinate system- describes how the data coordinates are mapped onto the plane of the graphic

Faceting- how to break the data into subsets to display multiple types or groups of data

Theme- controls the finer points of the display, such as font size and background color

The annotated example just below ties each of these terms to a line of ggplot2 code.
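To make these terms concrete, here is a small, purely illustrative sketch using the built-in iris data (which we look at more closely below). Each comment marks which component of the grammar that line supplies.

ggplot(data = iris,                    # data: the stuff we want to visualize
       aes(x = Sepal.Width,
           y = Sepal.Length,
           color = Species)) +         # scales: map data values to aesthetics
  geom_point() +                       # layer: geometric objects plus their statistics
  coord_cartesian() +                  # coordinate system: how coordinates map to the plane
  facet_wrap(~ Species) +              # faceting: break the data into subsets
  theme_minimal()                      # theme: fonts, background, and other display details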

options(repr.plot.width = 6, repr.plot.height = 6)

class(College_Data)
## [1] "data.frame"
head(College_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now let’s take a look at a different dataset

iris <- as.data.frame(iris)

class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Let’s start by creating a scatter plot of the College Data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now let’s do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  theme_minimal()

Let’s color coordinate our college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Let’s color coordinate iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  theme_minimal() 

Let’s run a simple histogram of our Louisiana Case Data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency",
     main = "Total College Covid-19 Infections (Louisiana)")

Let’s run a simple histogram for the Iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", 
     ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(binwidth = 100, color = "black", aes(fill = county)) +
  xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 cases in Louisiana")

Let’s create a ggplot for the IRIS data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")

Maybe a density plot makes more sense for college data

ggplot(South_Cases) +
  geom_density(aes(x = cases, fill = state), alpha = 0.25)

Let’s do it with Iris data

ggplot(iris) + 
  geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

Let’s look at violin plots for iris

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_violin() +
  theme_classic() +
  theme(legend.position = "none")

Now, let’s try the south data

ggplot(data = South_Cases, aes(x=state, y=cases, color=state)) +
  geom_violin () +
  theme_gray () +
  theme(legend.position = "none")

Now let’s take a look at residual plots. This is a graph that displays the residuals on the vertical axis and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner around the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed, a non linear model is more appropriate.

Let’s start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
  geom_point(aes(x = .fitted, y = .resid))

Now look at the southern state’s cases

ggplot(lm(cases ~ cases_2021, data = South_Cases)) +
  geom_point(aes(x= .fitted, y = .resid))

A linear model is not a good call for the state cases

Now let’s do some correlations

library(readr)
obesity <- read_csv("Obesity_insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyr)
library(dplyr)
library(plyr)

Let’s look at the structure of the dataset

str(obesity)
## spc_tbl_ [1,338 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr [1:1338] "female" "male" "male" "male" ...
##  $ bmi     : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
##  $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr [1:1338] "yes" "no" "no" "no" ...
##  $ region  : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_character(),
##   ..   bmi = col_double(),
##   ..   children = col_double(),
##   ..   smoker = col_character(),
##   ..   region = col_character(),
##   ..   charges = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Now let’s look at the column classes

class(obesity)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

And get a summary of distribution of the variables

summary(obesity)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now let’s look at the distribution for insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now let’s look at correlations. The cor() command is used to determine correlation between two vectors. All of the columns of a data frame, or two data frames. The cov() command, on the other hand examines the covariance. This cor.test() command carries out a test to the significance of teh correlation

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

By default cor() uses the Pearson correlation; you can use Spearman's rho or Kendall's tau instead by specifying the method argument, as below

cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397

This correlation coefficient measures the strength of association on a scale from -1 to 1.
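The cor.test() command mentioned above is not run elsewhere in these notes, but applying it here is straightforward; it returns the correlation estimate along with a p-value and a confidence interval.

cor.test(obesity$charges, obesity$bmi)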

Now let’s look at the Tietjen=Moore test. This is used for unvariate datasets. The algorithm depicts the detection of the outliers in an unvariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n = length(dataSeries)
  ## Compute the absolute residuals 
  r = abs(dataSeries - mean(dataSeries))
  ## Sort data according to size of residual 
  df = data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  ## Create a subset of the data without the largest values 
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  ## Compute the sums of squares
  ksub = (subdataSeries - mean(subdataSeries)) **2
  all = (df$dataSeries - mean(df$dataSeries)) **2
  ## Compute the test statistic 
  sum(ksub)/sum(all)
  
}

This function helps to compute the absolute residuals and sorts data according to the size of the residuals. Later, we will focus on the computation of the sum of squares.

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  #Compute critical values based on simulation
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
  }

This function helps us to compute the critical values based on simulation data. Now let’s demonstrate these functions with sample data and the obesity dataset for evaluating this algorithm.

The critical region for the Tietjen-Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample size and computing the Tietjen-Moore test statistic. Typically, 10,000 random samples are in use. The values of the Tietjen-Moore statistic is obtained from the data is compared to this reference distribution. The values of the test statistic are between zero and one. If there are no outliers in the data, the test statistic is close to 1. If there are outliers the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regardless of which test statistics issued, Lk or Ek.

First we will look at charges

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 100)
## $T
## [1] 0.4556033
## 
## $Talpha
##       50% 
## 0.6352265

Let’s check out bmi

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 7)
## $T
## [1] 0.9475628
## 
## $Talpha
##       50% 
## 0.9505177

Probability Plots

library(ggplot2)
library(tigerstats)

We will use the probability plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine the probability of events. Or, for example, you may wonder “what is the likelihood that a person has an IQ of exactly 140? In this case, you would need to determine the probabilty of events. The IQ distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Let’s create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) +
  geom_point()

This gives us the probability of every single point occurring

Now let’s use the pnorm function for more information

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x=bmi, y = Density)) +
  geom_point()

What if we want to find the probability of the bmi being greater thatn 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x, , mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)
## [1] "6.29 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.06287869

What about the probability that a bmi is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)),2), "%")
}

pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.9371213

What if we want to find the area in between?

pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Let’s use the qnorm function. We need to assume a normal distrubtion for this.

What bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if we want a random sampling of values within our distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-Wilk Test

So now we know how to generate a normal distribution. How do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

You can see here, with a small sample size, we would reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size.

shapiro.test(obesity$charges[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16

Now, let’s check out age

shapiro.test(obesity$age[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16

And lastly, bmi

shapiro.test(obesity$bmi[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

First, let’s load our packages

library(readr)
library(readxl)

Air_data <- read_excel("AirQualityUCI.xlsx")

Date- date of measurement Time- time of measurement CO(GT)- average hourly CO2 PT08, s1(CO)- tin oxide hourly average sensor response NMHC- average hourly non-metallic hydrocarbon concentration C6HC- average benzene concentration PT08.s3(NMHC)- titanium average hourly sensor response NOx- average hourly NOx concentration NO2- average hourly NO2 concentration T- temper RH- relative humidity AH- absolute humidity

str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
library(ggplot2)

Let’s get rid of our date in the time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab= "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

t.test(Air_data$RH, Air_data$AH)
## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

First we’ll look at the unnest_token function

Let’s start by looking at an Emily Dickenson passage

text <- c("Because I could not stop from Death - ",
          "He kindly stopped for me - ", 
          "The Carriage held but just Ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop from Death - "
## [2] "He kindly stopped for me - "           
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"

This is a typical character vector that we might want to analyze. In order to turn it into a tidy text dataset, we first need to put it into a data frame.

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "Because I could not stop from Death - "
## 2     2 "He kindly stopped for me - "           
## 3     3 "The Carriage held but just Ourselves -"
## 4     4 "and Immortality"

Reminder: A tibble is a modern class of data frame within R, available in the dplyr and tibble packages. Tibbles have a convenient print method, will not convert strings to factors, and do not use row names. Tibbles are great for use with tidy tools.
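A quick way to check the string handling for yourself (note that since R 4.0, data.frame() also keeps characters as characters by default, so the factor difference mainly shows up on older R versions):

str(tibble(word = c("because", "i", "could")))      # character column, never converted to a factor
str(data.frame(word = c("because", "i", "could")))  # character in R >= 4.0; a factor under older defaults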

Next, we will use the unnest_tokens function

The first argument after the data frame is the name of the output column that will be created as the text is unnested into it (here, word); the second is the input column the text comes from (here, text)

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Let’s use the janeustenr package to analyze some Jane Austen texts. There are 6 books in this package.

library(janeaustenr)
library(stringr)
detach("package:plyr", unload=TRUE)
library(dplyr)

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()

original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

To work with this as a tidy dataset, we need to restructure it in the one-token-per-row format, which, as we saw earlier, is done with the unnest_tokens() function

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function uses the tokenizers package to separate each line of text in the original data frame into tokens.

The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.

Often in text analysis, we will want to remove stop words. Stop words are words that are NOT USEFUL for an analysis. These include words like the, of, to, and, and so forth.

We can remove stop words (kept in the tidytext dataset ‘stop_words’) with an anti_join().

data(stop_words)

tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

The stop words dataset in the tidytext package contains stop words from three lexicons. We can use them all together, as we do here, or use filter() to keep only one set of stop words if that's more appropriate for a particular analysis.
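For example, to keep just one of the three lexicons (the lexicon column takes values such as "SMART", "snowball", and "onix"), we could filter the stop word list first and use that in the anti_join() instead; a small sketch:

snowball_stops <- stop_words %>%
  filter(lexicon == "snowball")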

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy dataframe. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most common words.

library(ggplot2)

tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)

The gutenbergr package

This package provides access to public domain works from Project Gutenberg (www.gutenberg.org). It includes tools both for downloading books and a complete dataset of Project Gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenberg_download().
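Before downloading, the bundled metadata can be used to look up ID numbers. A hedged sketch: gutenberg_works() filters the package's gutenberg_metadata down to usable English-language texts, and here we search the author field with str_detect (stringr is already loaded above).

library(gutenbergr)

gutenberg_works(str_detect(author, "Darwin"))  # lists Darwin's works and their Gutenberg IDs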

Word Frequencies

Let’s look at some biology texts, starting with Darwin.

The Voyage of the Beagle - 944
On the Origin of Species by Means of Natural Selection - 1228
The Expression of the Emotions in Man and Animals - 1227
The Descent of Man, and Selection in Relation to Sex - 2300

We can access these works using gutenberg_download() and the Project Gutenberg ID numbers

library(gutenbergr)

darwin <- gutenbergr::gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")

Let’s break into these tokens

tidy_darwin <- darwin %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

Let’s check out what the most common darwin words are.

tidy_darwin %>%
  count(word, sort = TRUE)
## # A tibble: 23,630 × 2
##    word          n
##    <chr>     <int>
##  1 species    2998
##  2 male       1672
##  3 males      1337
##  4 animals    1310
##  5 birds      1292
##  6 female     1197
##  7 sexes      1095
##  8 females    1038
##  9 selection  1038
## 10 sexual      801
## # ℹ 23,620 more rows

Now,let’s get some work from Thomas Hunt Morgan, who is credited with discovering chromosomes.

Regeneration - 57198
The genetic and operative evidence relating to secondary sexual characteristics - 57460
Evolution and Adaptation - 63540

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")

Let’s tokenize them

tidy_morgan <- morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

What are the most common words?

tidy_morgan %>%
  count(word, sort = TRUE)
## # A tibble: 13,855 × 2
##    word             n
##    <chr>        <int>
##  1 species        869
##  2 regeneration   814
##  3 piece          702
##  4 cut            669
##  5 male           668
##  6 forms          631
##  7 selection      604
##  8 cells          576
##  9 found          552
## 10 development    546
## # ℹ 13,845 more rows

Lastly, let’s look at Thomas Henry Huxley

Evidence as to Man's Place in Nature - 2931
On the Reception of the Origin of Species - 2089
Evolution and Ethics, and Other Essays - 2940
Science and Culture, and Other Essays - 52344

huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")
tidy_huxley <- huxley %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
tidy_huxley %>%
  count(word, sort = TRUE)
## # A tibble: 16,090 × 2
##    word          n
##    <chr>     <int>
##  1 species     339
##  2 nature      331
##  3 time        287
##  4 life        286
##  5 existence   255
##  6 knowledge   238
##  7 animals     227
##  8 natural     223
##  9 animal      216
## 10 science     207
## # ℹ 16,080 more rows

Now, let’s calculate the frequency for each word for the works of Darwin, Morgan, and Huxley by binding the frames together

library(tidyr)

frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                       mutate(tidy_darwin, author = "Charles Darwin"),
                       mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n/ sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")

frequency
## # A tibble: 95,895 × 3
##    word    author               proportion
##    <chr>   <chr>                     <dbl>
##  1 a       Thomas Hunt Morgan   0.00206   
##  2 a       Thomas Henry Huxley  0.0000856 
##  3 a       Charles Darwin       0.000141  
##  4 ab      Thomas Hunt Morgan   0.000165  
##  5 ab      Thomas Henry Huxley  0.0000978 
##  6 ab      Charles Darwin       0.00000642
##  7 abaiss  Thomas Hunt Morgan  NA         
##  8 abaiss  Thomas Henry Huxley NA         
##  9 abaiss  Charles Darwin       0.00000642
## 10 abandon Thomas Hunt Morgan   0.00000752
## # ℹ 95,885 more rows

Now we need to change the table so that each author has its own row

frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)

frequency2
## # A tibble: 31,965 × 4
##    word        `Thomas Hunt Morgan` `Thomas Henry Huxley` `Charles Darwin`
##    <chr>                      <dbl>                 <dbl>            <dbl>
##  1 a                     0.00206                0.0000856       0.000141  
##  2 ab                    0.000165               0.0000978       0.00000642
##  3 abaiss               NA                     NA               0.00000642
##  4 abandon               0.00000752             0.0000122       0.00000321
##  5 abandoned             0.0000150              0.0000245       0.00000321
##  6 abashed              NA                     NA               0.00000321
##  7 abatement            NA                      0.0000245       0.00000321
##  8 abbot                NA                      0.0000245       0.00000321
##  9 abbott               NA                     NA               0.00000642
## 10 abbreviated          NA                     NA               0.0000128 
## # ℹ 31,955 more rows

Now let’s plot

library(scales)

ggplot(frequency2, aes(x = `Charles Darwin`, y = `Thomas Hunt Morgan`,
                       color = abs(`Charles Darwin` - `Thomas Hunt Morgan`))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none") +
  labs(y = "Thomas Hunt Morgan", x = "Charles Darwin")
## Warning: Removed 24513 rows containing missing values (`geom_point()`).
## Warning: Removed 24514 rows containing missing values (`geom_text()`).

ggplot(frequency2, aes(x = `Charles Darwin`, y = `Thomas Henry Huxley`,
                       color = abs(`Charles Darwin` - `Thomas Henry Huxley`))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none") +
  labs(y = "Thomas Henry Huxley", x = "Charles Darwin")
## Warning: Removed 23389 rows containing missing values (`geom_point()`).
## Warning: Removed 23390 rows containing missing values (`geom_text()`).

ggplot(frequency2, aes(x = `Thomas Hunt Morgan`, y = `Thomas Henry Huxley`,
                       color = abs(`Thomas Hunt Morgan` - `Thomas Henry Huxley`))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  theme(legend.position = "none") +
  labs(y = "Thomas Henry Huxley", x = "Thomas Hunt Morgan")
## Warning: Removed 26068 rows containing missing values (`geom_point()`).
## Warning: Removed 26069 rows containing missing values (`geom_text()`).

The Sentiments Datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text

AFINN, bing, and nrc

bing categorizes words in a binary fashion into positive or negative.
nrc categorizes words into positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
AFINN assigns a score between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.

The function get_sentiments() allows us to get the specific sentiments lexicon with the measures for each one

library(tidytext)
install.packages("textdata")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(textdata)

Let’s look at afinn and bing

afinn <- read.csv("afinn.csv")

head(afinn)
##   X       word value
## 1 1    abandon    -2
## 2 2  abandoned    -2
## 3 3   abandons    -2
## 4 4   abducted    -2
## 5 5  abduction    -2
## 6 6 abductions    -2
bing <- read.csv("bing.csv")


head(bing)
##   X       word sentiment
## 1 1    2-faces  negative
## 2 2   abnormal  negative
## 3 3    abolish  negative
## 4 4 abominable  negative
## 5 5 abominably  negative
## 6 6  abominate  negative

And lastly nrc

nrc <- read.csv("nrc.csv")

head(nrc)
##   X      word sentiment
## 1 1    abacus     trust
## 2 2   abandon      fear
## 3 3   abandon  negative
## 4 4   abandon   sadness
## 5 5 abandoned     anger
## 6 6 abandoned      fear

These lexicons were created either using crowdsourcing platforms such as Amazon Mechanical Turk or by the labor of one of the authors, and were then validated with crowdsourcing

Let’s look at the words with a joy scor from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(c(944, 1227, 1228,2300), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")

tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup () %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 786,575 × 4
##    gutenberg_id linenumber chapter word   
##           <int>      <int>   <int> <chr>  
##  1          944          1       0 the    
##  2          944          1       0 voyage 
##  3          944          1       0 of     
##  4          944          1       0 the    
##  5          944          1       0 beagle 
##  6          944          1       0 by     
##  7          944          2       0 charles
##  8          944          2       0 darwin 
##  9          944          8       0 about  
## 10          944          8       0 the    
## # ℹ 786,565 more rows

Let’s add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"


tidy_books
## # A tibble: 786,575 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 786,565 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First, let's use NRC.

nrc_joy <- nrc %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 277 × 2
##    word           n
##    <chr>      <int>
##  1 found        301
##  2 good         161
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          86
##  7 present       85
##  8 food          78
##  9 beautiful     61
## 10 elevation     60
## # ℹ 267 more rows

We can also examine how sentiment changes throughout a work

library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`

Now, let’s plot

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Let’s compare the three sentiment dictions

There are several options for sentiment lexicons, so you might want some more information on which one is appropriate for your purposes. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of The Voyage of the Beagle.

library(tidyr)

voyage <- tidy_books %>%
  filter(book == "The Voyage of the Beagle")

voyage
## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows

Let’s again use integer division (‘%/%’) to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.

afinn <- voyage %>%
  inner_join(afinn) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(bing) %>%
    mutate(method = "Bing et al."),
  voyage %>%
    inner_join(nrc) %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 79 of `x` matches multiple rows in `y`.
## ℹ Row 10386 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

We can now estimate the net sentiment (positive-negative) in each chunk of the novel text for each lexicon (dictionary). Let’s bind them all together and visualize with ggplot.

bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Let’s look at the counts based on each dictionary

nrc %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
##   sentiment    n
## 1  negative 3316
## 2  positive 2308
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 2,492 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive   1226
##  2 well       positive    855
##  3 like       positive    813
##  4 good       positive    487
##  5 doubt      negative    414
##  6 wild       negative    317
##  7 respect    positive    310
##  8 remarkable positive    295
##  9 important  positive    281
## 10 bright     positive    258
## # ℹ 2,482 more rows

This can be shown visually, and we can pipe straight into ggplot2.

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scale = "free_y") +
  labs(x = "Contribute to Sentiment", y = NULL)

Let’s spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)

custom_stop_words
## # A tibble: 1,153 × 2
##    word      lexicon
##    <chr>     <chr>  
##  1 wild      custom 
##  2 dark      custom 
##  3 great     custom 
##  4 like      custom 
##  5 a         SMART  
##  6 a's       SMART  
##  7 able      SMART  
##  8 about     SMART  
##  9 above     SMART  
## 10 according SMART  
## # ℹ 1,143 more rows
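The custom list is a drop-in replacement for stop_words, so it can be applied with the same anti_join() pattern used earlier. A quick sketch:

tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word, sort = TRUE)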

Word Clouds!

We can see that tidy text mining and sentiment analysis work well with ggplot2, but having our data in tidy format leads to other nice graphing techniques.

Let’s use the wordcloud package!!

library(wordcloud)
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
## 
##     textplot
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`

Let’s look at the comparison.cloud(), which may require turning the dataframe into a matrix.

We can change to matrix using the acast() function.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tigerstats':
## 
##     tips
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining with `by = join_by(word)`

Looking at units beyond words

Lots of useful work can be done by tokenizing at the word level, but sometimes it is nice to look at different units of text. For example, we can look beyond just unigrams.

Ex. "I am not having a good day." A unigram approach would count "good" as positive even though the sentence is negative, because the negation is lost when words are scored one at a time.
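As a tiny sketch of why larger units help, tokenizing that example sentence into bigrams keeps the negation attached to the word it modifies (the sentence here is just the example above, not part of our corpus).

tibble(text = "I am not having a good day.") %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# bigrams such as "not having" and "good day" retain context that single words lose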

bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative") 

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter !=0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 5
##   book                                        chapter negativewords words  ratio
##   <chr>                                         <int>         <int> <int>  <dbl>
## 1 On the Origin of Species By Means of Natur…       3             5    86 0.0581
## 2 The Descent of Man, and Selection in Relat…      20             4    87 0.0460
## 3 The Expression of the Emotions in Man and …      10           249  4220 0.0590
## 4 The Voyage of the Beagle                         10           375 11202 0.0335

So far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.

Let’s look at some methods of tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)

darwin_books <- gutenbergr::gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")

colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"


darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n= 2)

darwin_bigrams
## # A tibble: 724,531 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle the voyage    
##  2 The Voyage of the Beagle voyage of     
##  3 The Voyage of the Beagle of the        
##  4 The Voyage of the Beagle the beagle    
##  5 The Voyage of the Beagle beagle by     
##  6 The Voyage of the Beagle charles darwin
##  7 The Voyage of the Beagle <NA>          
##  8 The Voyage of the Beagle <NA>          
##  9 The Voyage of the Beagle <NA>          
## 10 The Voyage of the Beagle <NA>          
## # ℹ 724,521 more rows

The data is still in tidy text format, structured as one token per row; each token is now a bigram.

Counting and filtering n-grams

darwin_bigrams %>%
  count(bigram, sort= TRUE)
## # A tibble: 238,516 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the   11297
##  2 <NA>      8947
##  3 in the    5257
##  4 on the    4093
##  5 to the    2849
##  6 the same  2048
##  7 that the  1947
##  8 it is     1830
##  9 of a      1610
## 10 and the   1590
## # ℹ 238,506 more rows
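
Notice that <NA> sits near the top of the count: blank lines in the raw text produce NA bigrams. As an aside (not part of the pipeline below, which keeps the NA rows), a minimal sketch for dropping them would be:

library(dplyr)

darwin_bigrams_clean <- darwin_bigrams %>%
  filter(!is.na(bigram))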

Most of the common bigrams are stop words. This is a good time to use tidyr’s separate() function, which splits a column into multiple columns based on a delimiter. This will give us one column for the first word and one for the second.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) 
  
bigrams_filtered
## # A tibble: 94,896 × 3
##    book                     word1   word2  
##    <chr>                    <chr>   <chr>  
##  1 The Voyage of the Beagle charles darwin 
##  2 The Voyage of the Beagle <NA>    <NA>   
##  3 The Voyage of the Beagle <NA>    <NA>   
##  4 The Voyage of the Beagle <NA>    <NA>   
##  5 The Voyage of the Beagle <NA>    <NA>   
##  6 The Voyage of the Beagle <NA>    <NA>   
##  7 The Voyage of the Beagle online  edition
##  8 The Voyage of the Beagle <NA>    <NA>   
##  9 The Voyage of the Beagle degree  symbol 
## 10 The Voyage of the Beagle degs    italics
## # ℹ 94,886 more rows

New bigram counts

bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_counts
## # A tibble: 94,896 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle charles darwin
##  2 The Voyage of the Beagle NA NA         
##  3 The Voyage of the Beagle NA NA         
##  4 The Voyage of the Beagle NA NA         
##  5 The Voyage of the Beagle NA NA         
##  6 The Voyage of the Beagle NA NA         
##  7 The Voyage of the Beagle online edition
##  8 The Voyage of the Beagle NA NA         
##  9 The Voyage of the Beagle degree symbol 
## 10 The Voyage of the Beagle degs italics  
## # ℹ 94,886 more rows

We may also be interested in trigrams, which are three-word combinations.

trigrams <- darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

trigrams
## # A tibble: 19,971 × 4
##    word1         word2  word3           n
##    <chr>         <chr>  <chr>       <int>
##  1 <NA>          <NA>   <NA>         9884
##  2 tierra        del    fuego          92
##  3 secondary     sexual characters     91
##  4 captain       fitz   roy            45
##  5 closely       allied species        30
##  6 de            la     physionomie    30
##  7 domestication vol    ii             26
##  8 vol           ii     pp             22
##  9 vertebrates   vol    iii            21
## 10 proc          zoolog soc            18
## # ℹ 19,961 more rows

Let’s analyze some bigrams

bigrams_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 39 × 3
##    book                                                   word1           n
##    <chr>                                                  <chr>       <int>
##  1 The Descent of Man, and Selection in Relation to Sex   sexual        254
##  2 On the Origin of Species By Means of Natural Selection natural       250
##  3 The Descent of Man, and Selection in Relation to Sex   natural       156
##  4 On the Origin of Species By Means of Natural Selection sexual         18
##  5 On the Origin of Species By Means of Natural Selection continued       6
##  6 The Descent of Man, and Selection in Relation to Sex   unconscious     6
##  7 On the Origin of Species By Means of Natural Selection methodical      5
##  8 The Descent of Man, and Selection in Relation to Sex   continued       5
##  9 On the Origin of Species By Means of Natural Selection unconscious     4
## 10 The Expression of the Emotions in Man and Animals      natural         4
## # ℹ 29 more rows

Let’s again look at tf-idf, this time across bigrams in Darwin’s works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 60,595 × 6
##    book                                       bigram     n      tf   idf  tf_idf
##    <chr>                                      <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and… nerve…    47 0.00350 1.39  0.00485
##  2 On the Origin of Species By Means of Natu… natur…   250 0.0160  0.288 0.00460
##  3 The Expression of the Emotions in Man and… la ph…    35 0.00260 1.39  0.00361
##  4 The Voyage of the Beagle                   bueno…    54 0.00245 1.39  0.00339
##  5 The Voyage of the Beagle                   capta…    53 0.00240 1.39  0.00333
##  6 On the Origin of Species By Means of Natu… glaci…    36 0.00230 1.39  0.00319
##  7 The Voyage of the Beagle                   fitz …    50 0.00227 1.39  0.00314
##  8 The Expression of the Emotions in Man and… muscl…    30 0.00223 1.39  0.00310
##  9 The Expression of the Emotions in Man and… orbic…    29 0.00216 1.39  0.00299
## 10 The Expression of the Emotions in Man and… dr du…    26 0.00194 1.39  0.00268
## # ℹ 60,585 more rows
library(ggplot2)
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n =10) %>%
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf of bigrams", y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1=="not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 867 × 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      128
##  2 not   have    104
##  3 not   only    103
##  4 not   a       100
##  5 not   to       98
##  6 not   been     89
##  7 not   the      82
##  8 not   at       70
##  9 not   know     60
## 10 not   so       58
## # ℹ 857 more rows

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.

install.packages("textdata")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
AFINN <- read.csv("afinn.csv")

head(AFINN)
##   X       word value
## 1 1    abandon    -2
## 2 2  abandoned    -2
## 3 3   abandons    -2
## 4 4   abducted    -2
## 5 5  abduction    -2
## 6 6 abductions    -2

We can examine the most frequent words that are preceded by “not”, and associate them with a sentiment value.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
## # A tibble: 114 × 3
##    word2     value     n
##    <chr>     <int> <int>
##  1 doubt        -1    25
##  2 like          2    11
##  3 pretend      -1     9
##  4 wish          1     8
##  5 admit        -1     7
##  6 difficult    -1     5
##  7 easy          1     5
##  8 reach         1     5
##  9 extend        1     4
## 10 forget       -1     4
## # ℹ 104 more rows

Let’s visualize

library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0 )) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number of occurrences", y = "Words preceded by \"not\"")

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)

negated_words
## # A tibble: 208 × 4
##    word1   word2     value     n
##    <chr>   <chr>     <int> <int>
##  1 no      doubt        -1   210
##  2 not     doubt        -1    25
##  3 no      great         3    19
##  4 not     like          2    11
##  5 not     pretend      -1     9
##  6 not     wish          1     8
##  7 without doubt        -1     8
##  8 not     admit        -1     7
##  9 no      greater       3     6
## 10 not     difficult    -1     5
## # ℹ 198 more rows

Let’s visualize the negation words

negated_words %>%
  mutate(contribution = n * value,
         word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) + 
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("Words preceded by negation words") +
  ylab("Sentiment value * # of occurrences") +
  coord_flip()

Visualize a network of bigrams with ggraph

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:mosaic':
## 
##     compare
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following object is masked from 'package:plotly':
## 
##     groups
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
bigram_graph
## IGRAPH f5dbef1 DN-- 203 140 -- 
## + attr: name (v/c), n (e/n)
## + edges from f5dbef1 (vertex names):
##  [1] NA        ->NA          natural   ->selection   sexual    ->selection  
##  [4] vol       ->ii          lower     ->animals     sexual    ->differences
##  [7] south     ->america     distinct  ->species     secondary ->sexual     
## [10] breeding  ->season      closely   ->allied      sexual    ->characters 
## [13] tierra    ->del         del       ->fuego       vol       ->iii        
## [16] de        ->la          natural   ->history     fresh     ->water      
## [19] north     ->america     bright    ->colours     sexual    ->difference 
## [22] allied    ->species     tail      ->feathers    strongly  ->marked     
## + ... omitted several edges
library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also add directionality to this network

set.seed(1234)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, 
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), vjust =1, hjust =1) +
  theme_void()

A central question in text mining is how to quantify what a document is about. We can do this by looking at the words that make up the document and measuring term frequency.

There are a lot of words that may not be important; these are the stop words.

One way to remedy this is to look at a word’s inverse document frequency (idf), which decreases the weight for commonly used words and increases the weight for words that are used rarely across the collection of documents.
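
Concretely, tf-idf multiplies a word’s term frequency within a document by its inverse document frequency across the collection. A minimal sketch of the arithmetic, using made-up counts rather than anything from Darwin’s books:

# term frequency: word count in one document / total words in that document (made-up numbers)
tf <- 250 / 150000

# inverse document frequency: log(total documents / documents containing the word)
idf <- log(4 / 2)

# tf-idf is high when a word is frequent in this document but rare in the others
tf * idf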

Term frequency in Darwin’s works

library(dplyr)
library(tidytext)

book_words <- gutenbergr::gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg/")

colnames(book_words)[1] <- "book"

book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

Now, let’s dissect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

book_words
## # A tibble: 43,024 × 3
##    book                                                   word      n
##    <chr>                                                  <chr> <int>
##  1 The Descent of Man, and Selection in Relation to Sex   the   25490
##  2 The Voyage of the Beagle                               the   16930
##  3 The Descent of Man, and Selection in Relation to Sex   of    16762
##  4 On the Origin of Species By Means of Natural Selection the   10301
##  5 The Voyage of the Beagle                               of     9438
##  6 The Descent of Man, and Selection in Relation to Sex   in     8882
##  7 The Expression of the Emotions in Man and Animals      the    8045
##  8 On the Origin of Species By Means of Natural Selection of     7864
##  9 The Descent of Man, and Selection in Relation to Sex   and    7854
## 10 The Descent of Man, and Selection in Relation to Sex   to     5901
## # ℹ 43,014 more rows
book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

book_words
## # A tibble: 43,024 × 3
##    book                                                   word      n
##    <chr>                                                  <chr> <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex   the   25490
##  2 The Voyage of the Beagle                               the   16930
##  3 The Descent of Man, and Selection in Relation to Sex   of    16762
##  4 On the Origin of Species By Means of Natural Selection the   10301
##  5 The Voyage of the Beagle                               of     9438
##  6 The Descent of Man, and Selection in Relation to Sex   in     8882
##  7 The Expression of the Emotions in Man and Animals      the    8045
##  8 On the Origin of Species By Means of Natural Selection of     7864
##  9 The Descent of Man, and Selection in Relation to Sex   and    7854
## 10 The Descent of Man, and Selection in Relation to Sex   to     5901
## # ℹ 43,014 more rows
book_words <- left_join(book_words, total_words)
## Joining with `by = join_by(book)`
book_words
## # A tibble: 43,024 × 4
##    book                                                   word      n  total
##    <chr>                                                  <chr> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex   the   25490 311041
##  2 The Voyage of the Beagle                               the   16930 208118
##  3 The Descent of Man, and Selection in Relation to Sex   of    16762 311041
##  4 On the Origin of Species By Means of Natural Selection the   10301 157002
##  5 The Voyage of the Beagle                               of     9438 208118
##  6 The Descent of Man, and Selection in Relation to Sex   in     8882 311041
##  7 The Expression of the Emotions in Man and Animals      the    8045 110414
##  8 On the Origin of Species By Means of Natural Selection of     7864 157002
##  9 The Descent of Man, and Selection in Relation to Sex   and    7854 311041
## 10 The Descent of Man, and Selection in Relation to Sex   to     5901 311041
## # ℹ 43,014 more rows

You can see that the usual suspects are the most common words, but they don’t tell us anything about what each book’s topic is.

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 515 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).

Zipf’s Law

Zipf’s law states that the frequency with which a word appears is inversely proportional to its rank in a frequency table.

Let’s apply Zipf’s Law to Darwin’s work.

freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()

freq_by_rank
## # A tibble: 43,024 × 6
##    book                                word      n  total  rank `term frequency`
##    <chr>                               <chr> <dbl>  <dbl> <int>            <dbl>
##  1 The Descent of Man, and Selection … the   25490 311041     1           0.0820
##  2 The Voyage of the Beagle            the   16930 208118     1           0.0813
##  3 The Descent of Man, and Selection … of    16762 311041     2           0.0539
##  4 On the Origin of Species By Means … the   10301 157002     1           0.0656
##  5 The Voyage of the Beagle            of     9438 208118     2           0.0453
##  6 The Descent of Man, and Selection … in     8882 311041     3           0.0286
##  7 The Expression of the Emotions in … the    8045 110414     1           0.0729
##  8 On the Origin of Species By Means … of     7864 157002     2           0.0501
##  9 The Descent of Man, and Selection … and    7854 311041     4           0.0253
## 10 The Descent of Man, and Selection … to     5901 311041     5           0.0190
## # ℹ 43,014 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10() 
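
If Zipf’s law held exactly, these log-log curves would be straight lines with a slope near -1. As a rough check, here is a minimal sketch reusing freq_by_rank from above; the rank cutoffs are arbitrary choices for the middle of the range, not part of the original analysis.

rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

# slope of the fitted line on the log-log scale; a value close to -1 suggests Zipf-like behavior
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)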

Let’s use tf-idf to find important words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very often in a collection of documents.

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf
## # A tibble: 43,024 × 7
##    book                                   word      n  total     tf   idf tf_idf
##    <chr>                                  <chr> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in … the   25490 311041 0.0820     0      0
##  2 The Voyage of the Beagle               the   16930 208118 0.0813     0      0
##  3 The Descent of Man, and Selection in … of    16762 311041 0.0539     0      0
##  4 On the Origin of Species By Means of … the   10301 157002 0.0656     0      0
##  5 The Voyage of the Beagle               of     9438 208118 0.0453     0      0
##  6 The Descent of Man, and Selection in … in     8882 311041 0.0286     0      0
##  7 The Expression of the Emotions in Man… the    8045 110414 0.0729     0      0
##  8 On the Origin of Species By Means of … of     7864 157002 0.0501     0      0
##  9 The Descent of Man, and Selection in … and    7854 311041 0.0253     0      0
## 10 The Descent of Man, and Selection in … to     5901 311041 0.0190     0      0
## # ℹ 43,014 more rows

Let’s look at terms with high tf-idf in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 43,024 × 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <dbl>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and … tears   126 1.14e-3 1.39  1.58e-3
##  2 The Expression of the Emotions in Man and … blush   114 1.03e-3 1.39  1.43e-3
##  3 The Expression of the Emotions in Man and … eyeb…   149 1.35e-3 0.693 9.35e-4
##  4 The Voyage of the Beagle                    degs    117 5.62e-4 1.39  7.79e-4
##  5 On the Origin of Species By Means of Natur… sele…   412 2.62e-3 0.288 7.55e-4
##  6 The Descent of Man, and Selection in Relat… sexu…   745 2.40e-3 0.288 6.89e-4
##  7 The Descent of Man, and Selection in Relat… shewn   143 4.60e-4 1.39  6.37e-4
##  8 On the Origin of Species By Means of Natur… hybr…   133 8.47e-4 0.693 5.87e-4
##  9 The Expression of the Emotions in Man and … frown    46 4.17e-4 1.39  5.78e-4
## 10 The Descent of Man, and Selection in Relat… sele…   621 2.00e-3 0.288 5.74e-4
## # ℹ 43,014 more rows

Let’s look at a visualization for these high tf-idf words

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)