GGPLOT

Barplots

Now lets take a look at some ggplot2 bar plots we’ll start with making a data frame based on the tooth data

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

Now lets make a second data frame

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

lets load up ggplot2

library(ggplot2)

Lets set up our parameters for ggplot2

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = -0.3)

Now lets add labels inside the bars

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the bar plot colors by group

f + geom_col(aes(color = dose), fill = "white") + 
  scale_color_manual(values = c("blue", "gold", "red"))

Lets change the fill

f + geom_col(aes(fill = dose)) + 
  scale_fill_manual(values = c("blue", "gold", "red"))

How do we do this with multiple groups

ggplot(df2, aes(x = dose, y = len)) + 
  geom_col(aes(color = supp, fill = supp), position = position_stack()) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) + 
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))
p

Now lets add labels to the dodged bar plot

p + geom_text(
  aes(label = len, group = supp),
  position = position_dodge(0.8),
  vjust = -0.3, size = 3.5
)

Now what if we want to add labels to our stacked barplots? For this we need dyplr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>% 
  mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now that we have our data set up, lets recreate our stacked graphs

ggplot(df2, aes(x = dose, y = len)) + 
  geom_col(aes(fill = supp), width = 0.7) +
  geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

Barplots

Lets look at some boxplots

data("ToothGrowth")

Lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Now load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw() + 
    theme(legend.position = "top")
)

Lets start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") + 
  stat_summary(fun.y = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change the scale number of variables included and their order

tg + geom_boxplot() + 
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

Lets put our x axis in descending order

tg + geom_boxplot() + 
  scale_x_discrete(limits = c("2", "1", "0.5"))

We can also change boxplot colors by groups

tg + geom_boxplot(aes(color = dose)) + 
  scale_color_manual(values = c("indianred", "blue1", "green2"))

What if we want to display our data subset by oj vs vitamin c?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) + 
  scale_fill_manual(values = c("#999999", "orange"))

tg2

We can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

Histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 56), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now lets load dplyr

library(dplyr)

mu <- wdata %>%
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
  
)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") + 
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") +
  scale_color_manual(values = c("pink", "purple"))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") +
  scale_color_manual(values = c("pink", "purple")) + 
  scale_fill_manual(values = c("lightblue1", "green"))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

What if we want to combine density plots with historgrams

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "lightyellow1")
## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex),
                   fill = "white", position = "identity") +
  geom_density(aes(color = sex), size = 1) + 
  scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Dotplots

First lets load required packages

library(ggplot2)

lets set our theme

theme_set(
  theme_dark() + 
    theme(legend.position = "top")
)

Lets initiate a ggplot called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))

Lets create a dot plot with summary statistics

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
  stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_segment()`).

Lineplots

Now lets change it up and look at some line plots

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

Now lets create a second data frame plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now lets load ggplot and set a theme

library(ggplot2)

theme_set(
  theme_grey() +
    theme(legend.position = "right")
)

Now lets do some basic line plots. First we will build a function to display all the line types

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0, 0, 0, 0))
  plot(1, pch="", ylim = c(0,6), xlim=c(0,0.7), axes = FALSE, xlab= "", ylab="")
  for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty=i, lwd=3)
  text(rep(0.1,6), 0:6, labels = c("0. 'blank'", "1. 'solid'", "2. 'dashed'", "3. 'dotted'",
                                   "4. 'dotdash'", "5.'longdash'", "6. 'twodash"))
  par(mar=oldPar$mar, font=oldPar$font)
}

generateRLineTypes()

Ridge Plots

First lets load required packages

library(ggplot2)
library(ggridges)

#BiocManager::install("ggridges")

Now lets load sample data

?airquality
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air 
## Picking joint bandwidth of 2.65

library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
  geom_density_ridges_gradient() +
  scale_fill_viridis(option = "c", name = "Temp")
## Warning in viridisLite::viridis(256, alpha, begin, end, direction, option):
## Option 'c' does not exist. Defaulting to 'viridis'.
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data.

library(tidyr)

airquality %>%
  gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() +aes(value, Month, group = Month) + 
  geom_density_ridges() + 
  facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Density Plots

A density plot is a nice alternative to historgrams

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
  group_by(sex) %>% 
  summarise(grp.mean = mean(weight))

Now lets load the graphing packages

library(ggplot2)
theme_set(
  theme_classic() + 
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

d + geom_density() + 
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed") 

Now lets change the y axis to count instead of density

d + geom_density(aes(y = stat(count)), fill = "lightgrey") +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

d + geom_density(aes(color= sex)) + 
  scale_color_manual(values = c("darkgray", "gold"))

Lastly, lets fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
  scale_color_manual(values = c("grey", "gold")) +
  scale_fill_manual(values = c("grey", "gold"))

Plotly

More line plots

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Lets start with a scatter plot of the orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference, 
        color = ~Tree, size = ~age, 
        text = ~paste("Tree ID:", Tree, "br>Age", age, "Circ", circumference)
      
)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our data frame

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)

We’ll use random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age, 
        text = ~paste("TreeID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line and add labels

plot_ly(data = Orange, x = ~age, y = ~circumference, 
        color = ~Tree, size = ~circumference,
        text = ~paste("TreeID:", Tree, "<br>Age:", age, "Circ:", circumference)) %>%
  add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "Plot or Orange data with switchable trace",
    updatemenus = list(
      list(
        type = 'downdrop',
        y = 0.8, 
        buttons = list(
          list(method = 'restyle',
               args = list('mode', 'markers'),
               label = "Marker"
               ), 
          list(method = "restyle", 
               args = list('mode', 'lines'),
               labels = "lines"
               )
        )
      )
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

3D Plots

First lets load our required packages

library(plotly)

Now lets create a random 3D matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5),
  y <- seq(1,10, by = 0.5)
  
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now lets plot our 3D data

plot_ly(d, x=~x, y=~y, z=~z) %>%
  add_surface()

Lets add more aspects to it, such as topogrophy

plot_ly(d, x=~x, y=~y, z=~z) %>%
  add_surface(
    contours = list(
      z = list(
        show = TRUE, 
        usecolormap = TRUE,
        highlightcolor = "pink",
        project = list(z=TRUE)
      )
    )
  )

Now lets look at 3d scatter plot

plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

Other Graphing Techniques

Error Bars

Lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() + 
    theme(legend.position = 'top')
)

Lets again usee tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dyplyr for manipulation on purpose

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = std.error(len, na.rm = TRUE),
    len = mean(len),
    
  )

df.summary
## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Now lets look at some key functions

Lets start by creating ggplot objects

tg <- ggplot(
  df.summary, 
  aes(x = dose, y= len, ymin= len - sd, ymax = len + sd)
)

Now lets look at the most basic error bars

tg  + geom_pointrange()

tg + geom_errorbar(width = 0.2) +
  geom_point(size = 1.5)

ECDF Plots

Now lets do an empirical cumulative distributio function. This reports any given number percentile of individuals that are above or below that threshold.

set.seed(1234)

wdata = data.frame(
  sex=factor(rep(c("F", "M"), each = 200)),
  weight=c(rnorm(200, 50), rnorm(200, 58))
)

Now lets look at our data frame

head(wdata, 5)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912

Now lets load our plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now lets create our ECDF plot

ggplot(wdata, aes(x=weight)) +
  stat_ecdf(aes(color=sex, linetype=sex),
            geom = "step", size = 1.5) +
  scale_color_manual(values = c("pink", "green")) + 
  labs(y = "weight")

qq Plots

Now lets look at qq plots. These are used to determine if the given data follows a normal distribution.

install.packages("ggpubr")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
set.seed(1234)
wdata = data.frame(
  sex=factor(rep(c("F", "M"), each = 200)),
  weight=c(rnorm(200, 55), rnorm(200, 58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Create a qq plot of the weight

ggplot(wdata, aes(sample=weight)) +
  stat_qq(aes(color=sex)) +
  scale_color_manual(values = c("pink", "purple")) +
  labs(y = "weight")

library(ggpubr)

ggqqplot(wdata, x = "weight",
         color = "sex",
         palettes = c("pink", "purple"),
         ggtheme = theme_pubclean())

Now what would a non-normal distribution look like?

library(mnonr)

data2<- mnonr::mnonr(n = 1000, p=2, ms=3, mk=61, Sigma = matrix(c(1, 0.5, 0.5, 1), 2, 2), initial = NULL)

data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample=V1)) +
  stat_qq()

ggqqplot(data2, x="V1",
         palette = "grey",
         ggtheme = theme_pubclean())

Facet Plots

Lets look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw() + 
    theme(legend.positon = "top")
)

First lets create a nice boxplot

lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x=dose, y=len)) +
  geom_boxplot(aes(fill=supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("pink", "orange"))

p
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets look at the gvgplot facit function

p + facet_grid(rows = vars(supp))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols = vars(supp))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

p
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets look at the facet_wrap function. This allows facets to be placed side-by-side

p + facet_wrap(vars(dose), ncol = 2)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now how do we combine multiple plots by using ggarrange()

Lets start by making some basic plots. First we will define a color palette and data

my3cols <- c("pink", "purple", "lightblue")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Lets make some basic plots

p <- ggplot(ToothGrowth, aes(x= dose, y=len))
bxp <- p + geom_boxplot(aes(color = dose)) + 
  scale_color_manual(values = my3cols)

Now lets do a dot plot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
                       binaxis = 'y', stackdir = 'center') +
  scale_color_manual(values = my3cols) +
  scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert)) + 
  geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.
figure
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

We can make this look even better

figure2 <- ggarrange(
  lp,
  ggarrange(bxp, dp, ncol = 2, labels = c("B", "C")),
  nrow = 2,
  labels = "A"
)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.
figure2
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

This looks good, but two legends are the same

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom"
)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.
## The `legend.positon` theme element is not defined in the element hierarchy.

Lastly, we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## file saved to facetfigure.pdf

Heatmaps

Lets get started with heat maps

library(heatmap3)

Now lets get our data

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heat map

heatmap(data2)

Outlier Detection

Missing Values

If you encounter an unusual value in your dataset, and simply want to move on to the rest of your analysis, you have two options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

Replace the usual values with missing values

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = diamonds3, mapping = aes(x = x, y =y)) + 
  geom_point()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

If you want to suppress the warning, you can make na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y =y)) + 
  geom_point(na.rm = TRUE)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Othertimes you want to understand what makes observations with missing values different to the observation with recorded values. For example, in the NYCFlights13 dataset, missing values in the dep_time variable idicte that the flight was cancelled. You might want to tcompare the scheduled departure times for cancelled and non-cancelled times.

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100, 
    sched_min = sched_dep_time %% 100, 
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) + 
  geom_freqpoly(mapping = aes(color = cancelled), bindwith = 1/4)
## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwith = 1/4):
## Ignoring unknown parameters: `bindwith`
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Outliers

What if we want to know what our outliers are? First we need to load the required libraries

library(outliers)
library(ggplot2)
library(readxl)

And reload the data set because we removed the outliers

The excel file needed for the data does not exist, so for the sake of the project, I’ve still written the code. If a chunk is missing a product, it is because the chunk cannot run due to non-existing files.

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Lets create a function using the grubb test to identify all the outliers. The grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution.

grubbs.flag <- function(x) {
  outliers <- NULL
  test <- x
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  while (pv < 0.05) {
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, "") [[1]] [[3]]))
    test <- x[!x %in% outliers]
    grubbs.result <- grubbs.test(test)
    pv <- grubbs.result$p.value
    
  }
  return(data.frame(x=x, outliers = (x %>% outliers)))
}

identified_outliers <- grubbs.flag(Air_data$AH)

Covariation

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Its hard to see the difference in distribution because the counts differ so much.

ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

To make the comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, whcih is the count standardized so that the area under the curve is one.

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Fair diamonds have the highest average price.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + 
  geom_boxplot()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Lets look at some car data.

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 
  geom_boxplot()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

If you have long variable names, you can switch the axis and flip it 90 degrees.

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + 
  coord_flip()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

To visualize the correlation betweem two continuous variables, we can use a scatter plot

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

SCatterplots become less useful as the size of the dataset grows because we get overplot. We can fix this using the alpha aesthetic.

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Exploratory Data Analysis

library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

Lets get out data

College_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(College_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college

alphabetical <- College_Data %>%
  arrange(College_Data$college)

The glimpse package is another way to preview data

glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select ()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset with the filter function

Louisiana_Cases <- filter(College_Data, state == "Louisiana")

Lets filter out a smaller amount of states

South_Cases <- filter(College_Data, state == "Lousisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")

Lets look at some time series data

Lets load the required packages

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

Lets create a group_by object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurments were made by state? This gives us an idea of when states started reporting

days_since_first_reported <- tally(state_cases)

Lets visualize some data

Lets start with some definitions

Data- the studd we want to visualize

Layer- made of gemetric elements and requisite statistical information. Include geometric objects which represents the plot

Scales- used to map values in the data space that is used for creation of values (color, size, shape, etc)

Coordinate system- describes how the data coordinates are mapped together in relation to the plan on the graphic

Faceting- how to break up data into subsets to display multiple types or groups of data

Theme- controls the finer points of the display, such as font size and background color

options(repr.plot.width = 6, rep.plot.height = 6)

class(College_Data)
## [1] "data.frame"
head(College_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now lets take a look at a different dataset

iris <- as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Lets start by creating a scatter plot of the college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_point()`).

library(ggplot2)
library(RCurl)
library(dplyr)
options(repr.plot.width = 6, rep.plot.height = 6)
class(College_Data)
## [1] "data.frame"
head(College_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now lets take a look at another data set

iris <- as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Lets start by creating a scatter plot of the college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
  geom_point() + 
  theme_minimal()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_point()`).

Now lets do iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) + 
  geom_point() +
  theme_minimal()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Lets color coordinate our college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
  geom_point() +
  theme_minimal()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_point()`).

Lets color coordinate the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() + 
  theme_minimal()
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Lets run a simple histogram of our Louisiana case data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", main = "Total College Covid-19 Infections (Louisiana)")

Lets run a simple histogram for the iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width",
     ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(bindwidth = 100, color = "black", aes(fill = county)) +
  xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 cases in Louisiana")
## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Lets create a ggplot for the IRIS data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width)) 

histogram_iris + geom_histogram(bindwidth = 0.2, color = "black", aes(fill = Species)) + 
  xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram Iris Sepal Width by Species")
## Warning in geom_histogram(bindwidth = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Maybe a density plot makes more sense for our college data

ggplot(South_Cases) +
  geom_density(aes(x = cases, fill = state), aplpha = 0.25)
## Warning in geom_density(aes(x = cases, fill = state), aplpha = 0.25): Ignoring
## unknown parameters: `aplpha`
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets do it with iris cases

ggplot(iris) +
  geom_density(aes(x = Sepal.Width, fill = Species), aplpha = 0.25)
## Warning in geom_density(aes(x = Sepal.Width, fill = Species), aplpha = 0.25):
## Ignoring unknown parameters: `aplpha`
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Lets look at violin plots for iris

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_violin() +
  theme_classic() +
  theme(legend.position = "none")
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets try the south data

ggplot(data = South_Cases, aes(x = state, y = cases, color = state)) +
  geom_violin() +
  theme_grey() +
  theme(legend.position = "none")
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets take a look at risidual plots. Lets start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) + 
  geom_point(aes(x = .fitted, y = .resid))
## Warning: `fortify(<lm>)` was deprecated in ggplot2 4.0.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
##   Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now look at the southern cases

ggplot(lm(cases ~ cases_2021, data = South_Cases)) + 
  geom_point(aes(x = .fitted, y = .resid))
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

Now lets do some correlations

For the following, there’s no link or file to access the obesity insurace data, but I’ve still written the code for project purposes, similar to previous sections:

obesity <- read.csv("Obesity_insurance.csv")
library(tidyr)
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Lets look at the structure of the data set

str(obesity)

Lets look at the column classes

class(obesity)

And get a summary of distribution of the variables

summary(obesity)

Now lets look athe the distribution for the insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)
boxplot(obesity$bmi)

Now lets look at correlations. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or two data frames. The cov() command on the other hand examines the covariance. The cor.test() command carries out a test as to the significance of the correlation.

cor(obesity$charges, obesity$bmi)

This test uses a spearmean Rho correlation, or you can use Kendall’s tau by specifying it

cor(obesity$charges, obesity$bmi, method = "kendall")

This correlation measures strength of a correlation between -1 and Zoomed to fill Now lets look at the Tietjen=Moore test. This is used for univariate datasets. The algorithm depicts the detection of the outliers in a univariate dataset.

TietjenMoore <- function (dataseries, k)
{
  n = length (dataseries)
  # Compute the absolute residuals
  r = asb(dataseries - mean (dataseries))
  # Sort data according to size of residual
  df = data.frame (dataseries, r)
  dfs = df[order(df$r),]
  # create a subset of the data without the largest values.
  klarge = c((n-k+1):n)
  subdataseries = dfs$dataseries[-klarge]
  # Compute the sums of squares.
  ksub = (subdataseries = mean (subdataseries)) **2
  all = (df$dataseries - mean (df$dataSeries)) **2
  #compute the test statistic
  sum(ksub)/sum(all)

}

This function helps to compute the absolute resudiauls and sorts data according to the size of the residuals. Later , we will focus on the computation of sum of squares.

FindOutliersTietjenMooreTest <- function(dataseries, k, alpha = 0.5){ 
  ek <- Tietjenmoore (dataseries, k)
  # compute critical values based on simulation.
  test = c(1:10000)
  for(i in 1:10000){
    dataseriesdataseries = rnorm(length(dataSeries))
    test [i] = TietjenMoore(dataseriesdataseries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obesity dataset for evaluating this algorithm.

The critical region for the Tietjen-Moore test is determind by simulation. The simulation is performed by generating a standard normal random sample of size n and computing the Tietjen Moore test statistic. Typically, 10,000 randome samples are use. The values of the Tietjen-Moore statistic obtained from the data is compared to this reference distribution. The values oif the test staistic is between zero and one. If there are no outliers in the data, the test statistic is close to 1. If there are ourliers the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regardless of which test statistic isused, Lk or Ek.

First we will look at charges:


boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 50)

Lets checkout bmi


boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 100)

Lets do some probability

Probablility plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

we will use the probability plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine probability of events. or for examples, you may wonder “what is the likelihood that a person has an BMI of exactly _? In this case, you would need to retrieve the density of the BMI distribution at values 140. The BMI distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:

bmi.mean <-(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Lets create a plot of our normal distibution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) +
  geom_point()

This gives us the probability of every single point occuring

Now lets use the pnorm function for more info


bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.fram("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x + bmi, y = Density)) + 
  geom_point()
  

What if we want to find the probability of the bmi being greater than 40 in our distribution?


pp_greater <- function(x) {
  paste(round(100 * pnorm(x,, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)

What about the probability that a bmi is less than 40 in our population?


pp_less <- function(x) {
  paste(round(100 * (1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)), 2), "%")
}

pp_less(40)

pnormGC(c(20,40), region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

What if we want to find the area in between?


pnormGC(c(20,40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

What if we want to know the quantiles? Lets use the pnorm function. We need to assume a normal distribtuion for this.

What bmi represents the lowest 1% of the population


qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)

What if you want a random sampling of value within your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(50000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-Wilk Test

So now we know how to generate normal distribution, how do we tell if our sample came from a normal distribution?

shapiro.test(obesity$charges[1:5])

You can see here with a small sample size, we would reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size

shapiro.test(obesity$charges[1:1000])

Now lets checkout age

shapiro.test(obesity$age[1:1000])

And lastly, bmi

shapiro.test(obesity$bmi[1:1000])

Time series data

First lets load our packages

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(readxl)

The file does not exis, but for the sake of the project, this is the code:

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Date - date of measurement time - time of measurement CO(GT) - average hourly co2 PT08, s1(CO) - tin oxide hourly average sensor response NMHC - average hourly non-metallic hydrocarbon concentration C6HC - avage benzene concentration PT08. S3 (NMHC) - titania average hourly sensor response NoX - average hourly Nox concentration NO2 - Average hourly NO2 concentration T - temper RH - relative humidity AH - Absolute humidty

str(Air_data)
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Lets get rid of the date in the time columm

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis, xlab = "Absolute Humidity", ylap = "Relative Humidity")

otice we have an outlier in our data


t.test(Air_data$RH, Air_data$AH)

Text Mining

First we will look at the unnest_token function

Lets start by looking at an Emily Dickinson passage

text <- c("Because I could not stop from Death - ", 
          "He kindly stopped for me -", 
          "The Carriage held but just Ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop from Death - "
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"

This is a typical vector we might not want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "Because I could not stop from Death - "
## 2     2 "He kindly stopped for me -"            
## 3     3 "The Carriage held but just Ourselves -"
## 4     4 "and Immortality"

Next we will use the ‘unset_token’ functions. First we have the output column name that will be created as the text is unnested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeaustenr package to analyze some Jane Austen texts. There are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)
library(tidytext)
original_books <- austen_books() %>%
  dplyr::group_by(book) %>%
  dplyr::mutate(
    linenumber = dplyr::row_number(),
    chapter = cumsum(
      stringr::str_detect(text, stringr::regex("^chapter [0-9ivxlc]+",
                                               ignore_case = TRUE))
    )
  ) %>%
  dplyr::ungroup()

original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

To work with this as a tidy dataset, we need to restructure it in the on-token-per-row format, which as we saw earlier is done with the unnest_tokens() function.

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function usees the tokenizers package to seperate each line of text in the original dataframe into tokens.

The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.

often in text analysis, we will want to remove stop words. stop words are words that are NOT USEFUL for an analysis.

These include words like the, of, to, and, and so forth.

we can remove stop words (kept in the tidytext dataset ‘stop_words’) with an anti_join.

data("stop_words")

tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

The stop words dataset in the tidytext package contains stop words from three lexicons. We can use them all together, as we have 3, or filter() to only use one set of stop words if thats more appropriate for your analysis.

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most commmon words.

library(ggplot2)

tidy_books %>%
count (word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder (word, n)) %>%
ggplot(aes (n, word)) + 
geom_col() +
labs (y = NULL)
## Warning in plot_theme(plot): The `legend.positon` theme element is not defined
## in the element hierarchy.

The gutenbergr package This package provides acces to the public domain works from the gutenberg project (www.gutenberg.org). This package includes Shand i date datake of tools for both downloading books and a complete dataset of project gutenberg metadata that can be used to find works of interest. We will mostly use the function autenberg download word frequencies

Lets look at some biology texts, starting with Darwin

The Voyage of the Beagle - 944 On the origin of species by the means of natural selection - 1228 The expression of emotions in man and animals - 1227

The descent of man, and selection in relation to sex - 2300

we can access these workds using the gutenberg_download and the Project Gutenberg IDnumbers

library(gutenbergr)
darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

The book’s server no longer exists, so for the sake of the project, I have still included all the code for the data as if the book still did exist on the server:

Lets break these into tokens

tidy_darwin <- darwin %>%
  unnest_tokens(word,text) %>%
  anti_join(stop_words)

Lets checkout what the most common darwin words are.

tidy_darwin %>%
  count(word, sort = TRUE)
  

Now lets get some work from Thomas Hunt Morgan, who is credited with discovering chromosomes.

Regeneration - 57198 The genetic and operative evidence relating to secondary sexual characteristics -57460 Evoliton and Adaptation - 63540


morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "https://mirror2.sandyriver.net/pub/gutenberg") 

Lets tokenize THM


tidy_morgan <- morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
  

What are his most common words?

tidy_morgan %>%
  count(word, sort = TRUE)

Lastly lets look at Thomas Henry Huxley

Evidence as to mans place in nature - 2931 On the reception of the Origin of Species - 2089 Evolution and Ethics, and other essays -2940 Science and Culture and other essays - 52344

huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

tidy_huxley  <- huxley %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
  
tidy_huxley %>%
  count(word, sort = TRUE)

Now lets calculate the frequency for each word for the workd of Darwin, Morgan, and Huxley by binding the frames together.


library(tidyr)

frequency ‹- bind_rows (mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                        mutate (tidy_darwin, author = "charles Darwin"),
                        mutate (tidy_huxley, author = "Thomas Henry Huxley")) %>%
  mutate (word = str_extract (word, "[a-z']+")) %>%
  count (author, word) %>% group_by (author) %>%
  mutate (proportion = n/ sum(n)) %>%
  select (-n) %>%
  pivot_wider (names_from = author, values_from = proportion) %>%
  pivot_longer ('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")

Now we need to change the table so that each author has its own row


frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)

frequency2
library(scales)

ggplot (frequency2, aes(x = charles Darwin, y = Thomas Hunt Morgan*), color = abs(- 'Charles Darwin' = 'Thomas Hunt Morgan'))
  geom_abline (color = "gray40", lty = 2) +
  geom_jitter (alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text (aes (label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_1og10(labels = percent_format) +
  scale_y_1og10(labels = percent_format) +
  scale_color_gradient(limits = c(0, 0.001),
                        low = "darkslategray4", high = "gray75'") +
    theme (legend. position="none")
  labs (y ="Thomas Hunt Morgan", x = "Charles Darwin")

Sentiments

The Sentiments datasets

There are a variety of methods and dictionaries that exist for evaluatiing the opinion or emotion of the text.

AFFIN bing nrc

bing categorizes words in a binary fashion into positive or negative nrc categorizes nto positive, negative, anger, anticipation, fear, joy, sadness, suprise and trust. AFFIN assigns a score between -5 and 5, with negative indicating negative sentiment, and 5 positive.

The function get_sentiments allows us to get the specific sentiments lexicon with the measures for each one.

BiocManager::install("textdata", update = FALSE)

library(tidytext)
library(textdata)

afinn <- get_sentiments("afinn")

afinn 

Lets look at bing

bing <- get_sentiments("bing")

bing
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

And lastly nrc

nrc <- get_sentiments("nrc")

nrc

These libraries were created either using crowdourcing or cloud computing/ai like Amazon mechanical turk, or by labor of one of the authors, then validated by crowdsourcing

Lets look at the words with a joy score from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

The server for the books has been archives, so I’ve again written the code below for the sake of the project!


darwin <- gutenberg_download(c(944, 1227, 1228, 2300, mirror = "https://mirror2.sandyriver.net/pub/gutenberg"))

tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^Chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books

Lets add the book name instead of GID

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotion in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species by Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

tidy_books

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiments == "joy")
  
tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  innter_join(nrc_joy)%>%
  count(word, sort = TRUE)
  

We can also examine how to sentiment changed throuhout a work.


library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

library(ggplot2)
ggplot(CHarles_Darwin_sentiment, aes(index, sentiment, fill = book)) + 
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")

Lets compare the three sentiment directions

There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all three of our dictionariesand examine how the sentiment changes across the arc of TVOTB.

voyage <- tidy_books %>%
  filter(book == "The Voyage of the Beagle")
  
  voyage

Lets again use integer division (‘%/%’) to define larger sections of the text that span multipe lines, and we can use the same pattern with ‘court()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.


affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
  
bing_and_nrc <- bind_rows(
  voyage %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(method = "Bing et al."),
  voyage %>%
    inner_join(get_sentiments("nrc") %>%
                  filter(sentiment %in%) c("positive", "negative))
                  ) %>%
    mutate(method = "NRC")) %>%
    count(method, index = linenumber %/% 80, sentiment) %>%
    pivot_wider(names_from = sentiment, 
                    values_from = n,
                    values_fill = 0) %>%
                    
    mutate(sentiment = positive - negative)

We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexion (directionary). Lets bind them all together and visualize with ggplot

library(ggplot2)

bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
  

Lets look at the counts based on each dictionary

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)

get_sentiments("bing") %>%
  count(sentiment)

bing_word_counts <- tidy_books %>% 
  inner_join(get_sentiments("bing") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
  
bing_word_counts

This can be shown visually


bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) + 
  facet_wrap(~sentiment, scale = "free_y") +
  labs(x = "Contribution to Sentiment, y = NULL)
  

Lets spot an anomoly in the dataset


custom_stop_word <- bind_rows(tibble(words = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)
custom_stop_word

Word clouds!

We can see that tidy text mining and sentiment analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques

Lets use the word cloud package

library(wordcloud)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words= 100))
  

Lets also look at comparison.cloud() which may require turning the data frame into a matrix. We can change to matrix using the acast() function.


library(reshape2)

tidy_books %>%
  inner_join(get_sentiment, sort = TRUE) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiments, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80), max.words = 100)

Looking at units beyond words

Lots of useful work can be done by tokenizing at the word levvel, but soometimes it nice to look at different units of text. For example, we can look beuond just unigrams.

Ex. I am not having a good day


bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
  
wordcounts <- tidy_bood %>%
  group_by(book, chapter) %>%
  summarize(words = n ())
  
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ration = negativewords/words) %>%
  filter(chapter !=0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()

N-Grams

So far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.

Lets look at some methods of tidytext for calculating and visualizing relationships.


darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300, mirror = "https://mirror2.sandyriver.net/pub/gutenberg"))

colnames(darwin_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotion in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species by Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

darwin_bigrams <- darwin_books$books %>%
  unnest_token(bigram, text, token = "ngrams", n = 2)
  
darwin_bigrams

This is still in tidytext format and isnt structured as one=token-per-row. Each tolen is a bigram counting and filtering ngrams.

darwin_bigrams %>%
  count(bigram, sort = TRUE)

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s seperate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.


library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1, "word2"), sep = "")
  
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
  
bigrams_filtered

New bigram counts

bigram_counts <- bigrams_filetered %>%
  unite(bigram, word1, word2, sep = "")
  
bigram_counts

We may also be interested in trigrams, which are three word combos

darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word 1", "word2", "word3"), sep = "") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
  
trigrams

Lets analyze some bigrams

bigrams_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)
  

Lets again look at tf-idf across bigrams across Darwins works.


bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
  
bigram_tf_idf
  
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>% 
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 2, scale = "free") +
  labs(x = "Ctd-idf of bigrams", y = NULL)
  

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negatinf words


AFINN <- get_sentiments("afinn")

AFINN

We can examine the most frequent words that were preceded by “not”, and associated with sentiment.


not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words

Lets visualize

library(ggplot2)
not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(contribution)) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) + 
  labs(x = "Sentiment value * number of occurences", y = "words preceded by \"not"\)
negation_words <- c("not", "no", "never","non", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
  
negated_words

Lets visualize these words

negated_words %>%
  mutate (contribution = n * value,
          word2= reorder (paste(word2, word1, sep = "_"), contribution)) %›%
  group_by (word1) %>%
  slice_max abs (contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes (word2, contribution, fill = n * value › 0)) +
  geom_col (show.legened = FALSE) +
  facet_wrap (~ word1, scales = "free") +
  scale_x_discrete (labels = function(x) gsub("_.+$", "", x)) +
  xlab("words preceded by negation term") + 
  ylab("Sentiment value * # of occurences") +
  coord_flip()
  

Visualize a network of bigrams with graphs


library(igraph)

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
  
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
  
bigram_graph

library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
  

Word Frequencies

A central question in text mining is how to quantify what a document is about. we can do this but looking at words that make up the document, and measuring term frequency. There are a lot of words that may not be imporant, these are the stop words. one way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly used words and inscreases the weight for words that are not used very much . Term frequency in Darwins works


library (dplyr)
library(tidytext)
book words <- gutenberg_download (c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
colnames (book_words) [1] <- "book"

book_words$book[book_words$book = 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book = 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book = 1228] <- "On the Origin of Species By Means of Natural selection"
book_words$book[book_words$book = 2300] <- "The Descent of Man, and selection in Relation to sex"

Now lets disect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)
  
book_words

book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))
  
book_words

book_words <- left_join(book_words, total_words)

book_words

You can see that the usual suspects are the most common words, but don’t tell us anything about what the book topic is.


library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legen = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "freey_y")
  

Zipf’s Law

The frequency that a word appears is inversely proportional to its rank when predicting a topic.

Lets apply Zipf’s law to Darwin’s work


freq_by_rank <- book_words %>%
  group_by(book) %>%
  mutate(rank = row_number(),
          'term frequency' = n/total) %>%
  ungroup()
  
freq_by_rank

freq_by_rank %>%
  ggplot(aes(rank, 'term frequency', color = book)) +
  geom_line(soze = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
  

Lets use TF - IDF to find words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents.


book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)
  
book_tf_idf

Lets look at terms with high tf-idk in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
  

Lets look at a visualization for these high tf - idf words


book_tf_idf %>% 
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)