Now lets take a look at some ggplot2 barplots

we’ll start with making a dataframe based on the tooth data

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                len = c(4.2, 10, 29.5))
                
df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

And now lets make a second dataframe

df2 <- data.frame(supp=rep(c("vc", "OJ"), 3),
                           dose = rep(c("D0.5", "D1", "D2"), 2),
                           len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   vc D0.5  6.8
## 2   OJ   D1 15.0
## 3   vc   D2 33.0
## 4   OJ D0.5  4.2
## 5   vc   D1 10.0
## 6   OJ   D2 29.5

Lets load up ggplot2

library(ggplot2)

Lets set our parameters for ggplot

theme_set(
  theme_classic()+
    theme(legend.position = "top")
  
)

Lets start with some basic barplots using tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue")+
  geom_text(aes(label = len), vjust = -0.3)

Now lets add the lables inside the bars

f + geom_col(fill = "darkblue")+
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white")+
  scale_color_manual(values = c("blue", "gold", "red"))

This is kind hard to see, so lets change the fill

f + geom_col(aes(fill = dose))+ 
  scale_fill_manual(values = c("blue", "gold", "red"))

How do we do this with multiple groups

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_stack()) +
  scale_color_manual(values = c("blue", "gold"))+
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len))+
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7)+
  scale_color_manual(values = c("blue", "gold"))+ 
  scale_fill_manual(values = c("blue", "gold"))
p

Now lets add those labels to the dodge barplot

p + geom_text(
  aes(label = len, group = supp), 
  position = position_dodge(0.8), 
  vjust = -0.3, size = 3.5 
)

Now what if we want to add labels to our stacked barplots? for this we need dolyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) - 0.5 * len)

Now lets recreate our stacked graphs

ggplot(df2, aes(x=dose, y = len))+ 
  geom_col(aes(fill = supp), width = 0.7)+
  geom_text(aes(y = len, label = len, group = supp), color = "white")+
  scale_color_manual(values = c("blue", "gold"))+
  scale_fill_manual(values = c("blue", "gold"))

Lets look at some boxplots:

data("ToothGrowth")

Lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw() +
  theme(legend.position = "top")
)

Lets start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch = "True", fill = "lightgrey") + 
  stat_summary(fun.y = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change the scale number of variables included and their order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

tg + geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

we can also change boxplot color by groups

tg + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = c("indianred", "blue1", "green3"))

What if we want to display our data subset by ej vs vitamin e?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#999999", "#E69F00"))

tg2

we can also arrange this as thwo plots with facet_wrap

tg2 + facet_wrap(~supp)

GGpolt histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 56), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now lets lead dplyr

library(dplyr)

mu <- wdata %>%
  group_by(sex)%>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))
            
a + geom_histogram(bins = 30, color = "black", fill = "grey") +
  geom_vline(aes(xintercept = mean(weight)), 
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>
a + geom_histogram(aes(color = sex), fill = "white", position = "identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>
  scale_fill_manual(values = c("indianred", "lightblue1"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: fill
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

waht if we want to combine desity plots and histograms

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")
## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex),
                   fill = "white", position = "identity")+
geom_density(aes(color = sex), size = 1) +
scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplots dotplots

first lets load the required paclages

library(ggplot2)

lets set our theme

theme_set(
  theme_dark()+
    theme(legend.position = "top")
)

first lets initiate a ggplot object called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x=dose, y = len))

lets create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "lightgray")+ 
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

tg + geom_boxplot(width = 0.5)+
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") +
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') +
  scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center', 
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))+
  scale_color_manual(values = c("#00AfBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

line plots:

well start by making a custom dataframe kinda like the tooth dataset. This way we can see the lines and stuff that were modifying

 df <- data.frame(dose = c("D0.5", "D1", "D2"),
                  len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("vc", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2), 
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
##   supp dose  len
## 1   vc D0.5  6.8
## 2   vc   D1 15.0
## 3   vc   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now Lets again loap ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray()+
    theme(legend.position = "right")

)

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function(){
  oldpar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim =c(0,6), xlim=c(0,0.7), axes = FALSE, xlab = "", ylab = "")
  for(i in 0:6) lines(c(0.3, 0.7), c(i,i), lty=i, lwd = 3)
  text(rep(0.1,6), 0:6, labels = c("0. 'Blank'", "1.'solid'", "2.'dashed'", "3.'dotted'", 
                                   "4.'dotdash'", "5.'longdash", "6.'twodash'"))
par(mar=oldpar$mar, font=oldpar$font)
}

generateRLineTypes()

Now lets build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))

p + geom_line() + geom_point()

Noe lets modify the line type and color

p + 
  geom_line(color = "steelblue", linetype = "solid") +
  geom_point(color = "steelblue")

Now lets try a step graph, which indicates a threshold type progression

p + geom_step() + geom_point()

Now lets move on to making multiple groups. First we’ll create our ggplot object

p <- ggplot(df2, aes(x=dose, y=len, group = supp))

Now lets change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) + 
  geom_point(aes(shape = supp, color = supp)) + 
  scale_color_manual(values = c("red", "blue"))

Now lets look at line plots with numeric x axis

df3 <- data.frame(supp = rep(c("vc", "oj"), each = 3), 
                  dose = rep(c("0.5", "1", "2"), 2), 
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))
df3
##   supp dose  len
## 1   vc  0.5  6.8
## 2   vc    1 15.0
## 3   vc    2 33.0
## 4   oj  0.5  4.2
## 5   oj    1 10.0
## 6   oj    2 29.5

Now lets plot where both axises are treated as continuous labels

df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group = supp, color = supp)) +
  geom_line() + geom_point()

Now lets look at a line graph with having the x axis as dates. we’ll use tech built in economics time series for this example

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data = economics, aes(x = date, y = pop)) + 
  geom_line()

Now lets subset the data

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

we can also change the line size, for instance by another variable like employment

ggplot(data = economics, aes(x=date, y = pop)) + 
  geom_line(aes(size = unemploy/pop))

We can also plot multiple time series data

ggplot(economics, aes(x = date))+ 
  geom_line(aes(y=psavert), color = "darkred") + 
  geom_line(aes(y = uempmed), color = "steelblue", linetype = "twodash")

Lastly, lets make this into an area plot

ggplot(economics, aes(x=date))+ 
  geom_area(aes(y = psavert), fill = "#999999",
            color = "#999999", alpha = 0.5) + 
  geom_area(aes(y = uempmed), fill = "#E69F00", 
                color = "#E69F00", alpha = 0.5)

Ridgeplots:

First lets load the required packages

library(ggplot2)
library(ggridges)

#Biocmanger::install("ggridges")

Now Lets load some sample data

?airqaulity
## No documentation for 'airqaulity' in specified packages and libraries:
## you could try '??airqaulity'
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air
## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..)+ 
  geom_density_ridges_gradient() +
  scale_fill_viridis(option = "c", name = "Temp")
## Warning in viridisLite::viridis(256, alpha, begin, end, direction, option):
## Option 'c' does not exist. Defaulting to 'viridis'.
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65

LAst thing we will do is create a facet plot for all our data

library(tidyr)

airquality %>%
  gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot(aes(x = value, y = factor(Month), fill = factor(Month))) + 
  geom_density_ridges(alpha = 0.7) + 
  facet_wrap(~ Measurement, scales = "free") +
  labs(title = "Air Quality Measurements in New York (May-September 1973)",
       x = "Measurement Value",
       y = "Month",
       fill = "Month") +
  theme_ridges() +
  theme(legend.position = "none")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

Density Plots:

a desnity plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  Weight = c(rnorm(200, 55), rnorm(200, 58))
)
ggplot(wdata, aes(x = sex, y = Weight, fill = sex)) +
  geom_boxplot() +
  labs(title = "Weight Distribution by Sex",
       x = "Sex",
       y = "Weight") +
  theme_minimal()

Now leats load the graphing packages

library(ggplot2)
theme_set(
  theme_classic()+ 
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

#d + geom_density()+ 
  #geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now lets change the y axis to count instead of density

#d + geom_density(aes(y = stat(count)), fill = "lightgray") +
  #geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
#d + geom_density(aes(color = sex)) + 
  #scale_color_manual(values= c("darkgray", "gold"))

Lastly, lets fill the density plots

#d = geom_density(aes(fill = sex), alpha = 0.4) + 
  #geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed")+ 
  #scale_color_manual(values = c("grey", "gold"))+ 
  #scale_fill_manual(values = c("grey", "gold"))

Ploty lines

First lets load our required package

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Lets start with a scatter plot of the orange dataset

orange <- as.data.frame(Orange)

plot_ly(data = orange, x = ~age, y = ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

p <- plot_ly(data = Orange, x = ~age, y = ~circumference, 
             color = ~Tree, size = ~age, 
             text = ~paste("Tree ID:", Tree, "<br>Age:", age, "days<br>Circumference:", circumference, "mm"),
             hoverinfo = "text") %>%
  add_markers() %>%
  layout(title = "Orange Tree Growth",
         xaxis = list(title = "Age (days)"),
         yaxis = list(title = "Circumference (mm)"),
         legend = list(title = list(text = "Tree ID")))

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(orange, trace_1)

We’ll use the random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age, 
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line, and add labels.

p <- plot_ly(data = Orange, x = ~age, y = ~circumference, 
             color = ~Tree, size = ~circumference, 
             text = ~paste("Tree ID:", Tree, "<br>Age:", age, "days<br>Circumference:", circumference, "mm"),
             hoverinfo = "text") %>%
  add_markers(name = "Markers") %>%
  add_lines(name = "Lines") %>%
  layout(
    title = "Orange Tree Growth with Switchable Trace",
    xaxis = list(title = "Age (days)"),
    yaxis = list(title = "Circumference (mm)"),
    legend = list(title = list(text = "Tree ID")),
    updatemenus = list(
      list(
        type = 'buttons',
        direction = 'right',
        x = 0.1,
        y = 1.2,
        buttons = list(
          list(method = 'restyle',
               args = list('visible', list(TRUE, FALSE)),
               label = "Markers"),
          list(method = "restyle",
               args = list('visible', list(FALSE, TRUE)),
               label = "Lines")
        )
      )
    )
  )

p
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Plotly 3D

First lets load our required packages

library(plotly)

Now lets create a random 3d matrix

# Create the data frame
d <- data.frame(
  x = seq(1, 10, by = 0.5),
  y = seq(1, 10, by = 0.5)
)

# Create the matrix
z <- matrix(rnorm(length(d$x) * length(d$y)), 
            nrow = length(d$x), 
            ncol = length(d$y))

Now lets plot our 3d data

plot_ly(d, x=~x, y = ~y, z = ~z) %>%
  add_surface()

Lets add some more aspects to it, such as at topogrophy

p <- plot_ly(x = ~d$x, y = ~d$y, z = ~z) %>%
  add_surface(
    contours = list(
      z = list(
        show = TRUE,
        usecolormap = TRUE,
        highlightcolor = "#FF0000",
        project = list(z = TRUE)
      )
    ),
    hoverinfo = "text",
    text = ~paste("x:", d$x, "<br>y:", d$y, "<br>z:", z),
    colorscale = "Viridis"
  ) %>%
  layout(
    scene = list(
      xaxis = list(title = "X"),
      yaxis = list(title = "Y"),
      zaxis = list(title = "Z"),
      camera = list(
        eye = list(x = 1.87, y = 0.88, z = 0.64)
      )
    ),
    title = "3D Surface Plot with Contours"
  )

p

Now lets look at a 3d scatter plot

p <- plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, frame = ~Year) %>%
  add_markers(
    marker = list(
      color = ~GNP,
      colorscale = 'Viridis',
      colorbar = list(title = 'GNP')
    ),
    text = ~paste("Year:", Year, 
                  "<br>GNP:", GNP, 
                  "<br>Population:", Population, 
                  "<br>Employed:", Employed),
    hoverinfo = 'text'
  ) %>%
  layout(
    scene = list(
      xaxis = list(title = "GNP"),
      yaxis = list(title = "Population"),
      zaxis = list(title = "Employed")
    ),
    title = "US Economic Data (1947-1962)"
  ) %>%
  animation_opts(
    frame = 1000,
    transition = 500,
    easing = 'linear'
  ) %>%
  animation_slider(
    currentvalue = list(prefix = "Year: ")
  )

# Display the plot
p

Error Bars:

First lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position = 'top')
)

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE), 
    stderr = std.error(len, na.rm = TRUE), 
    len = mean(len), 
)

df.summary
## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Lets now look at some key functions geom_crossbar() for hollow bars with middle indicated by a horizontal line geom_errorbar() for error bars geom_linerange() for drawing an interval represted by a veritcal line geom_pointrange() for creating an interval represented by a vertical line; with a point in the middle

lets start by creating a ggplot object

tg <- ggplot(
  df.summary, 
  aes(x = dose, y = len, ymin = len - sd, ymax - len + sd)
)

Now lets look at the most basic error bars

tg <- ggplot(
  df.summary, 
  aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
)

Now lets create horizontal error bars by manipulating our graph

ggplot(df.summary, aes(x=len, y=dose, xmin = len-sd, xmax = len+sd))+ 
  geom_point()+ 
  geom_errorbarh(heihgt = 0.2)
## Warning in geom_errorbarh(heihgt = 0.2): Ignoring unknown parameters: `heihgt`

This just gives you an idea or error bars on the horizontal axis

now lets look at adding jitter points (actual measurements) to our data.

ggplot(df, aes(dose, len)) + 
  geom_jitter(position = position_jitter(0.2), color = "darkgray") + 
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary) +
  labs(x = "Dose", y = "Length", title = "Effect of Dose on Length") +
  theme_classic()

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) + 
  geom_violin(color = "darkgray", trim = FALSE) + 
  geom_pointrange(aes(ymin = len - sd, ymax = len+sd), data = df.summary)

Now how about with a line graph?

ggplot(df.summary, aes(dose, len)) +
  geom_line(aes(group = 1)) + # always specify when you have 1 line 
  geom_errorbar(aes(ymin = len-stderr, ymax = len-stderr), width = 0.2) + 
  geom_point(size = 2)

Now lets make a bar graph with halve error bars

ggplot(df.summary, aes(dose, len)) + 
  geom_col(fill = "lightgrey", color = "black")+
  geom_errorbar(aes(ymin = len, ymax = len+stderr), width = 0.2)

You can see that by not specifying wmin = len-stderr, we have in essence cut our error bar in half hwo about we add jitter points to line plots? we need to use the original dataframe for the jitter plot, and the summary df for geom layers.

 library(ggplot2)

ggplot(df, aes(x = dose, y = len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey", alpha = 0.5) +
  geom_line(data = df.summary, aes(group = 1), color = "blue") + 
  geom_errorbar(
    data = df.summary,
    aes(ymin = len - stderr, ymax = len + stderr), 
    width = 0.2,
    color = "blue"
  ) +
  geom_point(data = df.summary, size = 3, color = "blue") +
  labs(
    title = "Dose-Length Relationship with Error Bars",
    x = "Dose",
    y = "Length"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold")
  )

What about adding jitterpoints to a barplot?

ggplot(df, aes(x = dose, y = len)) +
  geom_col(data = df.summary, fill = NA, color = "blue", alpha = 0.5) + 
  geom_jitter(position = position_jitter(0.2), color = "darkgrey", alpha = 0.5) + 
  geom_errorbar(
    data = df.summary,
    aes(ymin = len - stderr, ymax = len + stderr),
    width = 0.2,
    color = "blue"
  ) +
  geom_point(data = df.summary, size = 3, color = "blue") +
  labs(
    title = "Dose-Length Relationship with Error Bars",
    x = "Dose",
    y = "Length"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(face = "bold")
  )

What if we wanted to have error bars per group? ()

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise(
    sd = sd(len), 
    stderr = std.error(len), 
    len = mean(len)  
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

library(dplyr)

# Assuming you have a dataframe 'df' with columns: dose, len, supp
df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise(
    len = mean(len),
    stderr = sd(len) / sqrt(n()),
    .groups = 'drop'
  )

How about line plots with multiple error bars?

ggplot(df.summary2, aes(x = dose, y = len, color = supp, group = supp)) +
  geom_line(aes(linetype = supp)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr), width = 0.2, position = position_dodge(0.05)) +
  labs(
    title = "Effect of Dose and Supplement on Length",
    x = "Dose",
    y = "Length",
    color = "Supplement",
    linetype = "Supplement"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.title = element_text(face = "bold")
  )

And the same with a bar plot

ggplot(df.summary2, aes(dose, len)) + 
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7)

qq Plots:

Now lets make a look at qqplots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some data

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
  
)

Lets set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
  theme_classic()+
    theme(legend.position = "top")
  
)

create a qq plot of the weight

ggplot(wdata, aes(sample=weight))+
  stat_qq(aes(color = sex))+
  scale_color_manual(values = c("#0073c2FF", "#FC4E07")) +
  labs(y = "weight")

#install.packages(ggpubr)
library(ggpubr)

ggqqplot(wdata, x = "weight", 
         color = "sex",
         palettes = c("#0073c2FF", "#FC4E07"),
         ggtheme = theme_pubclean())

Now what would a non-normal distrubution look like?

#install.packages(mnonr)

library(mnonr)

data2 <- mnonr::mnonr(n = 1000, p = 2, ms = 3, mk = 61, Sigma=matrix(c(1, 0.5, 0.5, 1), 2, 2), initial = NULL)

data2 <- as.data.frame(data2)

Now lets plot the non normal data

library(ggplot2)

ggplot(data2, aes(sample = V1)) + 
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "Q-Q Plot for Variable 1",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_minimal()

library(ggpubr)

ggqqplot(data2, x = "V1", 
         color = "#0073c2FF",
         ggtheme = theme_pubclean()) +
  labs(title = "Q-Q Plot for Variable 1",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

Facet Plots:

Lets look at how to put multiple pots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw() + 
    theme(legend.position = "top")
)

First lets create a nice boxplot lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

library(ggplot2)

# Assuming you're using the ToothGrowth dataset
df <- ToothGrowth

p <- ggplot(df, aes(x = dose, y = len)) +
  geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  labs(title = "Tooth Growth by Dose and Supplement Type",
       x = "Dose (mg/day)",
       y = "Tooth Length",
       fill = "Supplement Type") +
  theme_minimal()

print(p)

Now lets look at the gvgplot facit function

p + facet_grid(rows = vars(supp))

Now lets do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols = vars(supp))

p

Now lets look at the facet_wrap function. This allows facets to be placed side by side

p + facet_wrap(vars(dose), ncol = 2)

Now how do we combine multiple plots using ggarrange(

Lets start by making some basic plots. First we will define a color palette and data

my3cols <- c("#e7B800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) + 
  scale_color_manual(values = my3cols)

ok now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose), 
                       binaxis = 'y', stackdir = 'center') + 
  scale_color_manual(values = my3cols) + 
  scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert)) + 
  geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure

This looks great, but we can make it look even better

library(ggplot2)
library(ggpubr)

# Create example plots (replace these with your actual plots)
lp <- ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() + 
  labs(title = "Weight vs MPG")

bxp <- ggplot(mtcars, aes(x = as.factor(cyl), y = mpg)) + 
  geom_boxplot() + 
  labs(title = "MPG by Cylinders")

dp <- ggplot(mtcars, aes(x = mpg)) + 
  geom_density() + 
  labs(title = "MPG Density")

# Now create the composite figure
figure2 <- ggarrange(
  lp, 
  ggarrange(bxp, dp, ncol = 2, labels = c("B", "C")),
  nrow = 2,
  labels = "A"
)

# Display the figure
figure2

ok this looks really good, but youll notice that there are thwo legends that are the same.

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom")

Lastly, we should export the plot

ggexport(figure2, filename = "facefigure.pdf")
## file saved to facefigure.pdf

we can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")
## file saved to multi.pdf

lastly we can export to pdf with multiple pages and multiple columns

ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol = 1)
## file saved to test2.pdf

Heat Maps

Lets get started with heatmaps

#install.packages(heatmap3)
library(heatmap3)

Now lets get our data

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, colv = NA)
## Warning in plot.window(...): "colv" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "colv" is not a graphical parameter
## Warning in title(...): "colv" is not a graphical parameter

Now lets play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now lets apply our color selections

heatmap(data2, colsidecolors = cc)
## Warning in plot.window(...): "colsidecolors" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "colsidecolors" is not a graphical parameter
## Warning in title(...): "colsidecolors" is not a graphical parameter

library(RColorBrewer)
heatmap(data2, ColSideColors = cc, 
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Theres more that we can customize

# Load required libraries
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
## 
##     plotCI
## The following object is masked from 'package:stats':
## 
##     lowess
library(RColorBrewer)

# Assuming data2 is your data matrix
# If it's not already a matrix, convert it
data2 <- as.matrix(data2)

# Create color palette for the heatmap
heatmap_colors <- colorRampPalette(brewer.pal(8, "PiYG"))(25)

# Create column side colors (example)
# Replace this with your actual column side colors if you have them
cc <- rainbow(ncol(data2))

# Create the heatmap
heatmap.2(data2, 
          ColSideColors = cc,
          col = heatmap_colors,
          trace = "none",  # Remove trace lines
          density.info = "none",  # Remove density plot
          key = TRUE,  # Show color key
          keysize = 1.5,
          cexRow = 0.6,  # Adjust row label size
          cexCol = 0.6,  # Adjust column label size
          margins = c(8, 8),  # Adjust margins
          main = "Heatmap of data2")

missing values

If you encounter a unusual value in your dataset, and simply want to move on to the rest of your analysis, you have two options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

In this instance, y is width of the diamond, so anything under 3mm or above 20 is excluded.

I dont reccommend this option, just because there is one bad measurement doesn’t mean they are all bad

Instead, I recommend replacing the unusual values with missing values

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values shouldnt pass silently into the right

ggplot(data = diamonds3, mapping = aes(x = x, y=y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to supress that warning you can use na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) + 
  geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with missing values different to the observations with recorded values. For example, in the NYCflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled so you might want to tocompare the scheduled depearture times for cancelled and non cancelled times

library(nycflights13)
library(dplyr)
library(ggplot2)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100, 
    sched_min = sched_dep_time %% 100, 
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1/4)

Outliers:

What if we want to knwo what our outliers are?

First we need to load the required libraries

library(outliers)
library(ggplot2)

and reload the dataset because we removed outliers

Air_data <- (“AirQualityUCI.xlsx”)

Lets create a function using the grubb test to identify all outliers The grubbs test identifies outliers in a unvariate dataset that is presumed to come from a normal distribution

#’ Detect outliers using Grubbs’ test #’ #’ This function applies Grubbs’ test iteratively to detect outliers in a univariate dataset. #’ #’ @param x A numeric vector containing the data to be tested for outliers. #’ @param threshold The p-value threshold for considering a value as an outlier (default: 0.05). #’ #’ @return A data frame with the original data and a boolean vector indicating outliers. #’ #’ @examples #’ data <- c(1, 2, 3, 4, 5, 100) #’ result <- grubbs_flag(data) #’ print(result) #’ #’ @importFrom outliers grubbs.test #’ @export grubbs_flag <- function(x, threshold = 0.05) { if (!is.numeric(x)) { stop(“Input ‘x’ must be a numeric vector.”) }

if (!requireNamespace(“outliers”, quietly = TRUE)) { stop(“Package ‘outliers’ is required. Please install it.”) }

outliers <- logical(length(x)) test <- x

repeat { grubbs_result <- outliers::grubbs.test(test) pv <- grubbs_result$p.value

if (pv >= threshold) break

outlier_value <- as.numeric(strsplit(grubbs_result$alternative, " ")[[1]][3])
outlier_index <- which(x == outlier_value)[1]
outliers[outlier_index] <- TRUE
test <- x[!outliers]

}

return(data.frame(x = x, is_outlier = outliers)) }

#identified_outliers <- grubbs.flag(Air_data$AH)

Now we can create a histogram showing where the outliers were

ggplot(grubbs.flag(Air_data\(AH), aes(x=Air_data\)AH, color = outlier, fill = outlier)) + geom_histogram(bindwidth = diff(range(air_data$))/30)+ theme_bw()

Covariation

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Its hard to see the difference in distribution because the counts differ so much

ggplot(diamonds) + 
  geom_bar(mapping = aes(x = cut))

to make the comparison easier, we need to swap the display on the y-axis. instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + 
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe thats because frequency polygons are la little hard to interpret.

another alternative is the boxplot. A boxplot is a type of visual shorthand for a distributionof values.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + geom_boxplot

We see much less information about the disrubution, but the boxplots are much more compact, so we can more easilty compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average!

lets look at some car data

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can switch the axis and flip it 80 degrees.

ggplot(data = mpg) + 
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + 
  coord_flip()

To visulaize the correlation between two continupus variables, we can use a scatter plot.

ggplot(data = diamonds)  +
  geom_point(mapping = aes(x = carat, y = price ))

scatterplots become less useful as the size of your dataset grows, becuase we get overplot. we can fix this using the alpha aesthetic

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

Exploratory Data Analysis

First lets load a required library

# Install packages if not already installed
if (!requireNamespace("RCurl", quietly = TRUE)) install.packages("RCurl")
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")

# Load libraries
library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

college_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(college_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

what if we want arrange our dataset alphabedically by college?

alphabetical <- college_Data %>%
  arrange(college_Data$college)

The glimpse package is another way to preview data

glimpse(college_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

we can also subset with select()

college_cases <- select(college_Data, college, cases)

we can also filter or subset with the filter function

Louisiana_cases <- filter(college_Data, state =="Louisiana")

Lets filter our smaller amount of states

south_cases <- filter(college_Data, state =="Louisiana" | state == "Texas" | state == "Arkansas"
| state == "Mississippi")

Lets look at some time series data

first we’ll load the required libraries

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

state_Data <- read.csv(state_site)

Lets create group_by object using the state column

state_cases <- college_Data %>%
  group_by(state) %>%
  summarize(total_cases = sum(cases, na.rm = TRUE))

class(state_cases)
## [1] "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting

Days_since_first_reported <- tally(state_cases)

Lets visualize some data

First lets start off with some definitions

Data - obvious - the stuff we want to visualize

Layer - made of geometric elements and requisite statistical information. Include geometric objects which represent the plot

Scales = used to map values in the data space that is used for creation of values (color, size, shape, etc)

coordinate system - describes how the data coordinates are mapped together in relation to the plan on the graphic

Faceting - how to break up data in to subsets to display multiple types or groups of data

theme - controls the finer points of the display, such as font size and backgroud color

options(repr.plot.width = 6, rep.plot.height = 6)

class(college_Data)
## [1] "data.frame"
head(college_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(college_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Lets start by creating a scatter plot of the college data

ggplot(data = college_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal() +
  labs(title = "Total COVID-19 Cases vs Cases in 2021 for Colleges",
       x = "Total Cases",
       y = "Cases in 2021")
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  theme_minimal()

Lets color coordinate our college data

ggplot(data = college_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal() +
  labs(title = "Total COVID-19 Cases vs Cases in 2021 for Colleges",
       x = "Total Cases",
       y = "Cases in 2021")
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Lets color coordinate the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
  geom_point() +
  theme_minimal() +
  labs(title = "Iris Sepal Width vs Length",
       x = "Sepal Width (cm)",
       y = "Sepal Length (cm)")

Lets run a simple histogram of our louisiana case data

hist(Louisiana_cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total cases", ylab = "frequency", min = " Total college covid-19 Infections (Louisiana)")
## Warning in plot.window(xlim, ylim, "", ...): "min" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "min"
## is not a graphical parameter
## Warning in axis(1, ...): "min" is not a graphical parameter
## Warning in axis(2, at = yt, ...): "min" is not a graphical parameter

Lets run a simpke histogram for the Iris data

library(ggplot2)

ggplot(iris, aes(x = Sepal.Width)) +
  geom_histogram(binwidth = 0.2, fill = "skyblue", color = "black") +
  geom_vline(aes(xintercept = mean(Sepal.Width)), color = "red", linetype = "dashed", size = 1) +
  labs(title = "Histogram of Iris Sepal Width",
       x = "Sepal Width (cm)",
       y = "Frequency") +
  theme_minimal() +
  annotate("text", x = max(iris$Sepal.Width), y = 30, 
           label = paste("Mean =", round(mean(iris$Sepal.Width), 2)), 
           hjust = 1, vjust = 1)

library(ggplot2)

histogram_college <- ggplot(data = Louisiana_cases, aes(x = cases))

histogram_college + 
  geom_histogram(binwidth = 100, color = "black", fill = "skyblue") +
  labs(x = "Number of Cases", 
       y = "Frequency", 
       title = "Histogram of COVID-19 Cases in Louisiana",
       subtitle = paste("Total cases:", sum(Louisiana_cases$cases))) +
  theme_minimal() +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma)

Lets create a ggplot for the IRIS data

library(ggplot2)

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + 
  geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) +
  labs(x = "Sepal Width (cm)", 
       y = "Frequency", 
       title = "Histogram of Iris Sepal Width by Species",
       fill = "Species") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

Maybe a density plot makes more sense for our college data

library(ggplot2)

ggplot(south_cases, aes(x = cases, fill = state)) + 
  geom_density(alpha = 0.50) +
  labs(title = "Density Plot of COVID-19 Cases in Southern States",
       x = "Number of Cases",
       y = "Density",
       fill = "State") +
  theme_minimal() +
  scale_x_continuous(labels = scales::comma) +
  scale_fill_brewer(palette = "Set2")

Lets do it with the iris data

library(ggplot2)

ggplot(iris) +
  geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25) +
  labs(title = "Density Plot of Iris Sepal Width by Species",
       x = "Sepal Width (cm)",
       y = "Density",
       fill = "Species") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

library(ggplot2)

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) + 
  geom_violin() + 
  geom_boxplot(width = 0.1, fill = "white") +
  theme_classic() + 
  theme(legend.position = "none") +
  labs(title = "Distribution of Sepal Length by Iris Species",
       x = "Species",
       y = "Sepal Length (cm)")

Now lets try the south data

ggplot(data = south_cases, aes(x=state, y = cases, color = state)) + 
  geom_violin() + 
  theme_gray() + 
  theme(legend.position = "none")

Now lets take at risidual plots. This graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispered in a random manner around the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed, a non linear model is more appropriate.

Lets start with the iris data

ggplot(lm(sepal.length = sepal.width, data = iris)) + 
  geom_point(aes(x=.fitted, y = .resid))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'sepal.length' will be disregarded

Now look at the souther states cases

ggplot(lm(cases ~ cases_2021, data = south_cases)) + 
  geom_point(aes(x=.fitted, y = .resid))

A linear model is not a good call for the state cases

Now lets do some correlations

obesity <- read.csv("Obesity_insurance.csv")

Lets loook at the structure of the data set

str(obesity)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Lets look at the column classes

class(obesity)

We can also get an idea of the distribution using a boxplot

class(obesity)
## [1] "data.frame"
boxplot(obesity$charges)

Now lets look at correlations. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or two data frames. The cov() command, on the other hand examines the covariance. The cor.test() command carries out a test as to the significance of the correlation

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

This test uses a spearman Rho correlation or you can use Kendalls tau by specifying it

cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397

This correlation measures strength of a correlation between -1 and 1.

Now lets look at the Tietjen=Moore Test. This is used for univariate datasets. The algorith depicts the detection of the outliers in a unvariate datasets. The algorithm depicts the detection of the outliers in a unvariate dataset.

TietjenMoore <- function(dataseries, k)
{
  n <- length(dataseries)
  # compute the absolute residuals 
  r <- abs(dataseries - mean(dataseries))
  # sort data according to size of residual 
  df <- data.frame(dataseries, r)
  dfs <- df[order(df$r),]
  # create a subset of the data without the largest values. 
  klarge <- c((n-k+1):n)
  subdataseries <- dfs$dataseries[-klarge]
  # compute the sums of squares. 
  ksub <- sum((subdataseries - mean(subdataseries))^2)
  all <- sum((df$dataseries - mean(df$dataseries))^2)
  # compute the test statistic 
  test_statistic <- ksub / all
  
  return(test_statistic)
}

This function helps to compute the absolute residuals and sorts the data according to the size of the residuals. Later, we will focus on the computation of sum of squares.

FindoutlierTietjenMooreTest <- function(dataseries, k, alpha = 0.5){
  ek <- TeijenMoore(dataseries, k)
  # compute critical values based on simulation
  test = c(1:10000)
  for (i in 1:10000){
    dataseriesdataseries = rnorm(length(dataseries))
    testi = TietjenMoore(dataseriesdataseries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
  }

This function helps to compute the absolute resudiaulas and sort data according to the size of the residuals. Later we will focus on the computationof sum of squares.

#' Tietjen-Moore Test for Outliers with Simulation
#'
#' This function performs the Tietjen-Moore test for outliers and computes
#' critical values based on simulation.
#'
#' @param dataseries A numeric vector containing the data to be tested for outliers.
#' @param k The number of potential outliers to test for.
#' @param alpha The significance level (default: 0.05).
#' @param simulations The number of simulations to run (default: 10000).
#'
#' @return A list containing the test statistic (T) and the critical value (Talpha).
#'
#' @examples
#' data <- c(1, 2, 3, 4, 5, 100, 101)
#' result <- FindoutlierTietjenMooreTest(data, 2)
#' print(result)
#'
#' @export
FindoutlierTietjenMooreTest <- function(dataseries, k, alpha = 0.05, simulations = 10000) {
  if (!is.numeric(dataseries)) {
    stop("Input 'dataseries' must be a numeric vector.")
  }
  
  if (!is.numeric(k) || k < 1 || k >= length(dataseries)) {
    stop("'k' must be a positive integer less than the length of 'dataseries'.")
  }
  
  if (!is.numeric(alpha) || alpha <= 0 || alpha >= 1) {
    stop("'alpha' must be a number between 0 and 1.")
  }
  
  ek <- TietjenMoore(dataseries, k)
  
  # Compute critical values based on simulation
  test <- numeric(simulations)
  n <- length(dataseries)
  
  for (i in 1:simulations) {
    simulated_data <- rnorm(n)
    test[i] <- TietjenMoore(simulated_data, k)
  }
  
  Talpha <- quantile(test, 1 - alpha)  # Changed to 1 - alpha for upper tail
  
  return(list(T = ek, Talpha = Talpha))
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obesity dataset for evaluating the algorithm.

The critical region for the Tietjen.Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and comuting the Tietjen Moore test statistic. Typlically, 10000 random sample are use The values of the Tietjen Moore statistic is between zero and one. If there are no outliers in the data, the test statistic is close to 1. If there are outliers the test statistic will be closer to zero. Thus, test is alwasy a lower, one tailed test regardless of which test statistic issued, lk or ek

First we will look at charges

boxplot(obesity$charges)

FindoutlierTietjenMooreTest(obesity$charges, 4)
## $T
## [1] 0.9528324
## 
## $Talpha
##      95% 
## 0.974038

Lets check out bmi

boxplot(obesity$bmi)

FindoutlierTietjenMooreTest(obesity$bmi, 2)
## $T
## [1] 0.9801578
## 
## $Talpha
##       95% 
## 0.9862772

Probability Plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probablity plot function and their output dnorm: density function of the normal distributution. Using the density, it is possible to determine the probability of events. Or for examples, you may wonder “what is the likelihood that a person has a BMI of exactly ___? IKn this case, you would need to retrieve the density of th eBMI distribution at values 140. The IQ distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Lets create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = density)) + 
  geom_point()

This gives us the probability of every single point occuring

Now lets use the pnorm function for more info

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) + 
  geom_point()

What if we want to find probabiltiy of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x,, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)
## [1] "6.29 %"

What about probability that a bmi is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30, sd = 6, lower.tail = FALSE)),2), "%")
  
}

pp_less(40)
## [1] "95.22 %"
pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Lets use the pnorm function. we need to assume a normal distribution for this.

what bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if you want a random sampling of valies within you distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

Shapiro-wilk test

So now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

Now lets check out age

shapiro.test(obesity$charges[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16

and lastly bmi

shapiro.test(obesity$bmi[1:500])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:500]
## W = 0.99623, p-value = 0.2831

Time series data

First lets load our packages

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(readxl)

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Date - date of measurement time - time of measurement co(GT) - average hourly co2 PT08,s1(co) - tin oxide hourly average sensor response NMHC - average hourly non-metallic hydrocarbon concentration c6HC - average benzene concentration PT08.S3(NMHC) - titania average hourly sensor response NOX - average hourly NOx concentration NO2 - Average hourly NO2 concentration T - temper RH - relative humidity AH - Absolute humidity

str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Lets get rid of the date in the time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis" , xlab = "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

t.test(Air_data$RH, Air_data$AH)
## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

Text mining:

First we’ll look at the unrest_token function

Lets start by looking at an Emily Dickenson passage

text <- c("because I could not stop from Death - ",
          "He kindly stopped for  me - ", 
          "the carriage held but just ourselves -", 
          "and immortality")

text 
## [1] "because I could not stop from Death - "
## [2] "He kindly stopped for  me - "          
## [3] "the carriage held but just ourselves -"
## [4] "and immortality"

This is a typlical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "because I could not stop from Death - "
## 2     2 "He kindly stopped for  me - "          
## 3     3 "the carriage held but just ourselves -"
## 4     4 "and immortality"

Reminder: A tibble is a modern class of data frame within R. Its available in the dplyr and tibble package. that has a convenient print method, will not convert strings to factors, and does not use row names. Tibbles are great for use with tidy tools.

Next we will use the ‘unest_tomens’ function.

First we have the output column name that will be created as the text is unested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeaustinr package jane austin texts. there are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(), 
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup()

original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

To work this as a tidy dataset, we need to restructure it in the one-token-per-row format, which as we saw earlier is done with the untest_tokens() function

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function uses the token izers package to seperate each line of text in the original dataframe into tokens.

The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

NOw that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr

often in text analysis we will want to remove stop words. stop words are words that are not useful for an analysis. These include words like the, of, to, and, and so forth.

we can remove stop words ( kept in the tidytext dataset ‘stop_words’) with an anti_join().

data(stop_word)
## Warning in data(stop_word): data set 'stop_word' not found
tidy_books <- tidy_books %>% 
  anti_join(stop_words)
## Joining with `by = join_by(word)`

The stop words dataset in the tidytext package contains stop words from three locations. we can use them all together, as we have here, or filter() to only one set of stop words if thats more appropriate for your analysis

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows use to pipe this directly into ggplot2. From example we can create a visualization of the most common words.

library(janeaustenr)
library(dplyr)
library(tidytext)
library(ggplot2)

# Create tidy_books data frame
tidy_books <- austen_books() %>%
  unnest_tokens(word, text)

# Create and display the plot
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() + 
  labs(y = NULL, x = "Word Count", title = "Most Frequent Words in Jane Austen's Books") +
  theme_minimal()

The gutenbergr package

This package provides acess to the public domain works form the gutenberg project (www.gutenberg.org). This package includes tools for both downloading books and a complete dataset of project gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenber_download().

word frequencies

lets look at some biology texts, starting with Darwin

the voyage of the beagle - 944 on the orgin of species by the means of natural selection - 1228 the expression of emotion in man and animals - 1227 the descent of man, and selection in relation to sex - 2300

we can access these works using the gutenberg_download() and the Project Gutenberg IDnumbers

install.packages(“gutenbergr”) library(gutenbergr)

#darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = “https://mirror2.sandydriver.net/pub/gutenberg”)

Lets break these into tokens

tidy_darwin <- darwin %>% unrest_tokens(word, text) %>% anti_join(stop_words)

Lets check out what the most common darwin words are

tidy_darwin %>% count(word, sort = TRUE)

Now lets get some work from Thomas Hunt Morgan, who is credited with discovering chromosomes

regeneration - 57198 The genetic and operative evidence relating to secondary sexual characteristics - 57460 Evolution and adaptation - 63540

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = “https://mirror2.sandydriver.net/pub/gutenberg”)

Lets tokenize THM

tidy_morgan <- morgan %>% unnest_tokens(word, text) %>% anti_join(stop_words)

What are THM’s most common words?

tidy_morgan %>% count(word, sort = TRUE)

Lastly Lets look at Thomas Henry Huxley

Evidence as to mans place in nature - 1931 on the reception of the orgin of species -2089 Evolution and Ethics, and other essays - 2940 Science and culture, and other essays = 52344

huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror = “https://mirror2.sandydriver.net/pub/gutenberg”)

tidy_huxley <- huxley %>% unnest_tokens(word, text) %>% anti_join(stop_words)

tidy_huxley %>% count(word, sort = TRUE)

Now Lets calculate the frequency for each word for the works of Darwing, Morgan and huxley by binding the frames together.

library(dplyr)

frequency <- bind_rows(mutate(tidy_morgan, author = “Thomas Hunt Morgan”), mutate(tidy_darwin, author = “Charles Darwin”), mutate(tidy_morgan, author = “Thomas Henry Huxley”)) %>% mutate(word = str_extract(word, “[a-z’]+”)) %>% count(author, word) %>% group_by(author) %>% mutate(proportion = n/ sum(n)) %>% select(-n) %>% pivot_wider(names_from = author, values_from = proportion) %>% pivot_longer(‘Thomas Hunt Morgan’: ‘Charles Darwin’, names_to = “author”, values_to = “proportion”)

frequency

Now we need to change the table so that each author has its own row

frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)

frequency2

Now lets plot

library(scales)

ggplot(frequency2, aes(x = ‘charles darwin’ , y = ‘Thomas Hunt Morgan’, color = abs(‘charles darwin’ - ‘Thomas hunt Morgan’))) + geom_abline(color = “gray30”, lty = 2) + geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) + geom_text(aes(labels - word), check_overlap = TRUE, vjust = 1.5) + scale_x_log10(labels = percent_format()) + scale_y_log10(labels = percent_format()) + scale_color_gradient(limits = c(0, 0.001), low = “darkslategrey4” high = “gray75”) + facet_wrap(~author, ncol = 2) + theme(legend.position=“none”) + labs(y = “Thomas Hunt Morgan”, x = “Charles Darwin”)

library(scales)

ggplot(frequency2, aes(x = ‘charles darwin’ , y = ‘Thomas Henry Huxley’, color = abs(‘charles darwin’ - ‘Thomas Henry Huxley’))) + geom_abline(color = “gray40”, lty = 2) + geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) + geom_text(aes(labels - word), check_overlap = TRUE, vjust = 1.5) + scale_x_log10(labels = percent_format()) + scale_y_log10(labels = percent_format()) + scale_color_gradient(limits = c(0, 0.001), low = “darkslategrey4” high = “gray75”) + facet_wrap(~author, ncol = 2) + theme(legend.position=“none”) + labs(y = “Thomas Henry Huxley”, x = “Charles Darwin”)

Sentimental Analysis:

Then sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text

AFFIN bing nrc

bing categorizes words in a binary fashion into positive or negative nrc categorizes into positive, negative, anger, anticipation, discust, fear, joy, sadness, suprise and trust. AFFIN assigns a score between -5 and 5, with negative indicationg negative sentimetn, and 5 positive The function get_sentiments() allows us to get the specific sentiments lexicon with the meansures for each one

library(tidytext) library(textdata)

afinn <- get_setniments(“afinn”)

afinn

And lastly nrc

nrc <- get_sentiments(“nrc”)

nrc

These libraries were created either using crowdourcing or cloud computing/ai like Amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing

library(gutenbergr) library(dplyr) library(stringr)

darwin <- gutenbergr_download(c(944, 1227, 1228, 2300), mirror = “https://mirror2.sandyriver.net/pub/gutenberg”)

tidy_books <- darwin %>% group_by(gutenberg_id) %>% mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex(“^chapter [\divxlc]”, ignore_case = TRUE)))) %>% ungroup()%>% unnest_tokens(word, text)

tidy_books

Lets add the book name instead of GID

colnames(tidy_books)[1] <- “book”

tidy_books\([tidy_books\)book == 944] <- “The voyage of the Beagle” tidy_books\([tidy_books\)book == 1227] <- “The Expression fo the emotions in Man and Animals” tidy_books\([tidy_books\)book == 1228] <- “On the orgin of species by means of Natural Selection” tidy_books\([tidy_books\)book == 2300] <- “The Descent of Man, and selection in Reflection to Sex”

tidy_books

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments(“nrc”) %>% filter(sentiment == “joy”)

tidy_books %>% filter(book == “The voyage of the Beagle”) %>% inner_join(nrc_joy) %>% count(word, sort = TRUE)

We can also examine how sentiment changes throughout a work

library(tidy)

charles_darwin_sentiment <- tidy_books %>% inner-join(get_sentiments(“bring”)) %>% count(book, index = linenumber) %/% 80, sentiment) %>% pivot_wider(names_from = sentiment, values_from = n, value_fill = 0) %>% mutate(sentiment = positive - negative)

Now lets plot it

library(ggplot2)

ggplot(charles_darwin_sentimet, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = “free_x”)

Lets compare the three sentiment dictions

There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of TVOTB

library(tidyr)

voyage <- tidy_books %>% filter(book == “The voyage of the beagle”)

voyage

Lets again use interger division (‘%%’) to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.

affin <- voyage %>% inner_join(get_sentiments(“affinn”)) %>% group_by(index = linenumber %/% 80) %>% summarise(sentiment = sum(values)) %>% mutate(method = “AFINN”)

bing_and_nrc <- bind_rows( voyage %>% mutate(method = “bing et al.”), voyage %>% inner_join(get_sentiments(“nrc”) %>% filter(sentiment %in% c(“positive”, “negative”)) ) %>% mutate(method = “NRC”)) %>% count(method, index = linenumber %/% 80,sentiment) %>% pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% mutate(sentiment = positive - negative)

We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexion (dictionary). Lets bind them all together and visualize with ggplot

bind_rows(affin, bing_and_nrc) %>% ggplot(aes(index, sentiment, fill = method)) + geom_col(show.legend = FALSE) + facet_wrap(~method, ncol = 1, scales = “free_y”)

Lets look at the counts based on each dictionary

get_sentiments(“nrc”) %>% filter(sentiment %in% c(“positive”, “negative”)) %>% count(sentiment)

get_sentiments(“bing”) %>% count(sentiment)

bing_word_counts <- tidy_books %>% inner_join(get_sentiments(“bing”)) %>% count(word, sentiment, sort = TRUE) %>% ungroup()

bing_word_counts

This can be shown visually, and we can pipe straight into ggplot2

bing_word_counts %>% group_by(sentiment) %>% slice_max(n, n = 10) %>% ungroup() mutate(word = recorder(word, n)) %>% ggplot(aes(n, word, fill = sentiment)) + facet_wrap(~sentiment, scale = “free_y”) + labs(x = “contribution to sentiment”, y = NULL)

Lets spot an anomoly in the dataset

custom_stop_words <- bind_rows(tibble(words = c(“wild”, “dark”, “great”, “like”), lexicon = c(“custom”)), stop_words)

custom_stop_words

Word Clouds!

we can see that tidy text mining and sentiment analysis works well with ggplot2, but havign our data in tidy format leads to other nice graphing techniques

lets use the wordcloud package!!

library(wordcloud)

tidy_books %>% anti_join(stop_words) %>% count(word) %>% with(wordcloud(word, n, max.words = 100))

lets also look at comparison.cloud(), which may require turning the dataframe into a matrix.

we can change to matrix using the acast() function.

tidy_books %>% inner_join(get_sentiments(“bing”)) %>% count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = “n”, fill = 0) %>% comparison.cloud(colors = c(“gray20”, “gray80”)), max.words = 100)

Looking at units beyond words

Lots of useful work can be done by tokenizing at the word level, but sometimes its nice to look at different units of text. For example, we can look beyond just unigrams.

Ex. I am not having a good day.

bingnegative <- get_sentiments(“bing”) %>% filter(sentiment == “negative”) wordcounts <- tidy_books %>% group_by(book, chapter) %>% summarize(words = n())

tidy_books %>% semi_join(bingnegative) %>% group_by(book, chapter) %>% summarize(negativewords = n()) %>% left_join(wordcounts, by = c(“book”, “chapter”)) %>% mutate(ratio = negativeword/words) %>% filter(chapter !=0) %>% slice_max(ratio, n = 1) %>% ungroup()

N-grams:

So far we’ve only looked at single words, but many interesting (more accurate) analyses

library(dplyr) library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = “https://mirror2.sandyriver.net/pub/gutenberg”)

colnames(tidy_books)[1] <- “book”

darwin_books\(darwin_books\)book == 944] <- “The voyage of the Beagle” darwin_books\([darwin_books\)book == 1227] <- “The Expression fo the emotions in Man and Animals” darwin_books\([darwin_books\)book == 1228] <- “On the orgin of species by means of Natural Selection” darwin_books\([darwin_books\)book == 2300] <- “The Descent of Man, and selection in Reflection to Sex”

darwin_bigrams <- darwin_books %>% unrest_tokens(bigram, text, token = “ngrams”, n = 2)

darwin_bigrams

This data is still in tidytext format, and is structured as one-token-per-row. Each token is a bigram.

counting and filtering n-gram

darwin_bigrams %>% count(bigram, sort = TRUE)

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s seperate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>% separated(biggram, c(“word1”, “word2”), sep = ” “)

bigrams_filtered <- bigrams_separated %>% filter(!word1 %in% stop_words\(word) %>% filter(!word2 %in% stop_words\)word)

bigrams_filtered

New bigram counts

bigram_counts <- bigrams_filtered %>% unite(bigram, word1, word2, sep = ” “)

bigram_counts

We may also be interested in trigrams, which are three word combos

darwin_books %>% unnest_tokens(trigram, texr, token = ngrams, n = 3) %>% seperate(trigram, c(“word1”, “word2”, “word3”, sep = ” “) %>% filter(!word1 %>% stop_words\(word, !word2 %>% stop_words\)word, !word3 %in% stop_words$word) %>% count(word1, word2, word3, sort = TRUE)

darwin_books

Lets analyze some bigrams

bigrams_filtered %>% filter(word2 == “selection”) %>% count(book, word1, sort = TRUE)

Lets again look at tf-idf across bigrams across darwins works

bigram_tf_idf <- bigrams_united %>% count(book, bigram) %>% bind_tf_idf(bigram, book, n) %>% arrange(desc(tf_idf))

bigram_tf_idf

bigram_tf_idf %>% arrange(desc(tf_idf)) %>% group_by(books) %>% slice_max(tf_idf, n = 100) %>% ungroup() %>% mutate(bigram = recorder(bigram, tf-idf)) %>% ggplot(aes(tf_idf, bigram, fill = book)) + geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = “free”) + labs(x = “tf-idf of bigrams”, y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams _seperated %>% filter(word1 == “not”) %>% count(word1, word2, sort = TRUE)

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.

AFINN <- get_setniments(“afinn”)

AFINN

We can examine the most frequent words that were preceded by “not”, and associate with sentiment.

not_words <- biograms_separeated %>% filter(word1 == “not”) %>% inner_join(AFINN, by = c(word2 = “word”)) %>% count(word2, value, sort = TRUE)

not_words

Lets vizualize

library(ggplot2)

not_words %>% mutate(contribution = n * values) arrange(desc(abs(contribution))) %>% head(20) %>% mutate(word2 = recorder(word2, contribution)) %>% ggplot(aes(n * value, word2 fill = n * value > 0)) + geom_col(show.legend = FALSE) + labs(x = “sentimant value * number or occurences”, y = “words preceded by "not"’)

negation_words <- c(“not”, “no”, “never”, “non”, “without”)

negated_words <- bigrams_separated %>% filter(word1 %in% negation_words) %>% inner_join(AFINN, by = c(word2 = “word”)) %>% count(word1, word2, value, sort = TRUE)

Lets vizualize the negation words

negated_words %>% mutate(contribution = n * value, word2 = recorder(paste(word2, word1, sep = “”). contribution)) %>% group_by(word1) %>% slice_max(abs(contribution), n = 12, with_ties = FALSE) %>% ggplot(aes(word2, contribution, fill = n* value > 0)) + geom_col(show.legend = FALSE) + facet_wrap(~ word1, scales = ”free”) + scale_discrete(labels = function(x) gsub(”.+$”, ““, x)) + xlab(”words preceded by negation term”) + ylab(“sentiment value * # of occurences”) + coord_flip()

vizualize a network of bigrams with ggraph

library(graph)

bigram_counts <- bigrams_filtered %>% count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>% filter(n > 20) %>% graph_from_ data_frame()

bigram_graph

library(ggraph) set.seed(1234)

ggraph(bigram_graph, layout = “fr”) + geom_edge_link() + geom_node_point() + geom_node_text(aes(label = name), vjust = 1, hjust = 1)

We can also add directionality to this network

set.seed(1234)

a <- grid::arrow(type = “closed”. length = unit(0.15, “inches”))

ggraph(bigram_graph, layout() = “fr”) + geom_edge_link(aes(edge_alpha = m), show.legend = FALSE, arrow = a, end_cap = circle(0.7, ‘inches’)) + geom_node_point(color = “lightblue”, size = 5) + geom_node_text (aes(label=name), vjust = 1, hjust = 1) + theme_void()

Word Frequencies:

A central questionin text mining is how to quantify what a document is about. We can do this but looking at words that make up a document, and measuring term frequency.

There are a lot of words that may not be important, these are the stop words.

one way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly used words and increases with the weight for words that are not used very much.

library(dplyr) library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = “https://mirror2.sandyriver.net/pub/gutenberg”)

colnames(books_words)[1] <- “book”

darwin_words\(book[book_words\)book == 944] <- “The voyage of the Beagle” darwin_words\([book_words\)book == 1227] <- “The Expression fo the emotions in Man and Animals” darwin_words\([book_words\)book == 1228] <- “On the orgin of species by means of Natural Selection” darwin_words\([book_words\)book == 2300] <- “The Descent of Man, and selection in Reflection to Sex”

Now lets disect

book_words <- book_words %>% unnest_tokens(word, text) %>% count(book, word, sort = TRUE)

book_words

book_words\(n <- as.numeric(book_words\)n)

total_words <- book_words %>% group_by(book) %>% summarize(total = sum(n))

book_words

book_words <- left_join(book_words, total_words)

book_words

You can see that the usual suspects are the most common words, but dont tell us anything about about what the books topic is.

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) + geom_histogram(show.legend = FALSE) + xlim(NA, 0.0009) + facet_wrap(~book, ncol = 2, scales = “free_y”)

zoipf’s Law

The frequency that a word appears is inversely proportional to its rank when predicting a topic.

lets apply zipf’s law to Darwins work

freq_by_rank <- book_words %>% group_by(book) mutate(rank = row_number(), ‘term frequency’ = n/total %>% ungroup()

freq_by_rank

freq_by_rank %>% ggplot(aes(rank, ‘term frequency’, color = book)) + geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + scale_x_log10() + scale_y_log10()

Let us TF - IDF to find words for each document by decreasing by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents.

book_tf_idf <- book_words %>% bind_tf_idf(word, book, n)

book_td_idf

Lets look at terms with high tf-idf in Darwins works

book_tf_idf %>% select(-total) %>% arrange(desc(tf_idf))

Lets look at a visualization for these high tf-idf words

book_tf_idf %>% group_by(book) %>^% slive_max(tf_idf, n = 15) %>% ungroup() %>% ggplot(aes(tf_idf, fct_recorder(word, tf_idf), fill = book)) + geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = “free”) + labs(x = “tf-idf”, y = NULL)