GG PLot

Now lets take a look at some ggplot2 barplots

we will start on making a dataframe based on the tooth data.

df <- data.frame(dose = c("D0.5", "D1", "D2"), 
                 len = c(4.2, 10, 29.5))

df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

and now lets make a second dataframe

df2 <- data.frame(supp=rep(c("vc", "oj"), each=3),
                  dose=rep(c("D0.5","D1","D2"), 2),
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))
df2
##   supp dose  len
## 1   vc D0.5  6.8
## 2   vc   D1 15.0
## 3   vc   D2 33.0
## 4   oj D0.5  4.2
## 5   oj   D1 10.0
## 6   oj   D2 29.5

lets load up ggplot2

library(ggplot2)

Lets set up our parameters for ggplot2

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f+geom_col()

Now lets change the fill, and add labels to the top

f+geom_col(fill="darkblue") +
  geom_text(aes(label=len), vjust=-0.3)

Now lets add some labels inside the bars

f+geom_col(fill="darkblue") +
  geom_text(aes(label=len), vjust=1.6, color="white")

Now lets change the barplot colors by group

f+geom_col(aes(color=dose), fill= "white") +
  scale_color_manual(values= c("blue", "gold","red"))

Lets change the fill

f+geom_col(aes(fill=dose))+
  scale_fill_manual(values=c("blue","gold","red"))

Lets do this with multiple groups

ggplot(df2, aes(x=dose, y=len))+
  geom_col(aes(color=supp, fill=supp,), position = position_stack())+
  scale_color_manual(values=c("blue","gold")) +
  scale_fill_manual(values=c("blue","gold"))

p <- ggplot(df2, aes(x=dose, y=len)) +
  geom_col(aes(color = supp, fill=supp), position = position_dodge(0.8), width=0.7) +
  scale_color_manual(values=c("blue", "gold"))+
  scale_fill_manual(values=c("blue","gold"))
p

Now lets add those labels to the dodged barplot

p+geom_text(
  aes(label=len, group=supp),
  position=position_dodge(0.8),
  vjust= -0.3, size= 3.5
)

Now what if we want to add labels to our stacked barplots? we need dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) -0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 vc    D0.5    6.8      3.4
## 2 oj    D0.5    4.2      8.9
## 3 vc    D1     15        7.5
## 4 oj    D1     10       20  
## 5 vc    D2     33       16.5
## 6 oj    D2     29.5     47.8

Lets recreate our stacked graphs

ggplot(df2, aes(x=dose, y=len))+
  geom_col(aes(fill=supp),width=0.7) +
  geom_text(aes(y=lab_ypos, label=len, group=supp), color="white")+
  scale_color_manual(values=c("blue", "gold"))+
  scale_fill_manual(values=c("blue","gold"))

data("ToothGrowth") 

lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)
head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load a ggplot

library(ggplot2)

Lets set the theme for our box plots to classic

theme_set(
  theme_bw()+
    theme(legend.position = "top")
)

Start with a very basic box template with dose vs length

tg <- ggplot(ToothGrowth, aes(x=dose, y=len))
tg+geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch= TRUE, fill="lightgrey")+
  stat_summary(fun.y = mean, geom="point", shape=18, size=2.5, color="indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change the scale number of variables included, and their order

tg+geom_boxplot()+
  scale_x_discrete(limits=c("0.5","2"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).

lets put our x axis in descending order

tg+geom_boxplot()+
  scale_x_discrete(limits=c("2","1","0.5"))

We can also change our boxplot colors by groups

tg +geom_boxplot(aes(color=dose))+
  scale_color_manual(values=c("indianred","blue1","green2"))

what if we want to display our data subset by oj vs vitamin c

tg2 <- tg+ geom_boxplot(aes(fill=supp), position = position_dodge(0.9))+
  scale_fill_manual(values=c("#999999","#E69F00"))

tg2

we can also arrange this as two plots with facet_wrap

tg2+facet_wrap(~supp)

set.seed(1234)

wdata= data.frame(
  sex= factor(rep(c("F", "M"), each=200)),
  weight=c(rnorm(200,56), rnorm(200,58))
)
head(wdata, 4)
##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now lets load dplyr

library(dplyr)
mu<- wdata%>%
  group_by(sex)%>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)
theme_set(
  theme_classic() +
  theme(legend.position="bottom")

)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") +
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values=c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

what if we want to combine density plots and histograms?

a + geom_histogram(aes(y=stat(density)),
                   color="black",fill="white") +
  geom_density(alpha=0.2, fill="#ff6666")
## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y=stat(density), color=sex),
                       fill="white",position="identity") +
                     geom_density(aes(color=sex), size=1)+
                     scale_color_manual(values=c("indianred","lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

First lets load the required packages

library(ggplot2)

Lets set our theme

theme_set(
  theme_dark()+
    theme(legend.position = "top"))

First lets initiate a ggplot object called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg<- ggplot(ToothGrowth, aes(x=dose, y=len))

Lets create a dotplot with a summary statistic

tg+geom_dotplot(binaxis="y", stackdir = "center", fill="white")+
  stat_summary(fun=mean,fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_segment()`).

Lets add a boxplot and a dotplot together

tg+geom_boxplot(width=0.5)+
  geom_dotplot(binaxis="y", stackdir = "center", fill="white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg+geom_violin(trim = FALSE)+
  geom_dotplot(binaxis = "y", stackdir = "center", fill="#999999")+
  stat_summary(fun=mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_segment()`).

Lets create a dotplot with multiple groups

tg+geom_boxplot(width=0.5)+
  geom_dotplot(aes(fill=supp), binaxis = "y", stackdir = "center")+
  scale_fill_manual(values=c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg+geom_boxplot(aes(color=supp), width=0.5, position=position_dodge(0.8))+
  geom_dotplot(aes(fill=supp, color=supp), binaxis="y", stackdir = "center",
               dotsize = 0.8, position = position_dodge(0.8))+
  scale_fill_manual(values=c("#00AFBB", "#E7B800"))+
  scale_color_manual(values=c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Now lets change it up and look at some lineplots

We will start by making a custom dataframe kinda like the tooth dataset. This way we can see the lines and stuff we’re modifying

df<- data.frame(dose=c("D0.5","D1","D2"),
                len=c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2<- data.frame(supp=rep(c("vc","oj"), each=3),
                 dose=rep(c("D0.5","D1","D2"), 2),
                 len=c(6.8, 15, 33, 4.2, 10, 29.5))
df2
##   supp dose  len
## 1   vc D0.5  6.8
## 2   vc   D1 15.0
## 3   vc   D2 33.0
## 4   oj D0.5  4.2
## 5   oj   D1 10.0
## 6   oj   D2 29.5

Now lets again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position="right"))

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <-function(){
  oldPar <- par()
  par(font=2, mar=c(0,0,0,0))
  plot(1, pch="", ylim=c(0,6), xlim=c(0,0.7), axes=FALSE, xlab="", ylab="")
  for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty=i, lwd=3)
  text(rep(0.1,6), 0:6, labels=c("0. 'Blank'", "1, 'Solid'", "2. 'dashed'", "3. 'dotted'",
                                 "4. 'dotdash'", "5. 'longdash", "5. 'twodash'"))
  par(mar=oldPar$mar, font=oldPar$font)
}
generateRLineTypes()

Now lets build a basic line plot

p<- ggplot(data=df, aes(x=dose, y=len, group=1))

p+geom_line() + geom_point()

Lets modify line type and color

p+geom_line(linetype="dashed", color="steelblue")+
  geom_point(color="steelblue")

Now lets try a step graph, which indicates a threshold type progression

p+geom_step()+ geom_point()

Now lets move on to making multiple groups. First we’ll create our ggplot object

p<- ggplot(df2, aes(x=dose, y=len, group=supp))

Now lets change line types and point shapes by group

p+geom_line(aes(linetype=supp, color=supp))+
  geom_point(aes(shape=supp, color=supp))+
  scale_color_manual(values=c("red", "blue"))

Lets look at line plots with a numeric x axis

df3<- data.frame(supp=rep(c("vc","oj"), each=3),
                 dose=rep(c("0.5","1","2"),2),
                 len=c(6.8, 15, 33, 4.2, 10, 29.5))

df3
##   supp dose  len
## 1   vc  0.5  6.8
## 2   vc    1 15.0
## 3   vc    2 33.0
## 4   oj  0.5  4.2
## 5   oj    1 10.0
## 6   oj    2 29.5

Now lets plot where both axises are treated as continuous labels

df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data=df3, aes(x=dose, y=len, group=supp, color=supp))+
  geom_line()+geom_point()

Now lets look at a line graph with having the x axis as dates. We will use the built in economics time series for this example.

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data=economics, aes(x=date, y=pop))+
  geom_line()

NOw lets subset the data

ss<- subset(economics, date> as.Date("2006-1-1"))
ggplot(data = ss, aes(x=date, y=pop))+geom_line()

We can also change the line size, for instance by another variable like unemployment

ggplot(data = economics, aes(x=date, y=pop))+
  geom_line(aes(size=unemploy/pop))

we can also plot multiple time series data

ggplot(economics, aes(x=date))+
  geom_line(aes(y=psavert), color="darkred")+
  geom_line(aes(y=uempmed), color="steelblue", linetype="twodash")

Lastly, lets make it into an area plot

ggplot(economics, aes(x=date))+
  geom_area(aes(y=psavert), fill="#999999",
            color="#999999", alpha=0.5)+
  geom_area(aes(y=uempmed), fill="#E69F00",
            color="#E69F00", alpha=0.5)

First lets load the required packages

library(ggplot2)
library(ggridges)
BiocManager::install("ggridges")
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: https://cloud.r-project.org
## Bioconductor version 3.18 (BiocManager 1.30.25), R 4.3.3 (2024-02-29)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'ggridges'
## Installation paths not writeable, unable to update packages
##   path: /usr/lib/R/library
##   packages:
##     boot, class, cluster, codetools, foreign, KernSmooth, lattice, nlme, nnet,
##     rpart, spatial, survival
## Old packages: 'adegenet', 'aplot', 'BiocFileCache', 'biomaRt', 'Biostrings',
##   'broom', 'BSgenome', 'bslib', 'car', 'cowplot', 'credentials', 'csaw',
##   'dbplyr', 'dendextend', 'DESeq2', 'GenomicAlignments', 'GenomicFeatures',
##   'gert', 'ggdendro', 'ggforce', 'ggfun', 'ggraph', 'ggrepel', 'ggsci',
##   'ggtree', 'gh', 'graphlayouts', 'haven', 'Hmisc', 'htmlTable', 'htmlwidgets',
##   'httpuv', 'httr2', 'igraph', 'imager', 'labelled', 'leaflet', 'lme4',
##   'Luminescence', 'mosaic', 'patchwork', 'pbkrtest', 'phangorn', 'phytools',
##   'pkgbuild', 'pkgdown', 'pkgload', 'plotly', 'profvis', 'ragg', 'readr',
##   'reprex', 'rgl', 'rmarkdown', 'roxygen2', 'seqinr', 'shiny',
##   'shinycssloaders', 'SparseArray', 'testthat', 'textshaping', 'tidygraph',
##   'tidyr', 'tidyselect', 'tidytext', 'tidytree', 'tm', 'topicmodels', 'tweenr',
##   'usethis', 'viridis', 'vroom', 'xopen'

Now lets load some sample data

?airquality
air<- ggplot(airquality)+ aes(Temp, Month, group= Month)+ geom_density_ridges()

air
## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)
## Loading required package: viridisLite
ggplot(airquality)+aes(Temp, Month, group=Month, fill=..x..) +
  geom_density_ridges_gradient()+
  scale_fill_viridis(option="C", name ="Temp")
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data

```{r library(tidyr)

airquality%>% gather(key=“Measurement”, value=“value”, Ozone, Solar.R, Temp) %>% ggplot()+aes(value, Month, group=Month) + geom_density_ridges()+ facet__wrap(~ Measurement, scales=“free”)


A density plot is a nice alternative to a histogram 

``` r
set.seed(1234)
wdata=data.frame(
  sex=factor(rep(c("F","M"), each=200)),
  weight=c(rnorm(200,55), rnorm(200,58))
)
library(dplyr)
mu<- wdata %>%
  group_by(sex)%>%
summarize(grp.mean=mean(weight))

Now lets load the graphing packages

library(ggplot2)
theme_set(
  theme_classic()+
    theme(legend.position="right")
)

Now lets do the basic plot function. First we will create a ggplot object

d<- ggplot(wdata, aes(x=weight))

Now lets do a basic density plot

d+geom_density()+
  geom_vline(aes(xintercept=mean(weight)), linetype= "dashed")

Now lets change the y axis to count instead of density

d+geom_density(aes(y=stat(count)), fill="lightgray")+
  geom_vline(aes(xintercept= mean(weight)), linetype="dashed")

d+geom_density(aes(color=sex))+
  scale_color_manual(values=c("darkgray", "gold"))

lastly lets fill the density plots

d+geom_density(aes(fill=sex), alpha=0.4)+
  geom_vline(aes(xintercept=grp.mean, color=sex), data=mu, linetype= "dashed")+
  scale_color_manual(values=c("gray", "gold"))+
  scale_fill_manual(values=c("gray","gold"))

First lets load our required packages

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

lets start with a scatter plot of the orange dataset

Orange<- as.data.frame(Orange)

plot_ly(data=Orange, x= ~age, y= ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data=Orange, x= ~age, y= ~circumference,
        color= ~Tree, size= ~age,
        text= ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference ))
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean=120, sd=10)
new_data <- data.frame(Orange, trace_1)

We’ll use the random numbers as lines on thee graph

plot_ly(data = new_data, x= ~age, y= ~circumference, color= ~Tree, size= ~age,
        text= ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y= ~trace_1, mode= 'lines') %>%
  add_trace(y= ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line, and add labels.

plot_ly(data=Orange, x= ~age, y= ~circumference,
        color= ~Tree, size = ~circumference,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ", circumference)) %>%
  add_trace(y=~circumference, mode='markers') %>%
  layout(
    title= "Plot or orange data with switchable trace",
    updatemenus = list(
      list(
        type= 'downdrop',
        y=0.8, 
        buttons = list(
          list(method = 'restyle',
               args = list('mode','markers'),
               label = "Marker"
        ),
        list(method="restyle",
             args = list('mode','lines'),
             label = "Lines"
        )
        )
      )
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.

First lets load our required packages

library(plotly)

Now lets create a random 3d matrix

d <- data.frame(
  x <- seq(1, 10, by =0.5),
  y <- seq(1, 10, by = 0.5)
)
z<- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol= length(d$y))

Now lets plot our 3D data

plot_ly(d, x=~x, y=~y, z=~z) %>%
  add_surface()

Now lets add some more aspects to it such as topography

plot_ly(d, x=~x, y=~y, z=~z) %>%
  add_surface(
    contours= list(
      z=list(
        show= TRUE,
        usecolormap=TRUE,
        highlightcolor= "FF0000",
        project= list(z=TRUE)
      )
    )
  )

Now lets look at a 3d scatter plot

plot_ly(longley, x=~GNP, y= ~Population, z=~Employed, marker= list(color= ~GNP)) %>%
  add_markers()

Lets load the required packages

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic()+
    theme(legend.position= 'top')
)

Lets again use our tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <- df %>%
  group_by(dose)%>%
  summarise(
    sd=sd(len, na.rm = TRUE),
    len=mean(len),
    stderr=std.error(len, na.rm=TRUE)
  )
df.summary
## # A tibble: 3 × 4
##   dose     sd   len stderr
##   <fct> <dbl> <dbl>  <dbl>
## 1 0.5    4.50  10.6     NA
## 2 1      4.42  19.7     NA
## 3 2      3.77  26.1     NA

Now lets look at some key functions

Lets start by creating a ggplot object

tg<- ggplot(
  df.summary,
  aes(x=dose, y=len, ymin= len-sd, ymax= len+sd)
)

Now lets look at the most basic error bars

tg+geom_pointrange()

tg+ geom_errorbar(width=0.2)+
  geom_point(size=1.5)

Now lets create a horizontal error bar by manipulating our data

ggplot(df.summary, aes(x=len, y=dose, xmin= len-sd, xmax= len+sd))+
  geom_point()+
  geom_errorbarh(height= 0.2)

This just gives up an idea of error bars on the horizontal axis

Now lets look at adding jitter points (actual measurements to our data

ggplot(df, aes(dose, len))+ 
  geom_jitter( position = position_jitter(0.2), color= "darkgray")+
  geom_pointrange(aes(ymin= len-sd, ymax= len+sd), data= df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len))+
  geom_violin(color= "darkgray", trim = FALSE)+
  geom_pointrange(aes(ymin= len-sd, ymax= len+sd), data= df.summary)

Now how about with a line graph?

ggplot(df.summary, aes(dose, len))+
  geom_line(aes(group=1))+ #always specify this when you have 1 line
  geom_errorbar(aes(ymin= len- stderr, ymax= len-stderr), width= 0.2)+
  geom_point(size= 2)

Now lets make a bar graph with halve error bars

ggplot( df.summary, aes( dose, len))+
  geom_col(fill= "lightgrey", color="black")+
  geom_errorbar(aes(ymin= len, ymax= len+stderr), width= 0.2)

You can see that by not specifyign wmin= len-stderr, we have in essence cut our error bar in half.

How about we add jitter points to line plots? We need to use the original dataframe for the jitter plot, and the summary df for the geom layers

ggplot(df, aes(dose, len))+
  geom_jitter(position = position_jitter(0.2), color="darkgrey")+
  geom_line(aes(group=1), data = df.summary)+
  geom_errorbar(
    aes(ymin= len-stderr, ymax= len+stderr),
    data= df.summary, width= 0.2)+
  geom_point(data= df.summary, size= 0.2)

What about adding jitterpoints to a barplot?

ggplot(df, aes(dose, len))+
  geom_col(data = df.summary, fill= NA, color= "black")+
  geom_jitter(position = position_jitter(0.2), color= "darkgrey")+
  geom_errorbar(aes(ymin= len-stderr, ymax= len+stderr),
                data= df.summary, width=0.2)

What if we wanted to have pur error bars per group (OJ vs VC)

df.summary2 <- df%>%
  group_by(dose, supp) %>%
  summarise(
    sd=sd(len),
    stderr= std.error(len),
    len= mean(len)
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len))+
  geom_pointrange(
    aes(ymin= len-stderr, ymax= len+stderr, color=supp),
    position = position_dodge(0.3))+
  scale_color_manual(values=c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len))+
  geom_line(aes(linetype=supp, group=supp))+
  geom_point()+
  geom_errorbar(aes(ymin= len-stderr, ymax=len+stderr, group=supp), width=0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len))+
  geom_col(aes(fill=supp), position = position_dodge(0.8), width= 0.7)+
  geom_errorbar(
    aes(ymin= len-sd, ymax= len+sd, group=supp),
    width= 0.2, position = position_dodge(0.8))+
  scale_fill_manual(values=c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color=supp))+
  geom_jitter(position = position_dodge(0.2))+
  geom_line(aes(group=supp), data=df.summary2)+
  geom_point()+
  geom_errorbar(aes(ymin= len-stderr, ymax= len+stderr, group=supp), data = df.summary2, width=0.2)

ggplot(df, aes(dose, len, color= supp))+
  geom_col(data = df.summary2, position = position_dodge(0.8), width= 0.7, fill= "white")+
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width=0.8))+
  geom_errorbar(
    aes(ymin=len-stderr, ymax= len+stderr), data = df.summary2,
    width=0.2, position = position_dodge(0.8))+
  scale_color_manual(values= c("indianred", "lightblue"))+
  theme(legend.position = "top")

Now lets do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below that threshold.

set.seed(1234)

wdata= data.frame(
  sex=factor(rep(c("F", "M"), each=200)),
  weight= c(rnorm(200,50), rnorm(200,58)))

Now lets look at out dataframe

head(wdata,5)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912

Now lets load our plotting package

library(ggplot2)

theme_set(
  theme_classic()+
    theme(legend.position="bottom")
  )

Now lets create our ECDF plot

ggplot(wdata, aes(x=weight))+
  stat_ecdf(aes(color=sex, linetype=sex),
            geom="step", size=1.5)+
  scale_color_manual(values=c("#00AFBB", "#E7B900"))+
  labs(y="weight")

Now lets take a look at qq plots. These are used to determine if the given data follows a normal distribution

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some data

wdata=data.frame(
  sex=factor(rep(c("F","M"), each=200)),
  weight= c(rnorm(200,55), rnorm(200,58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)
theme_set(
  theme_classic()+
    theme(legend.position = "top")
)

create a qq plot of the weight

ggplot(wdata, aes(sample=weight))+
  stat_qq(aes(color=sex))+
  scale_color_manual(values=c("#0073C2FF", "#FC4E07"))+
  labs(y="weight")

#install.packages(ggpubr)
library(ggpubr)

ggqqplot(wdata, x="weight",
         color="sex",
         palettes=c("#0073C2FF", "#FC4E07"),
         ggtheme=theme_pubclean())

Now what would a non-normal distribution look like?

#install.packages(mnonr)

library(mnonr)

data2<- mnonr::mnonr(n=100, p=2, ms=3, mk=61, Sigma = matrix(c(1,0.5,0.5,1), 2, 2), initial= NULL)
data2 <- as.data.frame(data2)

Now lets plot the non-normal data

ggplot(data2, aes(sample=V1))+
  stat_qq()

ggqqplot(data2, x="V1",
         palette= "#0073C2FF",
         ggtheme= theme_pubclean())

Lets look at how to put multiple plots into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw()+
    theme(legend.position= "top")
)

First lets create a nice boxplot lets load the data

df<- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x=dose, y=len))+
  geom_boxplot(aes(fill=supp), position = position_dodge(0.9))+
  scale_fill_manual(values=c("#00AFBB", "#E7B800"))

p

Now lets look at the ggplot facet function

p+facet_grid(rows=vars(supp))

Now lets do it with multiple variables

p+facet_grid(rows=vars(dose), cols=vars(supp))

p

Now lets look at the facet_wrap function. This allows the facets to be placed side-by-side

p+facet_wrap(vars(dose), ncol=2)

Now how do we combine multiple plots using ggarrange()

Lets start by making some basic plots. First we will define a color pallete and data

my3cols <- c("#e7B800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x=dose, y=len))
bxp <- p+geom_boxplot(aes(color=dose))+
  scale_color_manual(values=my3cols)

ok now lets do a dotplot

dp <- p+geom_dotplot(aes(color=dose, fill=dose),
                     binaxis = 'y', stackdir = 'center')+
  scale_color_manual(values=my3cols)+
  scale_fill_manual(values=my3cols)

Now lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert))+
  geom_line(color="indianred")
figure <- ggarrange(bxp, dp, lp, label= c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning in as_grob.default(plot): Cannot convert object of class character into
## a grob.
figure

This looks great but we can make it look even better

figure2 <- ggarrange(
  lp, 
  ggarrange( bxp, dp, ncol=2, labels =c("B", "C")),
  nrow = 2,
  labels="A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure2

This looks really good, but you will notice that there are two legends that are the same

ggarrange(
  bxp, dp, labels= c("A","B"),
  common.legend = TRUE, legend = "bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")
## file saved to facetfigure.pdf

We can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

Lastly we can export to pdf with multiple plots and multiple columns

ggexport( bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol = 1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Lets get started with heatmaps

#install.packages(heatmaps)
library(heatmap3)

Now lets get our data

data<- ldeaths

data2<- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heatmap

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

NOw lets play with the colors

rc <- rainbow(nrow(data2), start=0, end=0.3)
cc <- rainbow(ncol(data2), start=0, end=0.3)

Now lets apply our color selection

heatmap(data2, ColSideColors = cc)

library(RColorBrewer)

heatmap(data2, ColSideColors = cc,
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Theres more that we can customize

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
## 
##     plotCI
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(data2, ColSideColors = cc,
          col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Missing values

If you encounter an unusual value in your dataset, and simply want to move move on to the rest of your analysis, you have two options:

Drop the entire row with strange values

library(dplyr)
library(ggplot2)

diamonds<- diamonds
diamons2<- diamonds %>%
  filter(between (y, 3, 20))

In this instance, y is the width of the diamond so anything under 3 mm or above 20 is excluded

I dont reccomend this this option, just because there is one bad measurement does not mean they are all bad

Instead, I reccomend replacing the unusual values with missing values

diamonds3<- diamonds %>%
  mutate(y= ifelse(y<3 | y>20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values shouldnt be silent

ggplot(data = diamonds3, mapping = aes(x=x, y=y))+
  geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

If you want to suppress the warning you can use na.rm=true

ggplot(data = diamonds3, mapping = aes(x=x, y=y))+
  geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with missing values different to the observation with recorded values. For example, in the NYCflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled and non-cancelled times.

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled= is.na(dep_time),
    sched_hour= sched_dep_time %/% 100, 
    sched_min= sched_dep_time %/% 100,
    sched_dep_time = sched_hour+ sched_min /60
  ) %>%
  ggplot(mapping = aes(sched_dep_time))+
  geom_freqpoly(mapping = aes(color=cancelled), bindwith = 1/4)
## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwith = 1/4):
## Ignoring unknown parameters: `bindwith`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to know what our outliers are?

First we need to load the required libraries

library(outliers)
library(ggplot2)
library(readxl)

And reload the dataset because we removed the outliers

``{r} Air_data <- read_xlsx( “AirQualityUCI.xlsx”)


Lets create a function using the grubb test to identify all outliers.
The grubbs test identifies outliers in an univariate dataset that is presumed
to come from a normal distribution


``` r
grubbs.flag <-function(x) {
  #lets create a variable called outliers and save nothing in it, we'll add to the variable 
  #as we identify them
  outliers <- NULL
  #We'll create a variable called test to identify which univariate we are testing
  test<- x
  #Now using the outliers package , use grubbs.test to find outliers in our variable
  grubbs.result <- grubbs.test(test)
  #Lets get the p-values of all tested variables
  pv<- grubbs.result$p.value
  # Now lets search through our p-values for ones that are outside of 0.5
  while(pv <0.5){
    # Anythhing with a p-values greater greater than p=0.5, we add to our empty outliers vector
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, "") [[1]][[3]]))
    # Now we want to remove these outliers from our test variables
    test<- x[!x %in% outliers]
    # and run the grubbs test again without the outliers
    grubbs.result <- grubbs.test(test)
    # and save the new p values
    pv <- grubbs.result$p.value
  }
  return(data.frame(x=x, outliers=(x%in%outliers)))
}

``{r} identified_outliers <- grubbs.flag(Air_data$AH)

NOw we can create a histogram showign where the outliers were

```{r
ggplot(grubbs.flag(Air_data$AH), aes(x=Air_data$AH, color=outliers, fill=outliers))+
  geom_histogram(bindwidth= diff(range(Air_data$AH))/30)+
  theme_bw()
library(ggplot2)

ggplot(data= diamonds, mapping = aes(x=price))+
  geom_freqpoly(mapping = aes(color=cut), bindwidth=500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Its hard to see the difference in distribution because the counts differ so much

ggplot(diamonds)+
  geom_bar(mapping = aes(x=cut))

To make the comparison easier , we need to swap the display on the y axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one.

ggplot(data = diamonds, mapping = aes(x=price, y=..density..))+
  geom_freqpoly(mapping = aes(color=cut), bindwidth=500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe thats because frequency polygons are a little hard to interrupt.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.

ggplot(data = diamonds, mapping = aes(x=cut, y=price))+
  geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, so we can more easily compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average.

Lets look at some car data

ggplot(data = mpg, mapping = aes(x=class, y=hwy))+
  geom_boxplot()

ggplot(data = mpg)+
  geom_boxplot(mapping = aes(x=reorder(class,hwy, FUN = median), y=hwy))

If you have long variable names , you can switch the axis and flip the it 90 degrees

ggplot(data=mpg)+
  geom_boxplot(mapping = aes(x=reorder(class,hwy, FUN = median), y=hwy))+
  coord_flip()

To visualize the correalation between the continuous variables, we can use a scatter plot

ggplot(data = diamonds)+
  geom_point(mapping = aes(x=carat, y=price))

Scatterplots become less useful as the size of your dataset grows, because we get overplot. We can fix this using the alpha aesthetic

ggplot(data = diamonds)+
  geom_point(mapping = aes(x=carat, y=price), alpha=1/100)

First lets load the required libraries

library(RCurl)
library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges//colleges.csv"

College_data <- read.csv(site)
str(College_data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our data alphabetically by college

alphabetical <- College_data %>%
  arrange(College_data$college)

The glimpse package is another way to preview data

glimpse(College_data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_data, college, cases)

We can also filter our subset with the filter function

Louisiana_Cases <- filter(College_data, state=="Louisiana")

Lets filter our smaller amount of states

South_Cases <- filter(College_data, state=="Louisiana"| state=="Texas"| state=="Arkansas"| state=="Mississippi")

Lets look at some time series data

First lets load the required libraries

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
state_data <- read.csv(state_site)

Lets create group_by object using the state column

state_cases <- group_by(state_data, state)
class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started responding

days_since_first_reported <- tally(state_cases)

Lets visualize some data

First lets start off with some definitions

data- obvioulsy the stuff we want to visualize

layer- made up of geometric elements and requisite statistical information. Include geometric objects which represents the plot

scales- used to map values in the data space that is used for creation of values (color, size, shape, etc)

coordinate system- describes how the data coordinates are mapped together in relation to the plan on the graphic

faceting- how to break up data into subsets to display multiple types or groups of data

theme- controls the finer points of the display, such as font size and background color

options(repr.plot.width=6, replr.plot.height=6)

class(College_data)
## [1] "data.frame"
head(College_data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(College_data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now lets take a look at a different data set

iris<- as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Lets start by creating a scatter plot of the college data

ggplot(data= College_data, aes(x=cases, y=cases_2021))+
  geom_point()+
  theme_minimal()
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_point()`).

NOw lets do the iris data

ggplot(data = iris, aes(x= Sepal.Width, y= Sepal.Length))+
  geom_point()+
  theme_minimal()

lets color coordinate our college data

ggplot(data = College_data, aes(x= cases, y=cases_2021, color=state))+
  geom_point()+
  theme_minimal()
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_point()`).

Lets color coordinate the iris data

ggplot(data = iris, aes(x=Sepal.Width, y=Sepal.Length, color= Species))+
  geom_point()+
  theme_minimal()

Lets run a simple histogram of our Louisiana Case Data

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency",
     main = "Total College Covid-19 Infections (Louisiana)")

Lets run a simple histogram for Iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width",
     ylab = "Frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college +geom_histogram(bindwidth=100, color="black", aes(fill=county))+
  xlab("cases")+ylab("Frequency")+ ggtitle("Histogram of Covid 19 Cases in Louisiana")
## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lets create a ggplot for iris data

histogram_iris <- ggplot(data = iris, aes(x=Sepal.Width))

histogram_iris +geom_histogram(bindwwidth=0.2, color="black", aes(fill= Species))+
  xlab("Sepal Width")+ ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")
## Warning in geom_histogram(bindwwidth = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindwwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Maybe a density plot makes more sense for the college data

ggplot(South_Cases)+
  geom_density(aes(x=cases, fill=state), alpha=0.50)

Lets do it with the Iris data

ggplot(iris)+
  geom_density(aes(x=Sepal.Width, fill= Species), alpha=0.25)

Lets look at a violin plot for iris

ggplot(data = iris, aes(x=Species, y=Sepal.Length, color= Species))+
  geom_violin()+
  theme_classic()+
  theme(legend.position = "none")

Now lets try the south data

ggplot(data = South_Cases, aes(x=state, y=cases, color=state))+
  geom_violin()+
  theme_gray()+
  theme(legend.position = "none")

Now lets take a look at residual plots. This is a graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner around the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed a nonlinear model is more appropriate.

Lets start with our iris data

ggplot(lm(Sepal.Length~Sepal.Width, data=iris))+
  geom_point(aes(x=.fitted, y=.resid))

Now lets look at southern case data

ggplot(lm(cases~ cases_2021, data=South_Cases))+
  geom_point(aes(x=.fitted, y=.resid))

A linear model is not a good call for the state cases

Now lets do some correlations

{r obesity <- read.csv("obesity_insurance_csv")

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Lets look at the structure of the dataset ``{r} str(obesity)


Lets look at the column classes

``{r}
class(obesity)

And get a summary of distribution of the variables ``{r} summary(obesity)


Now lets look at the distrubtion for insurance charges

``{r}
hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

``{r} boxplot(obesity$charges)


``{r}
boxplot(obesity$bmi)

Now lets look at the correaltions. The cor() command is used to determine correlations between two vectors, all the columns of a dataframe, or two data frames. The cov() command, on the otherhand, examines the the covariance. The cor.test() command carries out a test as to the significance of the correlation. {r cor(obesity$charges, obesity$bmi)

This test uses a Spearman Rho correlation, or you can use Kendall’s tau by specifying it

{r cor(obesity$charges, obesity$bmi, method = "kendall")

This correlation measures strength of correlation between -1 and 1

Now lets look at the Tietjen=Moore test. This is used for univariate datasets. The algorithm depicts the detection of the outliers in a univariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n=length(dataSeries)
  #compute the absolute residuals
  r= abs(dataSeries - mean(dataSeries))
  # Sort data according the the size of residuals
  df=data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  # CReate a subset of the data without the largest values 
  klarge=c((n-k+1):n)
  subdataSeries = df$dataSeries[-klarge]
  #Compute the sums of squares.
  ksub=(subdataSeries -mean(subdataSeries))**2
  all= (df$dataSeries -mean(df$dataSeries))**2
  #Compare the test statistic
  sum(ksub)/sum(all)
  
}

This function helps compute the absolute residuals and sorts data according to the size of the residuals. Later, we will focus on the computation of the sum of sqaures.

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha=0.5){
  ek <- TietjenMoore(dataSeries,k)
  # Compare Criticaal values based on simulation
  test=c(1:10000)
  for(i in 1:10000){
    dataSeriesdataSeries= rnorm(length(dataSeries))
    test[i]= TietJenMoore(dataSeriesdataSeries,k)}
  Talpha=quantile(test,alpha)
  list(T=ek, Talpha=Talpha)
  
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obseity dataset dor evalutaing the algorithm.

The critical region for the TieJen=Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and computing the TietJen Moore Test statistic. Typically 10,000 random sample are used. The values of the TietJen Moore statistic obtained from the data is compared to this reference distribution. The values of the test statistic is between zero and one. If there are no outliers in the data, the test statistic is close to 1. If there are outliers the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regrdless of which test statistic issued, Lk or Ek.

First we will look at charges

``{r} boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 50)


Lets check out bmi

```{r
boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 2)

Probabilty Plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probaility plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine the probabilty of events. Or for examples, you may wonder “what is the likelihood that a person has a bmi of exactly _? In this case, you would need to retrieve the density of bmi distribution at values 140. The bmi distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:

{r bmi.mean <- mean(obesity$bmi) bmi.sd <- sd(obesity$bmi)

Lets create a plot of our normal distribution

bmi.dist<- dnorm(obesity$bmi, mean= bmi.mean, sd= bmi.sd)
bmi.df <- data.frame("bmi"=obesity$bmi, "density"= bmi.dist)


ggplot(bmi.df, aes(x=bmi, y=Density))+
  geom_point()

This gives us the probabilty of every single point occuring

Now lets use the pnorm function for more info

``{r} bmi.dist <- pnorm(obesity\(bmi, mean = bmi.mea, sd=bmi.sd) bmi.df <- data.frame( "bmi" = obesity\)bmi, “Density”= bmi.dist)

ggplot(bmi.df, aes(x=bmi, y=Density))+ geom_point()



What if we want to find the probability of the bmi being greater than 40 in our distribution?


``` r
pp_greater <- function(x) {
  paste(round(100 * pnorm(x, mean = 30.66339, sd= 6.09818, lower.tail = FALSE), 2), "%")
}
pp_greater(40)
## [1] "6.29 %"

What about the probability that a bmi is less than 40 in our population?

pp_less<- function(x) {
  paste(round(100 *(1-pnorm(x, mean= 30.66339, sd= 6.09818, lower.tail = FALSE)), 2), "%")
}
pp_less(40)
## [1] "93.71 %"

What if we want to find the area in between?

{r pnormGC(c(20,40), region="between", mean=30.66339, sd=6.09818, graph= TRUE)

What if we want to know the quantiles? Lets use the qnorm funtion. We need to assume a normal distribution for this.

what bmi representss the lowest 1% of the population?

qnorm(0.01, mean=30.66339, sd=6.09818,lower.tail = TRUE)
## [1] 16.4769

What if you want a random sampling fo values within your distribution?

subset<- rnorm(50, mean = 30.66339, sd= 6.09818)

hist(subset)

subset2 <- rnorm(50000, mean= 30.66339, sd= 6.09818)

hist(subset)

Shapiro-Wilk Test

So now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution? ``{r} shipiro.test(obesity$charges[1:5])


You can see here, with a small sample size, we would reject the null hypothesis that
the samples came from a normal distribution. We can increase the power of test by 
increasing the sample size

``{r}
shapiro.test(obesity$charges[1:1000])

Now lets check out age

{r shapiro.test(obesity$age[1:1000])

And lastly bmi

```{r shapiro.test(obesity$bmi[1:1000])


Time Series data
 
First lets load the packages

```{r
library(readr)
library(readxl)

Air_data <- read.xls( "AirQualityUCI.xlsx")

Date- date of measurement time- time of management CO(GT) - average hourly CO2 PT08, s1(CO) - tin oxide hourly average sensor response NMHC - average hourly non-metallic hydrocarbon concentration C6HC - average benzene concentration PT08.S3(NMHC) - titania average hourly sensor response NOx - average hourly NOx concentration NO2 - average hourly NO2 concentration T - temper RH - relative humidity AH - absolute humidity

``{r} str(Air_data)



``` r
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

```{r Air_data\(Time <- as_hms(Air_data\)Time)

glimpse(Air_data)



```{}
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

{r t.test(Air_data$RH, Air_data$AH)

First we’ll look at an unnest_token function

Lets start by looking at an Emily Dickenson passage

text<- c("Because I could not stop from Death =",
         "He kindly stopped for me -",
         "The carriage held but just ourselves -",
         "and Immortality")

text
## [1] "Because I could not stop from Death =" 
## [2] "He kindly stopped for me -"            
## [3] "The carriage held but just ourselves -"
## [4] "and Immortality"

This is a typical character vector that we want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.

library(dplyr)

text_df <- tibble(line=1:4, text =text)

text_df
## # A tibble: 4 × 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop from Death = 
## 2     2 He kindly stopped for me -            
## 3     3 The carriage held but just ourselves -
## 4     4 and Immortality

Reminder: A tibble is a moder class of sof data frame within R. Its available in the dplyr and tibble packages, that has a conveniant print method, will not convert strongs to factors, and does not use row names. Tibbles are great for use with tidy tools.

Next we will use the ‘unest_tokens’ function.

First we have the output column name that will be created as the text is unnested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeautenr package to analyze some Jane Austen texts. There are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)

To work with this as a tidy dataset, we need to restructure it in the one-token-per-row frame, which as we saw earlier is done with the unnest_tokens() function

``{r} library(dplyr) library(janeaustenr)

d <- tibble(txt = prideprejudice) d

d %>% unnest_tokens(output = word, input = txt)

d %>% unnest_tokens(output = sentence, input = txt, token = “sentences”)

d %>% unnest_tokens(output = ngram, input = txt, token = “ngrams”, n = 2)

d %>% unnest_tokens(chapter, txt, token = “regex”, pattern = “Chapter [\\d]”)

d %>% unnest_tokens(shingle, txt, token = “character_shingles”, n = 4)

custom function

d %>% unnest_tokens(word, txt, token = stringr::str_split, pattern = ” “)

tokenize HTML

h <- tibble(row = 1:2, text = c(“Text is”, “here”))

h %>% unnest_tokens(word, text, format = “html”)


``{r}
library(tidytext)
tidy_books <- original_books%>%
unnest_tokens(word, text)

tidy_books

This function uses the tokenizers package to seperate each line of text in the original dataframe into token.

The default tokening is for words but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.

Often in text analysis, we will want to remove stop words. Stop words are words that are not USEFUL for an analysis.

We can remove stop words (kept in the tidyset dataset ‘stop_words’) with an anti_join().

data(stop_words)

The stop words dataset in the tidyext package contains stop words from three lexicons. We can use thrm all togeter, as we have three, or filter() to only use one set of stop words if thats more appropriate for your analysis.

``{r}

tidy_books %>% count(word, sort = TRUE)


Because we've been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly
into ggplot2. For example, we can create a visualization of the most common words.

```{r
library(ggplot2)

tidy_books %>%
  count(word, sort=TRUE) %>%
  filter( n>600) %>%
  mutate(word=reorder(word, n)) %>%
  ggplot( aes(n, word))+
  geom_col()+
  labs(y=NULL, x= "Word count") 

The gutenbergr package

This package provides access to the public domain workd from the gutenberg project (www.gutenberg.org). This package includes tools for both downloading books and a complete dataset of project gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenberg_download().

Word frquencies

Lets look at some biology texts, starting with Darwin

The Voyage of the Beagle - 944 On the origin of species by the means of natural selection - 1228 The expression of emotions in man and animals - 1227 The descent of man, and selection in relation to sex - 2300

We can access these works using the gutenberg_download() and the Project Gutenberg IDnumbers

# install libraries
install.packages("tidyverse")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("gutenbergr")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("DT")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("flextable")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
## also installing the dependency 'gdtools'
## Warning in install.packages("flextable"): installation of package 'gdtools' had
## non-zero exit status
## Warning in install.packages("flextable"): installation of package 'flextable'
## had non-zero exit status
install.packages("dplyr")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
# install klippy for copy-to-clipboard button in code chunks
remotes::install_github("rlesur/klippy")
## Skipping install of 'klippy' from a github remote, the SHA1 (378c247f) has not changed since last install.
##   Use `force = TRUE` to force installation
library(gutenbergr)

darwin <- gutenberg_works(author == "Darwin, Charles") 

This link does not work anymore. It says the book has been archived. I searched up the website and still could not find it.

Lets break these into tokens

``{r} tidy_darwin <- darwin + unnest_tokens(word, text) + anti_join(stop_words)


Lets check out what the most popular Darwin words are 

``{r}
tidy_darwin %>%
  count(word, sort = TRUE)

Now lets get some work from Thomas Hunt Morgan, who is credited with discovering chromosomes.

Regeneration - 57198 The genetic and operative evidence relating to secondary sexual characteristics - 57460 Evolution and Adaptation - 63540

```{r

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = “https://mirror2.sandyriver.net/pub/gutenberg” )

Lets tokenize THM

```{r
tidy_morgan <-morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(word, sort = TRUE)

What are THM’s most common words?


tidy_morgan %>%
  count(word, sort = TRUE)

Lastly lets look at Thomas Henry Haxley

Evidence as to mans place in nature - 2911 On the reception of the Origin of Species - 2069 Evolution and Ethics, and other essays - 2940 Science and Culture, and other essays - 52344

huxley - gutenberg_download(c(2911, 2069, 2940, 52344), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

{r} tidy_huxley <- huxley %>% unnest_tokens(word, text) %>% anti_join(stop_words) ```{r} tidy_huxley %>% count(word, sort= TRUE)


Now lets calculate the frequency for each word for the works of Darwin, Morgan, and Huxley by binding the frames together.

```{}
library(tidyr)

frquency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                      mutate(tidy_darwin, author = "Charles Darwin"),
                      mutate(tidy_huxley, author ="Thomas Henry Huxley")) %>%
  mutate(word = str_extract(word, "[A-Z]+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion =n/ sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from =wuthor, values_from = proportion) %>%
  pivot_longer("Thomas Hunt Morgans": "Charles Darwin", names_to ="author", values_to = "proportion")

frequency

Now we need to change the table so that each author has its own row

frequency2 <- pivot_wider(frequency, names_from =author, values_from=proportion)

frequency1

Now lets plot

Yes{r} library(scales)

ggplot(frquency2, aes(x= ‘Charles Darwin’, y= ‘Thomas Hunt Morgan’), color=abs(- ‘Charles Darwin’ -‘Thomas Hunt Morgan’)) + geom_abline(color= “gray40”, lty =2 )+ geom_jitter(alpha= 0.1, size= 2.5, width = 0.3, height = 0.3)+ geom_text(aes(label=word), check_overlap = TRUE, vjust=1.5)+ scale_x_log10(labels= percent_format())+ scale_y_log10(labels = percent_format())+ scale_color_gradient(limits= c(0, 0.001), low= “darkslategray4”, high =“gray75”)+ theme(legend.position = “none”)+ labs(y= “Thomas Hunt Morgan”, x= “Charles Darwin”)


``{r}
ggplot(frquency2, aes(x= 'Thomas Hunt Morgan', y= 'Thomas Henry Huxley'), color=abs(- 'Thomas Hunt Morgan' -'Thomas Henry Huxley')) +
  geom_abline(color= "gray40", lty =2 )+
  geom_jitter(alpha= 0.1, size= 2.5, width = 0.3, height = 0.3)+
  geom_text(aes(label=word), check_overlap = TRUE, vjust=1.5)+
  scale_x_log10(labels= percent_format())+
  scale_y_log10(labels = percent_format())+
  scale_color_gradient(limits= c(0, 0.001),
                       low= "darkslategray4", high ="gray75")+
  theme(legend.position = "none")+
  labs(y= "Thomas Henry Huxley", x= "Thomas Hunt Morgan")

The Sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.

AFFIN bing nrc

Bing categorizes words in a binary fashion into positive or negative nrc categorizes intp positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust AFFIN assigns a score between -5 and 5, with negative indicating negative sentimetn, and 5 positive

The function get_sentiments() allows us to get specific sentiments lexicon with the measures for each one.

library(tidytext)
library(textdata)

afinn <- get_sentiments("afinn")

afinn
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

Lets look at bing

bing <- get_sentiments("bing")

bing
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

And lastly nrc

nrc<- get_sentiments("nrc")

nrc
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created either using crowdsourcing or cloud computing/ai like Amazon Mechanical Turk, or by labout of one of the authors, and then validated with crowdsourcing.

Lets look at the words with a joy score from nrc

```{r library(gutenbergr) library(dplyr) library(stringr)

darwin <- gutenberg_download(c( 944, 1227, 1228, 2300), mirror = “https://mirror2.sandyriver.net/pub/gutenberg”)

tidy_books <- darwin %>% group_by(gutenberg_id) %>% mutate(linenumber= row_number(), chapter = cumsum(str_detect(text, regex(“^chapter [//divxlc]”, ignore_case = TRUE)))) %>% ungroup() %>% unnest_tokens(word, text)

tidy_books


Lets add the book name instead of GID

``{r}
colnames(tody_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species by Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

tidy_books

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

We can also examine how sentiment changes throughout a work.

``{r} library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>% inner_join(get_sentiments(“bing”)) %>% count(book, index= linenumber %/% 80, sentiment) %>% pivot_wider(names_from = sentiment, values_from=n, values_fill = 0) %>% mutate(sentiment = positive - negative)


Now lets plot it

```r}
library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book))+
  geom_col(show.legend = FALSE)+
  face_wrap(~book, ncol =2, scales= "free_x")

Lets compare the three sentiment dictions

There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how sentiment changes across the arc of TVOTB.

library(tidyr)

Lets again use integer division (‘%>%’) to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’ to find the net sentiment in each of these sections of text.


affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %>% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  voyage %>%
    inner_join(get_sentiments("nrc")%>%
                 filter(sentiment %in% c("positive", "negative")))%>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %>% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              
              
              values_fill = 0) %>%
  mutate(sentimen = positive - negative)

We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexion (dictionary), Lets bind them all together and visualize with ggplot

``{r} bind_rows(affin, bing_and_nrc) %>% ggplot(aes(index, sentiment, fill = method))+ geom_col(show.legend = FALSE)+ facet_wrap(~method, ncol=1, scales = “free_y”)



Lets look at th counts based on each dictonary
``{r}
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))%>%
  count(sentiment)

{r} get_sentiments("bing") %>% count(sentiment) ```{r} bing_word_counts <- tidy_books %>% inner_join(get_sentiments(“bing”))%>% count(word, sentiment, sort = TRUE) %>% ungroup()

bing_word_counts


This can be shown visually, and we can pipe right into ggplot2

``{r}
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n=10)%>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y")+
  labs(x = "Contribution to Sentiment", y=NULL)

Lets spot an anamoly in the dataset

```r} custom_stop_words <- bind_rows(tibble(word = c(“wild”, “dark”, “great”, “like”), lexicon =c(“custom”)), stop_words)

custom_stop_words


Word Clouds!

We can see that tidy text mining and sentiment analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques

Lets use the wordcloud package!!

```{r
library(wordcloud)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words=100))

Lets also look at comparison.cloud(), which may require turning the dataframe into a matrix.

We can change to matrix using the acast() function.

```{r library(reshape2)

tidy_books %>% inner_join(get_sentiments(“bing”)) %>% count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = “n”, fill=0) %>% comparison.cloud(colors = c(“gray20”, “gray80”), max.words=100)


Looking at units beyond words

Lots of useful work can be done by tokenizing at the word leveel, but sometimes its nice to look at different units of text
For example, we can look beyond just unigrams.

Ex: I am not having a good day.

```{r
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment== "negative")

wordcounts <- tidy_books%>%
  group_by(book, chapter) %>%
  summarize(words=n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("books", "chapter")) %>%
  mutate(ratio =  negativewords/words) %>%
  filter(chapter !=0) %>%
  slice_max(raatio, n=1) %>%
  ungroup()

So far we’ve only looked at single words, but many interesting (more accurate) analysis are based on the relationship betweem words.

Lets look at some methods of tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and ANimals"
darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Speciess by Menas of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token ="ngrams", n=2)

darwin_bigrams

This data is still in tidyset format, and is structured as one-token-per-row. Each token is a bigram.

Counting and filtering n-gram

{r darwin_bigrams %>% count(bigram, sort = TRUE)

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s seperate commmand which splits a column into multiple based on a delemiter. This will let us make a column for word one and word two.

```{r library(tidyr)

bigrams_seperated <- darwin_bigrams %>% seperate(bigram, c(“word1”, “word2”), sep= ” “)

bigrams_filtered <- bigrams_seperated %>% filter(!word1 %in% stop_words\(word)%>% filter(!word2 %in% stop_words\)word)

bigrams_filtered



New bigram counts

```{r
bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep= " ")

bigram_counts

We may also be interested in trigrams, which are three word combos

``{r}

trigrams <- darwin_books %>% unnest_tokens(trigram, text, token= “ngrams”, n=3) %>% seperate(trigram, c(“word1”, “word2”, “word3”), sep=” “) %>% filter(!word1%in% stop_words\(word, !word2 %in% stop_words\)word, !word3 %in% stop_words$word) %>% count(word1, word2, word3, sort= TRUE)

trigrams


Lets analyze some bigrams
```{}
bigrams_filtered %>
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)

Lets again look at tf-idf across bigrams across Darwins works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n)%>%
  arrange(desc(tf_idf))

bigram_tf_idf

r} bigram_tf_idf %>% arrange(desc(tf_idf))%>% group_by(book) %>% slice_max(tf_idf, n=10) %>% ungroup() %>% mutate(bigram = reorder(bigram, tf_idf)) %>% ggplot(aes(tf_idf, bigram, fill=book))+ geom_col(shpw.legend = FALSE)+ facet_wrap(~book, ncol = 2, scales = "free")+ labs(x="tf-idf of bigrams", y= NULL)

Using bigrams to provide context in sentiment analysis

r} bigrams_seperated %>% filter(word1=="not")%>% count(word1, word2, sort= TRUE)

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.

```{r

AFINN <- get_sentiments(“afinn”)

AFINN


We can examine the most frequent words that were preceded by "not", and associate with sentiment.

```{r

not_words<- bigrams_seperated %>%
  filter(word1== "not") %>>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort= TRUE)

not_words

Lets visualize

``{r} library(ggplot2)

not_words %>% mutate(contribution = n * value) %>% arrange(desc(abs(contribution))) %>% head(20) %>% mutate(word2 = reorder(word2, contribution)) %>% ggplot(aes(n * value, word2, fill = n *value >0)) + geom_col(show.legend = FALSE)+ labs( x= “Sentiment Value * number of occurences”, y= “words preceded by "not"”)


``{r}
negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_seperated %>% 
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)


negated_words

Lets visualize the negation words

{r negated_words %>% mutate(contribution = n*value, word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>% group_by(word1) %>% slice_max(abs(contribution), n=12, with_ties = FALSE) %>% ggplot(aes(word2, contribution, fill = n*value >0))+ geom_col(show.legend = FALSE)+ facet_wrap(~word1, scales = "free")+ scale_x_discrete(labels= function(x) gsub("_.+$", "", X))+ xlab("Words precended by negation term")+ ylab("Sentiment Value * # of occurences")+ coord_flip()

Visualize a network of bigrams with ggraph

```{r library(igraph)

bigram_counts <- bigrams_filtered %>% count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>% filter(n>20) %>% graph_from_data_frame()

bigram_graph


```{r
library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout ="fr")+
  geom_edge_link()+
  geom_node_point()+
  geom_node_text(aes(label= name), vjust=1, hjust=1)

We can also add directionality to this network

``{r bigramgraph, dependson=“bigram_graph”, fig.width=12, fig.height=10} set.seed(1234)

a<- grid::arrow(type = “closed”, length = unit(0.15, “inches”))

ggraph(bigram_graph, layout = “fr”)+ geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, “inches”))+ geom_node_point(color = “lightblue”, size=5)+ geom_node_text(aes(label=name), vjust=1, hjust=1)+ theme_void()


A central question in text mining is how to quantify what a document is about. We can do this by looking at words that make up the document, and measuring term frequency.

There are a lot of words that may not be important, these are the stop words.

One way to remedy this is to look at inverse document frequency words, which descreases the weight for commonly used words and increases the weight for words that are not used very much.

Term frequency in Darwins Works

``{r}
library(dplyr)
library(tidytext)

book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

colnames(book_words)[1] <- "book"

book_words$book[book_words$book ==944] <- "The Voyage of the Beagle"
book_words$book[book_words$book ==1277] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book ==1228] <- "On the Origin of Species by means of Natural Selection"
book_words$book[book_words$book ==2300] <- "The Descent of Man, and Selection in Relation to Sex"

Now lets disect

``{r} book_words <- book_words %% unnest_tokens(word,text) %>% count(book, word, sort = TRUE)

book_words


```r}
book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

book_words

``{r} book_words <- left_join(book_words, total_words)

book_words


You can see that the usual suspects are the most common words, but don't tell us anything about what the books topic is.

``{r}
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book))+
  geom_histogram(show.legend = FALSE)+
  xlim(NA, 0.0009)+
  facet_wrap(~book, ncol=2, scales ="free_y")

``{r} freq_by_rank <- book_words %>% group_by(book) %>% mutate(rank = row_number(), ‘term frequency’ = n/total) %>% ungroup()

freq_by_rank


``{r}
freq_by_rank %>%
  ggplot(aes(rank, 'term frequency', color=book))+
  geom_line(size=1.1, alpha=0.8, show.legend = FALSE)+
  scale_x_log10()+
  scale_y_log10()

Lets use TF- IDF to find words for each focument by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents.

``{r} book_tf_idf <- book_words %>% bind_tf_idf(word, book, n)

book_tf_idf



Lets look at terms with high tf-idf in Darwins works

``{r}
book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))

Lets look at a visualization for thesee high tf-idf words

``{r} book_tf_idf%>% group_by(book)%>% slice_max(tf_idf, n=15)%>% ungroup()%>% ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill=book))+ geom_col(show.legend = FALSE)+ facet_wrap(~book, ncol = 2, scales = “free”)+ labs(x=“tf-idf”, y= NULL) ```