Final_Project

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

First lets load the required packages

library(ggplot2)

Lets set out therm

theme_set(
  theme_dark()+
    theme(legend.position = "top")
)

First lets initiate a ggplot object called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x=dose, y=len))

lets create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill="white") +
  stat_summary(fun = mean, fun.args = list(mult=1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

tg + geom_boxplot(width =0.5) +
  geom_dotplot(binaxis = "y", stackdir = "center",fill="white")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim=FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill ="#999999") +
  stat_summary(fun=mean,fun.args = list(mult=1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(aes(fill = supp), binaxis = "y",stackdir = "center") +
  scale_fill_manual(values = c("indianred","lightblue1"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color=supp),width = 0.5, position = position_dodge(0.8))+
  geom_dotplot(aes(fill=supp, color=supp), binaxis = "y", stackdir = "center",
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB","#E7B800"))+
  scale_color_manual(values = c("#00AFBB","#E7B800"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Now let do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below that threshold.

set.seed(1234)

wdata =data.frame(
sex =factor(rep(c("F", "M"), each =200)),
weight= c(rnorm(200, 50), rnorm(200, 58)))

Now lets look at our dataframe

head(wdata, 5)

##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912

Now lets load our plotting package

library(ggplot2)

theme_set(
theme_classic() +
theme (legend.position="top")
)

ggplot(wdata, aes(x=weight)) +
stat_ecdf(aes (color = sex, linetype =sex),
          geom ="step", size =1.5) +
scale_color_manual(values=c("#00AFBB", "#E7B900"))+
labs(y = "weight")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

First lets load our required libraries

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position ='top')
)

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <- df %>% 
  group_by(dose) %>% 
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = std.error(len, na.rm =TRUE),
    
    len = mean(len),
  )

df.summary

## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Lets now look at some key functions geom_corssbar() for hollow bars with middle indicated by a horizontal lin geom_erorrbar() for error bars geom_errorbarh() for horizontal error bars geom_linerange() form drawing an intervl represted by a vertical line *geom_pointrange() for creating an interval represented by a vertical line; with a point in the middle

lets start by creating a ggplot object

tg <- ggplot(
  df.summary,
  aes(x=dose, y = len, ymin = len - sd, ymax =len + sd)
)

Now lets look at the most basic error bars

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2)+
  geom_point(size =1.5)

Now lets look at adding jitter point (actual measurements) to our data.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgray") +
  geom_pointrange(aes(ymin = len - sd, ymax = len + sd), data = df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgray", trim = FALSE) + 
  geom_pointrange(aes(ymin = len - sd, ymax = len + sd), data = df.summary)

Now how about with a line graph?

ggplot(df.summary, aes(dose, len)) +
  geom_line(aes(group =1)) + #always specify this when you have 1 line
  geom_errorbar(aes(ymin = len - stderr, ymax = len +stderr), width = 0.2) +
  geom_point(size =2)

Now lets make a bar graph with halve error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgrey", color = "black") +
  geom_errorbar(aes(ymin = len, ymax = len+stderr), width = 0.2)

You can see that by not specifiying wmin = len-stderr, we have in essence cut our error bar in half.

How about we add jitter point to line plots? We need to use the orgininal dataframe for the jitter plot, and the summary df for the geom layers.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_line(aes(group = 1), data = df.summary) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len +stderr),
    data = df.summary, width = 0.2) +
  geom_point(data = df.summary, size = 0.2)

Wath about addin jitterpoints to a barplot?

ggplot(df, aes(dose, len))+
  geom_col(data = df.summary, fill = NA, color = "black")+
  geom_jitter(position = position_jitter(0.3), color = "darkgrey") +
  geom_errorbar(aes(ymin = len - stderr, ymax = len+ stderr),
                data = df.summary, width = 0.2)

What if we wanted to have our error bars per group> (OJ vs VC)

df.summary2 <- df %>% 
  group_by(dose, supp) %>% 
  summarise(
    sd = sd(len),
    stderr = std.error(len),
    len = mean(len)
)

## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.

df.summary2

## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len)) +
  geom_pointrange(
    aes(ymin = len - stderr, ymax = len + stderr, color = supp),
    position = position_dodge(0.3)) + 
  scale_color_manual(values = c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2,aes(dose, len)) +
  geom_line(aes(linetype = supp, group = supp)) +
  geom_point() +
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr, group = supp), width =0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len)) +
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr, group = supp), 
    width = 0.2, position =position_dodge(0.8)) +
      scale_fill_manual(values =c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color = supp))+
  geom_jitter(position = position_dodge(0.2))+
  geom_line(aes(group = supp), data = df.summary2) +
  geom_point() +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), data = df.summary2, width =0.2)

ggplot(df, aes(dose, len, color = supp))+
  geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white")+
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
  geom_errorbar(
    aes(ymin = len-stderr, ymax = len+stderr), data = df.summary2,
    width = 0.2, position = position_dodge(0.8))+
  scale_color_manual(values = c("indianred","lightblue")) +
  theme(legend.position = "top")

Lets look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)
theme_set(
theme_bw() +
theme(legend.position="top")
)

First lets create a nice boxplot

lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes (x=dose, y =len)) +
geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
scale_fill_manual(values=c("#00AFBB", "#E78800")) 

p

Now lets like at the gvplot facit function

p + facet_grid(rows = vars(supp))

Now lets do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols =vars(supp))

Now lets look at the facet_wrap function. This allows facets to be placed side-by-side

p + facet_wrap(vars(dose),ncol = 2)

Now how do combine multiple plots using ggarrange()

Lets start by making some basic plots. First we will define a color palette and data

my3cols <- c("#e7B800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x= dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
scale_color_manual(values = my3cols)

ok now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
binaxis ='y', stackdir = 'center') +
scale_color_manual(values = my3cols) +
scale_fill_manual (values =my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert))+
geom_line(color ="indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol= 2, nrow = 2)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

figure

This looks great, but we can make it look even better

figure2 <- ggarrange(
lp,
ggarrange(bxp, dp, ncol = 2, labels = c("B", "C")),
nrow = 2,
labels = "A")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

figure2

ok this looks really good, but you’ll notice that there are two legeneds that are the same.

ggarrange(
bxp, dp, labels =c("A", "B"),
common.legend = TRUE, legend = "bottom")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly, we should export the plot

ggexport(figure2, filename ="facetfigure.pdf")

## file saved to facetfigure.pdf

we can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename ="multi.pdf")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

lastly, we can export to pdf with multiple pages and multiple columns

ggexport(bxp, dp, lp, bxp, filename="test2.pdf", nrow = 2, ncol= 1)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Now lets change it up and look at some line plots

We’ll start by making a custom dataframe kinda like the tooth dataset. This way we can see the lines and stuff that we’re modifying

df <- data.frame(dose =c("D0.5", "D1","D2"),
                 len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose =rep(c("D0.5", "D1","D2"),2),
                  len = c(6.8, 15, 33, 4.2,10,29.5))

df2

##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now lets again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
)

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim=c(0,6), xlim=c(0,0.7), axes= FALSE,xlab="", ylab="")
  for(i in 0:6) lines(c(0.3, 0.7), c(i,i), lty=i, lwd =3)
  text(rep(0,1,6), 0:6, labels=c("0.'Blank'","1. 'solid'","2. 'dashed'","3. 'dotted'",
                                 "4. 'dotdash'","5. 'longdash'","6. 'twodash'"))

  par(mar=oldPar$mar, font=oldPar$font)
}

generateRLineTypes()

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                len = c(4.2, 10, 29.5))
df

##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

Second dataframe

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5","D1","D2"),2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))
                  
df2

##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Lets load up ggplot2

library(ggplot2)

Lets set our parameters for ggplot

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x=dose,y=len))
f + geom_col()

Lets look at some boxplots

data("ToothGrowth")

Lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth,4)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw()+
    theme(legend.position = "top")
)

Lets start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth,aes(x=dose,y=len))
tg+geom_boxplot()

Now lets look at a boxplot with points for the mean

tg +geom_boxplot(notch = TRUE,fill="lightgrey")+
  stat_summary(fun.y=mean,geom="point",shape=18,size=2.5,color="indianred")

## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can also change the scale number of variables included, and their order

tg + geom_boxplot()+
  scale_x_discrete(limits=c("0.5","2"))

## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

Let put our x axis in descending order

tg + geom_boxplot()+
  scale_x_discrete(limits=c("2","1","0.5"))

We can also change boxplot colors by groups

tg+geom_boxplot(aes(color =dose))+
  scale_color_manual(values=c("indianred","blue1","green2"))

What if we want to display our data subset data subset by oj vs vitamin c?

tg2 <- tg + geom_boxplot(aes(fill=supp),position=position_dodge(0.9))+
  scale_fill_manual(values = c("#999999","#E69F00"))
tg2

We can also arrange this as two plots with facet_wrap

tg2+facet_wrap(~supp)

Lets get started with heatmaps

#install.packages(heatmap3)

library(heatmap3)

Now lets get our data.

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle (data)))

dimnames(data2) <- dimnames(.preformat.ts (data))

Now lets generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

Now lets play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now lets apply our color selections

heatmap(data2, ColSideColors = cc)

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
col = colorRampPalette(brewer.pal (8, "PiYG"))(25))

Theres more that we can customize

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:plotrix':
## 
##     plotCI

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(data2, ColSideColors = cc,
          col =colorRampPalette(brewer.pal(8, "PiYG"))(25))

set.seed(1234)
wdata=data.frame(
  sex = factor(rep(c("F","M"),each = 200)),
  weight = c(rnorm(200,56),rnorm(200,58))
)

head(wdata,4)

##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now let load dplyr

library(dplyr)

mu <- wdata %>% 
  group_by(sex) %>% 
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position ="bottom")
  
)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black",fill="grey") + 
  geom_vline(aes(xintercept = mean(weight)), 
          linetype ="dashed",size = 0.6 )

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill="white", position = "identity")+
  scale_color_manual(values= c("#00AFBB","#E7B800"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill=sex), position = "identity")+
  scale_color_manual(values= c("#00AFBB","#E7B800")) +
  scale_fill_manual(values = c("indianred","lightblue1"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to combine density plot and histograms?

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")

## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex), 
                   fill = "white", position = "identity")+
  geom_density(aes(color = sex), size = 1) +
  scale_color_manual(values = c("indianred","lightblue1"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

plot(pressure)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) + geom_freqpoly(mapping = aes(color = cut), bindwith = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwith = 500): Ignoring
## unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Its hard to see the difference in distribution because the counts differ so much

ggplot(diamonds) + geom_bar(mapping = aes(x = cut))

to make the comparison easier, we need to swap the display on y-axis. Instead od displaying count, we’ll display density, which is count that area under the curve

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + geom_freqpoly(mapping = aes(color = cut),bindwidth = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

the fair diamonds have the highest average price. Thats because frequnecy polygons are la little hard to interpret.

Another alternative is the boxplot. A boxplot is a type of visual shorthand

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + geom_boxplot()

We see much less information about the distrubution, but the boxplots are much more compact, so we can more easily compare them.Supports the counterintuitive finding the better quaility diamonds

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + geom_boxplot()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

ggplot(data = mpg) + geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + coord_flip()

visualize the correlation between to continuos variable, use a scatter plot

ggplot(data = diamonds) + geom_point(mapping = aes(x = carat, y = price))

Scatterplots becomes less useful as the size of your dataset grows, because we get overplot. We can fix this using the alpha aesthetic

ggplot(data = diamonds) + geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

irst lets load a required library

library(RCurl)
library(dplyr)

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"

College_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(College_Data)

## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset aplhadically by college

aplhabetical <- College_Data %>%
  arrange(College_Data$college)

The glimpse package is another way to preview data

glimpse(College_Data)

## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset with select()

College_Cases <- select(College_Data, college, cases)

We can also filter or subset the filter function

Louisiana_Cases <- filter(College_Data , state == "Louisiana")

Lets filter out smaller amount of states

South_Cases <- filter(College_Data, state == "Louisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)
library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

State_Data <- read.csv(state_site)

lets create group_by object using the state column

state_cases <- group_by(State_Data, state)

class(state_cases)

## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurfments were made by states? This gives us an idea of when states reporting

Days_since_first_reported <- tally(state_cases)

Lets visualize some data Frist lets start off with some definitions

Data - obvous - the stuff want to visualize

Layer - made of gemetric elements and statistical info

Scales = used to map values in teh data space used for creation of values

Coordinate SYstem - describes how data coordinates are mapped together

Faceting - how to break up data into subsets to display types

theme - controls the finer points of the display, font,size and background color

options(repr.plot.width = 6, repr.plot.height = 6)
class(College_Data)

## [1] "data.frame"

head(College_Data)

##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA

summary(College_Data)

##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Now lets take a look at a different dataset

iris <- as.data.frame(iris)
class(iris)

## [1] "data.frame"

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

Lets start by creating a scatter plot of the College Data

ggplot(data = College_Data, aes(x = cases, y = cases_2021)) + geom_point() + theme_minimal()

## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

ggplot(data = iris , aes(x = Sepal.Width, y = Sepal.Length)) + geom_point() + theme_minimal()

Lets color coordinate our college data

ggplot(data = College_Data, aes(x = cases, y = cases_2021,color=state)) + geom_point() + theme_minimal()

## Warning: Removed 337 rows containing missing values (`geom_point()`).

Color the iris data

ggplot(data = iris , aes(x = Sepal.Width, y = Sepal.Length,color = Species)) + geom_point() + theme_minimal()

simple histogram

hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", main = "Total College Covid-19 Infections (Louisiana)")

simple histogram for iris data

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", ylab = "Frequency",main = "Iris Sepal width")

histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))

histogram_college + geom_histogram(bindwidth = 100, color = "black", aes(fill = county)) + xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 Cases in Louisiana")

## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lets create a ggplot for iris data

histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))

histogram_iris + geom_histogram(bindwith = 0.2, color = "black", aes(fill = Species)) + xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")

## Warning in geom_histogram(bindwith = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindwith`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(South_Cases) + geom_density(aes(x = cases, fill = state), alpha = 0.25)

ggplot(iris) + geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

Violin plots

ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) + geom_violin()+theme_classic()+theme(legend.position = "none")

Now lets try the south data

ggplot(data = South_Cases, aes(x=state , y = cases, color = state)) + geom_violin() + theme_gray() + theme(legend.position = "none")

Taking a look at residuals plots the graph displays the residuals on a vertical axis and the independent variable on the horizontal.

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) + geom_point(aes(x = .fitted, y = .resid))

Now look at the southern states

ggplot(lm(cases ~ cases_2021, data = South_Cases)) + geom_point(aes(x = .fitted, y = .resid))

A linear model is not good for state cases

obesity <- read.csv("Obesity_insurance.csv")

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:RCurl':
## 
##     complete

library(dplyr)
library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:ggpubr':
## 
##     mutate

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

class(obesity)

## [1] "data.frame"

summary(obesity)

##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

hist(obesity$charges)

we can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now test look at correlation. The cor() command is used to determine correlations betwwen two vectors, all of the colums of a data frame, or two data frames. The cov() command, on the other hand examins the covariance.

cor(obesity$charges, obesity$bmi)

## [1] 0.198341

This test uses a spearman correlation, or you can use Kendall’s tau by specifying it

cor(obesity$charges, obesity$bmi,method = "kendall")

## [1] 0.08252397

correlation measures strength of a correlation between -1 and 1 Now lets look at th Tietjen=Moore test. This is used for univariate datasets.

TietjenMoore <- function(dataSeries,k)
{
  n = length(dataSeries)
  #Compute the absolute residuals
  r = abs(dataSeries - mean(dataSeries))
  # Sort data according to size of residual
  df = data.frame(dataSeries,r)
  dfs = df[order(df$r),]
  #create a subset of the data without larger values
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  # Compute the sums of squares
  ksub = (subdataSeries = mean(subdataSeries)) ** 2
  all = (df$dataSeries - mean(df$dataSeries)) **2
  sum(ksub)/sum(all)
}

function coputes the absolutre residuals and sorts data accordingly

FindOutliersTietjenMooreTest <- function(dataSeries,k,alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  test = c(1:10000)
  for(i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries,k)
  }
  Talpha = quantile(test,alpha)
  list(T = ek, Talpha = Talpha)
}

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges,50)

## $T
## [1] 0.0007249096
## 
## $Talpha
##          50% 
## 2.979474e-07

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi,2)

## $T
## [1] 0.01886975
## 
## $Talpha
##          50% 
## 2.533423e-07

First, it loads the necessary package

library(ggplot2)
library(ggridges)

BiocManager::install("ggridges")

## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: https://cloud.r-project.org

## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'ggridges'

## Installation paths not writeable, unable to update packages
##   path: /usr/lib/R/library
##   packages:
##     boot, class, cluster, codetools, foreign, KernSmooth, lattice, nlme, nnet,
##     rpart, spatial, survival

## Old packages: 'abind', 'ade4', 'adegenet', 'apcluster', 'ape', 'aplot',
##   'askpass', 'backports', 'bbmle', 'bdsmatrix', 'BH', 'bio3d', 'BiocFileCache',
##   'BiocManager', 'biomaRt', 'Biostrings', 'bit', 'bit64', 'bitops', 'brew',
##   'brio', 'broom', 'BSgenome', 'bslib', 'cachem', 'callr', 'car', 'caTools',
##   'checkmate', 'circlize', 'cli', 'coda', 'colorspace', 'commonmark',
##   'corrplot', 'cowplot', 'cpp11', 'crayon', 'credentials', 'csaw', 'curl',
##   'data.table', 'DBI', 'dbplyr', 'deldir', 'dendextend', 'desc', 'DESeq2',
##   'digest', 'distory', 'doRNG', 'dotCall64', 'downlit', 'DT', 'e1071', 'edgeR',
##   'evaluate', 'expm', 'fansi', 'farver', 'fastcluster', 'fastmap', 'fastmatch',
##   'ff', 'fields', 'filelock', 'fontawesome', 'fs', 'GenomeInfoDb',
##   'GenomicAlignments', 'GenomicFeatures', 'gert', 'ggdendro', 'ggforce',
##   'ggfun', 'ggplot2', 'ggraph', 'ggrepel', 'ggsci', 'ggtree', 'gh', 'glue',
##   'googleVis', 'gplots', 'graphlayouts', 'gtable', 'haven', 'highr', 'Hmisc',
##   'htmlTable', 'htmltools', 'htmlwidgets', 'httpuv', 'httr2', 'igraph',
##   'imager', 'interp', 'jsonlite', 'kernlab', 'knitr', 'labelled', 'lamW',
##   'later', 'leaflet', 'LiblineaR', 'lme4', 'locfit', 'lubridate',
##   'Luminescence', 'maps', 'markdown', 'matrixStats', 'mclust', 'metapod',
##   'minqa', 'mosaic', 'munsell', 'mvtnorm', 'nloptr', 'NLP', 'openssl',
##   'patchwork', 'pbkrtest', 'phangorn', 'phytools', 'pillar', 'pixmap',
##   'pkgbuild', 'pkgdown', 'pkgload', 'plotly', 'polyclip', 'pool', 'processx',
##   'profvis', 'progress', 'promises', 'ps', 'purrr', 'quantreg', 'R6', 'ragg',
##   'raster', 'Rcpp', 'RcppArmadillo', 'RcppEigen', 'RcppParallel', 'RcppThread',
##   'RCurl', 'readr', 'remotes', 'reprex', 'rgl', 'Rhtslib', 'rJava', 'rjson',
##   'rlang', 'rmarkdown', 'RMySQL', 'roxygen2', 'rrBLUP', 'RSQLite',
##   'rstudioapi', 'rvest', 'S4Arrays', 'sass', 'segmented', 'seqinr',
##   'sessioninfo', 'shape', 'shiny', 'shinycssloaders', 'slam', 'sp', 'spam',
##   'SparseArray', 'SparseM', 'stringi', 'sys', 'systemfonts', 'terra',
##   'testthat', 'textshaping', 'tidygraph', 'tidyr', 'tidyselect', 'tidytext',
##   'tidytree', 'timechange', 'tinytex', 'tm', 'topicmodels', 'tweenr',
##   'usethis', 'uuid', 'vctrs', 'vegan', 'viridis', 'vroom', 'waldo', 'withr',
##   'xfun', 'XML', 'xml2', 'xopen', 'xts', 'yaml', 'yulab.utils', 'zip',
##   'zlibbioc', 'zoo'

?airquality

air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air

## Picking joint bandwidth of 2.65

library(viridis)

## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:scales':
## 
##     viridis_pal

ggplot(airquality) + aes(Temp, Month , group = Month, fill = ..x..) + 
  geom_density_ridges_gradient() + 
  scale_fill_viridis(option = "C",name = "Temp")

## Picking joint bandwidth of 2.65

library(tidyr)

airquality %>%
  gather(key = "measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot(aes(x = value, y = Month, group = Month)) +
  geom_density_ridges() +
  facet_wrap(~ measurement, scales = "free")

## Picking joint bandwidth of 11

## Picking joint bandwidth of 40.1

## Picking joint bandwidth of 2.65

## Picking joint bandwidth of 1.44

## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

First lets load our required packages

library(plotly)

## 
## Attaching package: 'plotly'

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Now lets create a random 3d matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5),
  y <- seq(1,10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x),ncol = length(d$y))

Now lets plot our 3D data

plot_ly(d, x=~x, y= ~y, z = ~z) %>% 
  add_surface()

Lets add some more aspects to it, such as at topogrophy

plot_ly(d, x =~x,y=~y,z=~z) %>% 
  add_surface(
    contours = list(
      z=list(
        show=TRUE,
        usecolormap=TRUE,
        highlightcolor="FF0000",
        project = list(z=TRUE)
      )
    )
  )

Now lets look at a 3d scatter plot

plot_ly(longley,x=~GNP,y=~Population, z=~Employed, marker= list(color = ~GNP)) %>% 
  add_markers()

First lets load our required package

library(plotly)

Lets start with a scatter plot of the Orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data =Orange, x =~age, y = ~circumference)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference)
        )

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean =120, sd =10)
new_data <- data.frame(Orange, trace_1)

We’ll use the random numbers as line on the graph

plot_ly(data = new_data, x = ~age, y= ~circumference, color =~Tree, size =~age,
        text =~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>% 
  add_trace(y =~trace_1, mode = 'lines') %>% 
  add_trace(y = ~circumference, mode ='markers')

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line and add labels

plot_ly(data = Orange, x= ~age, y = ~circumference,
       color = ~Tree, size = ~circumference,
       text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:",circumference)) %>% 
  add_trace(y = ~circumference, mode = 'markers') %>% 
  layout(
    title = "Plot Orange data with switchable trace",
    updatemenus = list(
      list(
        type = "downdrop",
        y = 0.8,
        buttons = list(
          list(method = "restyle",
               args = list("mode", "markers"),
               label = "Marker"
          ),
          list(method = "restyle",
               args = list("mode", "lines"),
               label = "Lines"
          )
        )
      )
    )
    
  )

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets take a look at qq plots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some datal

wdata =data.frame(
  sex = factor(rep(c("F", "M"), each =200)),
  weight =c(rnorm(200, 55), rnorm(200, 58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
theme_classic()+
theme (legend.position = "top")
)

create a qq plot of the weight

ggplot(wdata, aes(sample = weight)) +
stat_qq(aes(color = sex))+
scale_color_manual(values = c("#0073C2FF", "#FC4E07")) +
labs(y ="weight")

#install.packages(ggpubr)
library (ggpubr)

ggqqplot(wdata, x= "weight",
color = "sex", 
palettes = c("#0073C2FF", "#FC4E07"),
ggtheme =theme_pubclean())

#install.packages(mnonr)
library (mnonr)

data2 <- mnonr::mnonr(n = 1000, p=2, ms = 3, mk = 61, Sigma = matrix(c(1,0.5, 0.5, 1), 2, 2), initial = NULL)
data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample=V1)) +
stat_qq()

ggqqplot(data2, x= "V1",
         palette = "#0073C2FF",
         ggthem = theme_pubclean())

library(ggplot2)
library(ggridges)

#BiocManager::install("ggridges")

Now lets load some sample data

?airquality

air <- ggplot(airquality) +  aes(Temp, Month, group = Month) + geom_density_ridges()

air

## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)

ggplot(airquality) + aes(Temp, Month, group = Month, fill=after_stat(x)) +
  geom_density_ridges_gradient() + 
  scale_fill_viridis(option = "C", name = "Temp")

## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data.

library(tidyr)

airquality %>% 
gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) +
  geom_density_ridges()+
  facet_wrap(~ Measurement, scales = "free")

## Picking joint bandwidth of 11

## Picking joint bandwidth of 40.1

## Picking joint bandwidth of 2.65

## Picking joint bandwidth of 1.44

## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

The Sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.

AFFIN bing nrc

bing categorizes words in a binary fashion into positive or negative nrc categorizes into positive, negative, anger, anticipation, discust, fear, joy, sadness, suprise and trust. AFFIN assigns a score between -5 and 5, with negative indicating negative sentiment, and 5 positive.

The function get sentiments() allows us to get the specific sentiments lexicon with the measures for each one.

install.packages("textdata")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(tidytext)
library(textdata)

affin <- get_sentiments("afinn")

affin

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

Lets look at bing

bing <- get_sentiments("bing")

bing

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

And lastly nrc

nrc <- get_sentiments("nrc")

nrc

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created either using crowdourcing or cloud computing/ai like Amazon Mechanical Turk, or by labor of one oif the authors, and then validated with crowdsourcing.

Lets look at the words with a joy score from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(c(944, 1227, 1228,2300), mirror ="http://mirror.csclub.uwaterloo.ca/gutenberg")

tidy_books <- darwin %>%
group_by(gutenberg_id)  %>%
mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
ungroup() %>% 
unnest_tokens(word, text)

Lets add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944]<-"The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book ==  1228] <- "On the origin of species By Means of Natural selection"
tidy_books$book[tidy_books$book ==  2300] <- "The Descent of Man, and selection in Belation to sex"

tidy_books

## # A tibble: 786,575 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 786,565 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  dplyr::count(word) %>%
  arrange(desc(n))  # Sort in descending order

## Joining with `by = join_by(word)`

## # A tibble: 0 × 2
## # ℹ 2 variables: word <chr>, n <int>

We can also examine how sentiment changes throughout a work.

library(dplyr)
library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  dplyr::count(book, index = linenumber %/% 80, sentiment) %>%  # Explicitly calling dplyr::count
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

Now lets plot it

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) + 
facet_wrap(~book, ncol = 2, scales = "free_x")

There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of TVOTB.

library(tidyr)

voyage <- tidy_books %>%
filter(book == "The Voyage of the Beagle")

voyage

## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows

Lets again use interger division (‘%/%’) to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.

affin <- voyage %>%
inner_join(get_sentiments("afinn")) %>% 
group_by(index = linenumber %/% 80) %>% 
summarise(sentiment = sum(value)) %>%
mutate (method = "AFINN")

## Joining with `by = join_by(word)`

bing_and_nrc <- bind_rows(
voyage %>%
  inner_join(get_sentiments("bing")) %>% 
mutate (method = "Bing et al."),
voyage %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative"))
)%>%
  mutate(method="NRC")) %>% 
dplyr::count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%

mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1154 of `x` matches multiple rows in `y`.
## ℹ Row 4245 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

We can now estimate the net sentiment (positive negative) in each chunk of the novel text for each lexion (dictionary). Lets bind them all together and visualize with ggplot

bind_rows(affin, bing_and_nrc) %>%
ggplot(aes (index, sentiment, fill= method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales ="free_y")

Lets look at the counts based on each dictionary

get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>% 

dplyr::count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

get_sentiments("bing") %>% 
  dplyr::count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
ungroup()

## Joining with `by = join_by(word)`

bing_word_counts

## # A tibble: 2,492 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive   1226
##  2 well       positive    855
##  3 like       positive    813
##  4 good       positive    487
##  5 doubt      negative    414
##  6 wild       negative    317
##  7 respect    positive    310
##  8 remarkable positive    295
##  9 important  positive    281
## 10 bright     positive    258
## # ℹ 2,482 more rows

This can be shown visually, and we can pipe straight into ggplot2

bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill =sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scale = "free_y") +
labs(x = "contribution to Sentiment", y = NULL)

Lets spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)

custom_stop_words

## # A tibble: 1,153 × 2
##    word      lexicon
##    <chr>     <chr>  
##  1 wild      custom 
##  2 dark      custom 
##  3 great     custom 
##  4 like      custom 
##  5 a         SMART  
##  6 a's       SMART  
##  7 able      SMART  
##  8 about     SMART  
##  9 above     SMART  
## 10 according SMART  
## # ℹ 1,143 more rows

bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scale = "free_y") +
labs(x = "Contribution to Sentiment", y = NULL)

word clouds!

We can see that tidy text mining and sentimnet analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques lets use the wordcloud package!!

library(dplyr)
library(wordcloud)

## 
## Attaching package: 'wordcloud'

## The following object is masked from 'package:gplots':
## 
##     textplot

tidy_books %>%
  anti_join(stop_words) %>%
  dplyr::count(word) %>%  # Ensuring dplyr::count() is used
  with(wordcloud(word, n, max.words = 100))

## Joining with `by = join_by(word)`

Lets also look at comparison.cloud(), which may require turing the dataframe into a matrix.

We can change to matrix using the acast() function.

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(dplyr)

tidy_books %>%
inner_join(get_sentiments("bing")) %>%
dplyr::count(word, sentiment, sort = TRUE) %>%
acast(word ~sentiment, value.var= "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)

## Joining with `by = join_by(word)`

Looking at units beyond words

Lots of useful work can be done by tokenizing nat the word level, but sometimes ints nice to look at different units of text. For example, we can look beyond just unigrams.

Ex I am not having a good day.

library(dplyr)

bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  dplyr::summarize(words = n(), .groups = "drop")  # Explicitly use dplyr::summarize()

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  dplyr::summarize(negativewords = n(), .groups = "drop") %>%  # Explicitly use dplyr::summarize()
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()

## Joining with `by = join_by(word)`

## # A tibble: 1 × 5
##   book                                        chapter negativewords words  ratio
##   <chr>                                         <int>         <int> <int>  <dbl>
## 1 The Expression of the Emotions in Man and …      10           249  4220 0.0590

First we’ll look at the unnest token function Lets start by looking at an Emily Dickenson passage

text <- c("Because I could not stop from Death =",
"He kindly stopped for me - ",
"The Carriage held but just ourselves -",
"and Immortality")

text

## [1] "Because I could not stop from Death =" 
## [2] "He kindly stopped for me - "           
## [3] "The Carriage held but just ourselves -"
## [4] "and Immortality"

This is a typlical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a datafra

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df

## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "Because I could not stop from Death =" 
## 2     2 "He kindly stopped for me - "           
## 3     3 "The Carriage held but just ourselves -"
## 4     4 "and Immortality"

Reminder: A tibble is a modern clas sof data frmae within R. ITs available in the dplyr and tibble packages, that has a 36 convenient print method, will not convert strongs to factors, and does not use row names. Tibbles are great for use with tidy tools.

Next we will use the ’unest_tokens function.

First we have the output column name that will be created as the text is unnested into it

library(tidytext)

text_df %>%
unnest_tokens(word, text)

## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeaustenr package to analyze some Jane Austen texts. There are 6 books in this package. 52

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\dtvxlc]",
ignore_case = TRUE)))) %>%

ungroup()

original_books

## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

so far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.

Lets look at some methods of tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228,2300), mirror = "http://mirror.csclub.uwaterloo.ca/gutenberg")
colnames (darwin_books) [1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The voyage of the Beagle"
darwin_books$book[darwin_books$book ==1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the origin of Species By Means of Natural selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and selection in Relation to Sex"


darwin_bigrams <- darwin_books %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)

darwin_bigrams

## # A tibble: 724,531 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The voyage of the Beagle the voyage    
##  2 The voyage of the Beagle voyage of     
##  3 The voyage of the Beagle of the        
##  4 The voyage of the Beagle the beagle    
##  5 The voyage of the Beagle beagle by     
##  6 The voyage of the Beagle charles darwin
##  7 The voyage of the Beagle <NA>          
##  8 The voyage of the Beagle <NA>          
##  9 The voyage of the Beagle <NA>          
## 10 The voyage of the Beagle <NA>          
## # ℹ 724,521 more rows

This data is still in tidytext format, and ist structured as one-token-per-row. Each token is a bigram.

Counting and filtering n-gram

library(dplyr)  # Ensure dplyr is loaded

darwin_bigrams %>%
  dplyr::count(bigram, sort = TRUE)  # Explicitly use dplyr::count()

## # A tibble: 238,516 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the   11297
##  2 <NA>      8947
##  3 in the    5257
##  4 on the    4093
##  5 to the    2849
##  6 the same  2048
##  7 that the  1947
##  8 it is     1830
##  9 of a      1610
## 10 and the   1590
## # ℹ 238,506 more rows

Most of the common bigrams are stop-words. Thiscan be a good time to use tidyr’s seperate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
separate(bigram, c("word1", "word2"), sep =" ")

bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)

bigrams_filtered

## # A tibble: 94,896 × 3
##    book                     word1   word2  
##    <chr>                    <chr>   <chr>  
##  1 The voyage of the Beagle charles darwin 
##  2 The voyage of the Beagle <NA>    <NA>   
##  3 The voyage of the Beagle <NA>    <NA>   
##  4 The voyage of the Beagle <NA>    <NA>   
##  5 The voyage of the Beagle <NA>    <NA>   
##  6 The voyage of the Beagle <NA>    <NA>   
##  7 The voyage of the Beagle online  edition
##  8 The voyage of the Beagle <NA>    <NA>   
##  9 The voyage of the Beagle degree  symbol 
## 10 The voyage of the Beagle degs    italics
## # ℹ 94,886 more rows

New bigram counts

bigram_counts <- bigrams_filtered %>% 
  unite(bigram, word1,word2, sep =" ")

bigram_counts

## # A tibble: 94,896 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The voyage of the Beagle charles darwin
##  2 The voyage of the Beagle NA NA         
##  3 The voyage of the Beagle NA NA         
##  4 The voyage of the Beagle NA NA         
##  5 The voyage of the Beagle NA NA         
##  6 The voyage of the Beagle NA NA         
##  7 The voyage of the Beagle online edition
##  8 The voyage of the Beagle NA NA         
##  9 The voyage of the Beagle degree symbol 
## 10 The voyage of the Beagle degs italics  
## # ℹ 94,886 more rows

we may also be interested in trigrams, which are three word combos

library(dplyr)
library(tidytext)
library(tidyr)

trigrams <- darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(word1 %in% stop_words$word,
         !word2 %in% stop_words$word, 
         !word3 %in% stop_words$word) %>%
  dplyr::count(word1, word2, word3, sort = TRUE)  # Explicitly using dplyr::count()

trigrams

## # A tibble: 40,256 × 4
##    word1   word2      word3         n
##    <chr>   <chr>      <chr>     <int>
##  1 the     lower      animals     127
##  2 of      natural    selection    86
##  3 the     breeding   season       83
##  4 through natural    selection    73
##  5 through sexual     selection    70
##  6 of      sexual     selection    52
##  7 the     zoological gardens      49
##  8 the     upper      lip          44
##  9 by      natural    selection    42
## 10 in      south      america      42
## # ℹ 40,246 more rows

Lets analyze some bigrams

library(dplyr)

bigrams_filtered %>% 
  filter(word2 == "selection") %>% 
  dplyr::count(book, word1, sort = TRUE)  # Explicitly calling dplyr::count()

## # A tibble: 39 × 3
##    book                                                   word1           n
##    <chr>                                                  <chr>       <int>
##  1 The Descent of Man, and selection in Relation to Sex   sexual        254
##  2 On the origin of Species By Means of Natural selection natural       250
##  3 The Descent of Man, and selection in Relation to Sex   natural       156
##  4 On the origin of Species By Means of Natural selection sexual         18
##  5 On the origin of Species By Means of Natural selection continued       6
##  6 The Descent of Man, and selection in Relation to Sex   unconscious     6
##  7 On the origin of Species By Means of Natural selection methodical      5
##  8 The Descent of Man, and selection in Relation to Sex   continued       5
##  9 On the origin of Species By Means of Natural selection unconscious     4
## 10 The Expression of the Emotions in Man and Animals      natural         4
## # ℹ 29 more rows

lsts again look at tf-idf across bigrams across Darwins work.

library(dplyr)
library(tidytext)

bigram_tf_idf <- bigram_counts %>% 
  dplyr::count(book, bigram) %>%  # Explicitly using dplyr::count()
  bind_tf_idf(bigram, book, n) %>% 
  arrange(desc(tf_idf))

bigram_tf_idf

## # A tibble: 60,595 × 6
##    book                                       bigram     n      tf   idf  tf_idf
##    <chr>                                      <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and… nerve…    47 0.00350 1.39  0.00485
##  2 On the origin of Species By Means of Natu… natur…   250 0.0160  0.288 0.00460
##  3 The Expression of the Emotions in Man and… la ph…    35 0.00260 1.39  0.00361
##  4 The voyage of the Beagle                   bueno…    54 0.00245 1.39  0.00339
##  5 The voyage of the Beagle                   capta…    53 0.00240 1.39  0.00333
##  6 On the origin of Species By Means of Natu… glaci…    36 0.00230 1.39  0.00319
##  7 The voyage of the Beagle                   fitz …    50 0.00227 1.39  0.00314
##  8 The Expression of the Emotions in Man and… muscl…    30 0.00223 1.39  0.00310
##  9 The Expression of the Emotions in Man and… orbic…    29 0.00216 1.39  0.00299
## 10 The Expression of the Emotions in Man and… dr du…    26 0.00194 1.39  0.00268
## # ℹ 60,585 more rows

bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
group_by(book) %>%
slice_max(tf_idf, n = 10) %>%
ungroup() %>%
mutate(bigram = reorder(bigram, tf_idf)) %>%
ggplot(aes (tf_idf, bigram, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf of bigrams", y = NULL)

Using bigrams to profice contextin sentiment analysis

library(dplyr)

bigrams_separated %>% 
  filter(word1 == "not") %>% 
  dplyr::count(word1, word2, sort = TRUE)  # Explicitly using dplyr::count()

## # A tibble: 867 × 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      128
##  2 not   have    104
##  3 not   only    103
##  4 not   a       100
##  5 not   to       98
##  6 not   been     89
##  7 not   the      82
##  8 not   at       70
##  9 not   know     60
## 10 not   so       58
## # ℹ 857 more rows

By doing sentiment analysis on bigrams, we can examine hwo oftenn sentiment-associated words are preceded by a modifier like “not” or other negating words.

AFINN <- get_sentiments("afinn")

AFINN

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

We can examine the most frequent words that were preceded by “not”, and associate with sentiment.

library(dplyr)

not_words <- bigrams_separated %>% 
  filter(word1 == "not") %>% 
  inner_join(AFINN, by = c(word2 = "word")) %>% 
  dplyr::count(word2, value, sort = TRUE)  # Explicitly using dplyr::count()

not_words

## # A tibble: 114 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 doubt        -1    25
##  2 like          2    11
##  3 pretend      -1     9
##  4 wish          1     8
##  5 admit        -1     7
##  6 difficult    -1     5
##  7 easy          1     5
##  8 reach         1     5
##  9 extend        1     4
## 10 forget       -1     4
## # ℹ 104 more rows

Lets visualize

library(ggplot2)

not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number or occurences", y = "words preceded by \"not\"")

library(dplyr)

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>% 
  filter(word1 %in% negation_words) %>% 
  inner_join(AFINN, by = c(word2 = "word")) %>% 
  dplyr::count(word1, word2, value, sort = TRUE)  # Explicitly using dplyr::count()

negated_words

## # A tibble: 208 × 4
##    word1   word2     value     n
##    <chr>   <chr>     <dbl> <int>
##  1 no      doubt        -1   210
##  2 not     doubt        -1    25
##  3 no      great         3    19
##  4 not     like          2    11
##  5 not     pretend      -1     9
##  6 not     wish          1     8
##  7 without doubt        -1     8
##  8 not     admit        -1     7
##  9 no      greater       3     6
## 10 not     difficult    -1     5
## # ℹ 198 more rows

Lets visualize the negation words

negated_words %>%
mutate(contribution = n *value,
word2= reorder(paste(word2, word1, sep = "_"), contribution)) %>%
group_by(word1) %>%
slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
ggplot(aes(word2, contribution, fill = n* value > 0)) +
geom_col(show.legened = FALSE) +
facet_wrap(~ word1, scales = "free") +
scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
xlab("Words preceded by negation term") +
ylab("sentiment value # of occurences") +
coord_flip()

## Warning in geom_col(show.legened = FALSE): Ignoring unknown parameters:
## `show.legened`

Visualize a network of bigrams with ggraph

library(dplyr)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:plotly':
## 
##     groups

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

bigram_counts <- bigrams_filtered %>%
  dplyr::count(word1, word2, sort = TRUE)  # Explicitly use dplyr::count()

bigram_graph <- bigram_counts %>%
  filter(n > 20) %>% 
  graph_from_data_frame()

## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"

bigram_graph

## IGRAPH 4eef891 DN-- 203 140 -- 
## + attr: name (v/c), n (e/n)
## + edges from 4eef891 (vertex names):
##  [1] NA        ->NA          natural   ->selection   sexual    ->selection  
##  [4] vol       ->ii          lower     ->animals     sexual    ->differences
##  [7] south     ->america     distinct  ->species     secondary ->sexual     
## [10] breeding  ->season      closely   ->allied      sexual    ->characters 
## [13] tierra    ->del         del       ->fuego       vol       ->iii        
## [16] de        ->la          natural   ->history     fresh     ->water      
## [19] north     ->america     bright    ->colours     sexual    ->difference 
## [22] allied    ->species     tail      ->feathers    strongly  ->marked     
## + ... omitted several edges

library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes (label= name), vjust = 1, hjust = 1)

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

we can also add directionality to this network

set.seed (1234)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, 
               arrow =a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label=name), vjust = 1, hjust = 1) +
theme_void()

You can see that the usual supspects are the most common words, but don’t tell us anything about what the books topic is.

A density plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F","M"), each =200)),
  weight = c(rnorm(200,55), rnorm(200,58))
)

library(dplyr)
mu <- wdata %>% 
  group_by(sex) %>% 
summarise(grp.mean = mean(weight))

Now lets load the graphing packages

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

d + geom_density() + 
  geom_vline(aes(xintercept = mean(weight)), linetype= "dashed")

Now lets hange the y axis to count instead of density

d + geom_density(aes(y = after_stat(count)), fill  = "lightgray")+
  geom_vline(aes(xintercept = mean(weight)), linetype ="dashed")

d + geom_density(aes(color = sex)) +
  scale_color_manual(values = c("darkgray","gold"))

Lastly, lets fill the desity plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, colr = sex), data = mu, linetype = "dashed") + 
  scale_color_manual(values =c("grey","gold")) +
  scale_fill_manual(values = c("grey","gold"))

## Warning in geom_vline(aes(xintercept = grp.mean, colr = sex), data = mu, :
## Ignoring unknown aesthetics: colr

First lets load the required packages

library(ggplot2)

Lets set out therm

theme_set(
  theme_dark()+
    theme(legend.position = "top")
)

what if we wat to know what our outliers are first we need to load libraries

library(outliers)
library(ggplot2)
library(readxl)

reloud the dataset because the removed outliers

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Lets create a function using the grubb test to indetify all outliers.The grubbs test identifies outliers in a univariate dataset that is suppose to be normal

grubbs.flag <- function(x) {
  #lets create a variable called outlier and save nothing in it
  outliers <- NULL
  # Well create a varibale called test to identify whci univariate we are testing
  test <- x
  # use grubbs .test to find outliers in our variable
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  while(pv < 0.05) {
    # anything with pvalues greater than p = 0.05, we add to our empty outliers vector
    outliers <- c(outliers,as.numeric(strsplit(grubbs.result$alternative," ")[[1]][[3]]))
    # remove those outliers from our test variable
    test <- x[!x %in% outliers]
    # run the grubbs test without the outliers
    grubbs.result <- grubbs.test(test)
    # save the new p values
    pv <- grubbs.result$p.value
  }
  return(data.frame(X=x,Outliers = (x %in% outliers)))
}

identified_outliers <- grubbs.flag(Air_data$AH)

now a histogram can be created

ggplot(grubbs.flag(Air_data$AH),aes(x=Air_data$AH,color = Outliers, fill = Outliers))+ geom_histogram(bindwidth = diff(range(Air_data$AH)))+theme_bw()

## Warning in geom_histogram(bindwidth = diff(range(Air_data$AH))): Ignoring
## unknown parameters: `bindwidth`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3 , 20))

y is the width of the diamond, so anythin under 3mm or above 20 is excluded

diamonds3 <- diamonds %>%
  mutate(y=ifelse(y< 3 | y > 20, NA, y))

ggplot2 subscribes to the idea that missing values shouldn’t pass

ggplot(data = diamonds3,mapping = aes(x = x, y= y)) + geom_point()

## Warning: Removed 9 rows containing missing values (`geom_point()`).

you want to supress the warnings use na.rm = TRUE

    ggplot(data = diamonds3, mapping = aes(x = x, y = y)) + geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with makes observation with missing values different to the observation with recorded values. NYCflights13 dataset, missing values in the dep_time varibable indicate that the flight

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>% 
  ggplot(mapping = aes(sched_dep_time)) + 
  geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1/4)

Final_Project

2025-02-25

R Markdown

Including Plots

R Markdown

Including Plots