In this example we will calculate the mean and 95% confidence intervals for a numerical column grouped by another column. For example, we will use the Species and Sepal.Length fields and calculate the mean and the confidence intervals of Sepal.Length for each Species.
Watch the YouTube video link for an easier understanding of the following examples.
Plot the mean and confidence intervals in ggplot
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Using the built-in iris data set, we will use the Species and Sepal.Length columns for our example. Sepal.Length is our numerical column, for which we will calculate the mean and the 95% confidence intervals for the mean.
Because our data includes Species, we will group the data by this column, which means that we will calculate the mean and the confidence intervals for each Species.
# Keep only the grouping column (Species) and the measurement (Sepal.Length).
data <- dplyr::select(iris, Species, Sepal.Length)

# One-sample t-test on all rows pooled together; the printed result includes
# the overall mean and its 95% confidence interval.
t.test(data$Sepal.Length, conf.level = 0.95)
##
## One Sample t-test
##
## data: data$Sepal.Length
## t = 86.425, df = 149, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 5.709732 5.976934
## sample estimates:
## mean of x
## 5.843333
# Per-species mean of Sepal.Length with the lower (lci) and upper (uci) limits
# of the 95% confidence interval for that mean.
# t.test() is run once per group and its interval is held in a temporary list
# column, so both limits always come from the same test and confidence level.
dt <- data %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(
    mean = mean(Sepal.Length),
    ci = list(t.test(Sepal.Length, conf.level = 0.95)$conf.int),
    lci = ci[[1]][1],
    uci = ci[[1]][2]
  ) %>%
  dplyr::select(-ci) # drop the helper column; keep Species, mean, lci, uci
dt
# Bar chart: one bar per species at the mean, with the 95% confidence interval
# drawn as a red error bar and both limits printed (rounded to 1 decimal).
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line widths in favour of
# `linewidth`; `size` is kept so that older ggplot2 versions still honour it.
pl1 <- ggplot(data = dt, mapping = aes(x = Species)) +
  geom_bar(aes(y = mean, fill = Species), stat = "identity") +
  geom_errorbar(
    aes(ymin = lci, ymax = uci),
    width = 0.4, color = "red", size = 1
  ) +
  # vjust = 1 places the lower label below its limit, vjust = -1 the upper
  # label above its limit.
  geom_text(aes(y = lci, label = round(lci, 1)), size = 2, vjust = 1) +
  geom_text(aes(y = uci, label = round(uci, 1)), size = 2, vjust = -1) +
  theme_classic() +
  labs(
    title = "Bar chart with 95% confidence intervals",
    x = "Species",
    y = "Mean Sepal Length"
  )
pl1
# Line chart: the species means joined by a single line, each mean marked with
# a red point and its 95% confidence interval drawn as a red error bar.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line widths in favour of
# `linewidth`; `size` is kept so that older ggplot2 versions still honour it.
pl2 <- ggplot(data = dt, mapping = aes(x = Species)) +
  # group = 1 puts every species in one group so the line connects them.
  geom_line(aes(y = mean), group = 1) +
  geom_point(aes(y = mean), color = "red") +
  geom_errorbar(
    aes(ymin = lci, ymax = uci),
    width = 0.4, color = "red", size = 1
  ) +
  geom_text(aes(y = lci, label = round(lci, 1)), size = 2, vjust = 1) +
  geom_text(aes(y = uci, label = round(uci, 1)), size = 2, vjust = -1) +
  theme_classic() +
  labs(
    title = "Line chart with 95% confidence intervals",
    x = "Species",
    y = "Mean Sepal Length"
  )
pl2
# Point chart: each species mean shown as a large red point with its 95%
# confidence interval drawn as a red error bar and both limits labelled.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line widths in favour of
# `linewidth`; `size` is kept so that older ggplot2 versions still honour it.
pl3 <- ggplot(data = dt, mapping = aes(x = Species)) +
  geom_point(aes(y = mean), color = "red", size = 3) +
  geom_errorbar(
    aes(ymin = lci, ymax = uci),
    width = 0.4, color = "red", size = 1
  ) +
  geom_text(aes(y = lci, label = round(lci, 1)), size = 2, vjust = 1) +
  geom_text(aes(y = uci, label = round(uci, 1)), size = 2, vjust = -1) +
  theme_classic() +
  labs(
    title = "Point chart with 95% confidence intervals",
    x = "Species",
    y = "Mean Sepal Length"
  )
pl3
In this example we will use Mean + 1 SD and Mean - 1 SD as the error-bar limits instead of the confidence intervals.
# Per-species mean and standard deviation of Sepal.Length, plus the values one
# standard deviation above (mean_pls_sd) and below (mean_mns_sd) the mean.
dt <- data %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(
    mean = mean(Sepal.Length),
    sd = sd(Sepal.Length)
  ) %>%
  # The summary has one row per species, so the limits are plain column
  # arithmetic on the two statistics computed above.
  dplyr::mutate(
    mean_pls_sd = mean + sd,
    mean_mns_sd = mean - sd
  )
dt
# Bar chart: one bar per species at the mean, with a red error bar spanning
# mean - 1 SD to mean + 1 SD and both limits printed (rounded to 1 decimal).
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line widths in favour of
# `linewidth`; `size` is kept so that older ggplot2 versions still honour it.
pl1 <- ggplot(data = dt, mapping = aes(x = Species)) +
  geom_bar(aes(y = mean, fill = Species), stat = "identity") +
  geom_errorbar(
    aes(ymin = mean_mns_sd, ymax = mean_pls_sd),
    width = 0.4, color = "red", size = 1
  ) +
  geom_text(
    aes(y = mean_mns_sd, label = round(mean_mns_sd, 1)),
    size = 2, vjust = 1
  ) +
  geom_text(
    aes(y = mean_pls_sd, label = round(mean_pls_sd, 1)),
    size = 2, vjust = -1
  ) +
  theme_classic() +
  labs(
    title = "Bar chart showing mean and the SD",
    x = "Species",
    y = "Mean Sepal Length"
  )
pl1