knitr::opts_chunk$set(echo = TRUE, cache = TRUE)

To hide the code of a chunk, set echo = FALSE; otherwise leave it as TRUE.

Packages————————————

Loading required packages

library(ggplot2)
library(dplyr)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(CGPfunctions)
library(patchwork)
library(gridExtra)

data("iris")
ggplot(data = iris,aes(x = Sepal.Length,y = Sepal.Width))+geom_point()

geom_point(): a layer that adds scatter-plot points to the plot (geom = geometric object).

Scatter Plot

data("iris")
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Blue scatter plot of sepal width against sepal length; labs() supplies the
# plot title and replaces the default axis titles.
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(colour = "blue") +
  labs(
    x = "Sepal length",
    y = "Sepal width",
    title = "Scatter Plot of Sepal length and Sepal width"
  )

Data Visualization and ggplot

Data visualization in R with ggplot2 is based on the Grammar of Graphics. A plot is built up from several layers, which are as follows:

Building blocks (layers) of the grammar of graphics:

- Data: the data set itself.
- Aesthetics: the data are mapped onto aesthetic attributes such as x-axis, y-axis, color, fill, size, labels, alpha, shape, line width, and line type.
- Geometries: how the data are displayed, using points, lines, histograms, bars, or boxplots.
- Facets: display subsets of the data in columns and rows.
- Statistics: binning, smoothing, descriptive, intermediate.
- Coordinates: the space between data and display, using Cartesian, fixed, polar, or limits.
- Themes: non-data ink.

The mtcars (Motor Trend Car Road Tests) data set comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles. It ships with base R in the datasets package, so no additional package is needed to use it.

data(mtcars)
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Data layer: in the data layer we define the source of the information to be visualized. Let’s use the built-in mtcars data set (from base R's datasets package).

ggplot(data=mtcars)+
  labs(title="MTCars Data Plot")

Aesthetic layer ggplot2 in R Here we will display and map data-set into certain aesthetics.

# Aesthetic layer only: hp is mapped to x, mpg to y and disp to colour.
# No geom is added yet, so only the axes, title and labels are drawn.
# (`title` was previously misspelled as `tile`, so no title was applied.)
ggplot(data = mtcars, aes(x = hp, y = mpg, col = disp)) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  )

Geometric layer ggplot2 in R geometric layer control the essential elements, see how our data being displayed using point, line, histogram, bar, boxplot.

# Same mapping as the aesthetic-layer example, now with geom_point() so the
# observations are actually drawn; colour encodes displacement.
# (`title` was previously misspelled as `tile`, so no title was applied.)
ggplot(data = mtcars, aes(x = hp, y = mpg, col = disp)) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  ) +
  geom_point()

Geometric layer: adding size, color, and shape aesthetics, and then plotting a histogram.

# Point size encodes displacement.
# (`title` was previously misspelled as `tile`, so no title was applied.)
ggplot(data = mtcars, aes(x = hp, y = mpg, size = disp)) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  ) +
  geom_point()

# Point size encodes displacement; colour encodes the number of cylinders,
# converted to a factor so it is treated as a discrete variable.
# (`title` was previously misspelled as `tile`, so no title was applied.)
ggplot(data = mtcars, aes(x = hp, y = mpg, size = disp, col = factor(cyl))) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  ) +
  geom_point()

# Point shape encodes transmission type (am) and colour encodes cylinders;
# both are wrapped in factor() because shape needs a discrete variable.
# (`title` was previously misspelled as `tile`, so no title was applied.)
ggplot(data = mtcars, aes(x = hp, y = mpg, shape = factor(am), color = factor(cyl))) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  ) +
  geom_point()

# All points are drawn with shape 4 (a cross); colour encodes cylinders.
# The labels have to be chained onto the plot with `+`: previously the chain
# was broken after geom_point(shape = 4), so labs() + geom_point() was
# evaluated as a separate expression and printed NULL instead of labelling
# the plot.
ggplot(data = mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
  geom_point(shape = 4) +
  labs(
    title = "Miles per Gallon vs Horsepower",
    x = "Horse Power",
    y = "Miles Per Gallon"
  )

# here, geom_point(shape=4) is used 
# check the tutorial pdf

ggplot(data=mtcars,aes(x=hp))+
  labs(title = "Histogram of Horse-power",
       x="Horse-power",y="Count")+
  geom_histogram(binwidth = 10)

Facet layer

ggplot2 in R facet layer is used to split the data up into subsets of the entire data-set and it allows the subsets to be visualized on the same plot. Here we separate rows according to transmission type and Separate columns according to cylinders.

# Separate rows according to transmission type
p <- ggplot(data=mtcars,mapping = aes(x = hp,y = mpg,shape = factor(cyl)))+
  geom_point()
p+facet_grid(am~.)+
  labs(title = "Miles per Gallon Vs Horse-power",
       x="Horse-power",
       y="Miles per Gallon")

The line p + facet_grid(am ~ .) in the code above creates faceted plots, with one row of panels per value of the am variable.

Understanding facet_grid(am ~ .) facet_grid() creates multiple plots based on the values of a categorical or discrete numeric variable.

The formula inside facet_grid(am ~ .):

(1)The left side (am) represents the rows.

(2)The right side (.) means no faceting by columns.

This means the plot will be split into multiple rows, where each row corresponds to a unique value of the am (transmission type) variable.

q <- ggplot(data=mtcars,aes(x=hp,y=mpg,
                            shape=factor(cyl)))+
  geom_point()
q+facet_grid(.~cyl)+
  labs(title = "Miles per Gallon Vs Horse-power",
       x="Horse-power",
       y="Miles per Gallon")

q <- ggplot(data=mtcars,aes(x=hp,y=mpg,
                            shape=factor(cyl)))+
  geom_point()
q+facet_wrap(~cyl)+         
  labs(title = "Miles per Gallon Vs Horse-power",
       x="Horse-power",
       y="Miles per Gallon")

# Here, facet_wrap() is used to present the data group-wise (one panel per value of cyl)

Understanding facet_grid(. ~ cyl) facet_grid() is a function used to create multiple small plots (facets) based on the values of a categorical variable.

The formula inside facet_grid(. ~ cyl):

(1)The left side of the formula (.) represents rows in the grid. Since there’s a dot (.) here, it means no faceting by rows.

(2)The right side of the formula (~ cyl) means faceting by the cyl (number of cylinders in the mtcars dataset) in columns.

As a result, the plot will be split into multiple subplots, with each column representing a different number of cylinders (cyl).

Statistics layer

ggplot(data=mtcars,aes(x = hp,
                       y=mpg))+geom_point()+
  stat_smooth(method = lm,col="blue")+
  labs(title="Miles per Gallon Vs Horse-power",
       x="Horse-power",y="Miles per Gallon")

## `geom_smooth()` using formula = 'y ~ x'

Coordinates layer: in this layer the data coordinates are mapped onto the plane of the graphic. Here we adjust the axes and change the spacing of the displayed data to control the plot dimensions.

# Scatter plot of mpg against wt with a linear-model trend line, explicit axis
# limits and a fixed aspect ratio.
ggplot(data=mtcars,aes(x = wt,
                       y=mpg))+geom_point()+
  stat_smooth(method = lm,col="blue")+
  labs(title="Miles per Gallon Vs Weight",
       x="Weight",y="Miles per Gallon")+
  # Limits set on a scale discard observations outside the range before the
  # statistics are computed; the rendered output reports 6 rows removed for
  # both stat_smooth() and geom_point().
  scale_y_continuous("Miles per Gallon",limits = c(2,25),expand=c(0,0))+
  scale_x_continuous("Weight",limits=c(0,25),expand=c(0,0))+
  # coord_equal() forces one unit on x to take the same length as one unit on y.
  coord_equal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

Using coord_cartesian() to zoom in properly

# am is stored as numeric 0/1. Wrapping it in factor() makes colour a discrete
# grouping variable, so geom_smooth() fits one curve per transmission type
# instead of dropping the colour aesthetic with a warning.
# coord_cartesian() zooms the x-axis to [3, 6] without discarding any data.
ggplot(data = mtcars, aes(x = wt, y = hp, col = factor(am))) +
  geom_point() +
  geom_smooth() +
  coord_cartesian(xlim = c(3, 6))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

geom_smooth() adds a smoothed trend line (LOESS by default for small data sets) and helps visualize the general relationship between wt and hp.

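coord_cartesian(xlim = c(3, 6)): (1) Zooms in on the x-axis by setting xlim = c(3, 6). (2) Unlike limits set in scale_x_continuous(), which remove data outside the range, coord_cartesian() keeps all data points and only changes the displayed range, so statistics such as the smoothed line are still computed from the full data. (3) Because only xlim is given, the y-axis range is still determined by all of the data, not just by the points that remain visible after zooming.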

Theme layer: this layer controls the finer points of display, such as font size and background color.

Example-1 Theme layer -element_rect() function

ggplot(data=mtcars,
       aes(x=hp,y=mpg))+
  geom_point()+
  facet_grid(.~cyl)+
  theme(plot.background=element_rect(fill="orange",colour = "gray"))+
  labs(title="Miles per Gallon Vs Horse-power",
       x="Horse-power",
       y="Miles per Gallon")

Example-2

ggplot(data=mtcars,aes(x=hp,y=mpg))+
  geom_point()+facet_grid(am~cyl)+
  theme_gray()+
  labs(title="Miles per Gallon Vs Horse-power",
       x="Horse-power",
       y="Miles per Gallon")

ggplot2 provides many types of visualizations. The package exposes many more parameters than are shown here, which gives greater control over how the data are displayed. Many other packages integrate with ggplot2 to make visualizations interactive and animated.

# Filled 2D kernel-density contours of mpg against wt.
# after_stat(level) refers to the density level computed by the stat; it
# replaces the dot-dot form `..level..`, deprecated in ggplot2 3.4.0.
ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon", # fill the contour regions instead of drawing lines only
    color = "white"   # white outlines keep neighbouring levels distinguishable
  ) +
  scale_fill_viridis_c() +
  labs(
    title = "2D Density Contour Plot of mtcars Data-set",
    x = "Weight(wt)",
    y = "Miles per Gallon(mpg)",
    fill = "Density"
  ) +
  theme_minimal()

Here stat_density_2d() is used to generate the 2D density contour plot. The aesthetics x and y specify the variables on the x-axis and y-axis, respectively. The fill aesthetic is mapped to the computed density level (written ..level.. in older code and after_stat(level) from ggplot2 3.4.0 onwards), so that the fill color reflects the density.

stat_density_2d(aes(fill = <level>), geom = "polygon", color = "white"), where <level> is ..level.. in older code and after_stat(level) from ggplot2 3.4.0 onwards: stat_density_2d() computes the 2D density estimate of the data. (1) aes(fill = <level>): uses the computed density level to color the plot. (2) geom = "polygon": fills the density regions with colors instead of drawing only contour lines. (3) color = "white": outlines the density regions in white to create visible contour boundaries.

Creating a panel of different plots

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# Keep only the four mtcars variables that will be shown as histograms.
selected_col <- c("mpg", "disp", "hp", "drat")
selected_data <- mtcars[selected_col]

# Create histograms for individual variables
hist_plot_mpg <- ggplot(data = selected_data, aes(x = mpg)) +
  geom_histogram(binwidth = 2, fill = "blue", color = "white") +
  # y label added so this panel matches the other three histograms in the grid
  labs(title = "Histogram: Miles per Gallon", x = "Miles per Gallon", y = "Frequency")

hist_plot_disp <- ggplot(selected_data, aes(x = disp)) +
  geom_histogram(binwidth = 50, fill = "red", color = "white") +
  labs(title = "Histogram: Displacement", x = "Displacement", y = "Frequency")
 
hist_plot_hp <- ggplot(selected_data, aes(x = hp)) +
  geom_histogram(binwidth = 20, fill = "green", color = "white") +
  labs(title = "Histogram: Horsepower", x = "Horsepower", y = "Frequency")
 
hist_plot_drat <- ggplot(selected_data, aes(x = drat)) +
  geom_histogram(binwidth = 0.5, fill = "orange", color = "white") +
  labs(title = "Histogram: Drat", x = "Drat", y = "Frequency")

# Arrange the plots in a grid 
grid.arrange(hist_plot_mpg,
             hist_plot_disp,
             hist_plot_hp,
             hist_plot_drat,
             ncol = 2)

# Here, ncol=2 means the histograms are presented into 
# two columns

#Histogram ####

ggplot(data = iris) +
  geom_histogram(
    aes(x = Sepal.Length),
    binwidth = 0.5,
    fill = "lightblue",
    col = "black"
  )

ggplot(data = iris) +
  geom_histogram(
    aes(x = Sepal.Length),
    bins = 10,
    fill = "lightblue",
    col = "black"
  )

# Here, bins is the number of bins.
# It is overridden by binwidth and defaults to 30.

ggplot(data = iris) +
  geom_histogram(
    aes(x = Sepal.Length),
    bins = 10,
    fill = "lightblue",
    col = "white"
  )

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species),
                 bins = 10,
                 col = "black")

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species),
                 bins = 10,
                 col = "white")

ggplot(data = iris, aes(x = Sepal.Length, fill = Species)) +
  geom_histogram(bins = 10,
                 col = "black",
                 alpha = 0.2)

ggplot(data=iris)+
  geom_histogram(aes(x=Sepal.Length,fill=Species),bins=10,
                 col="white",alpha=0.5)

To knit to PDF, the tinytex package and a TeX distribution are needed: run install.packages("tinytex") and then tinytex::install_tinytex() in the Console.

In line code

res <- sample(1:1000,10)
res

##  [1] 364 378 102 407 512 863 545 646 114 326

sum(res)

## [1] 4257

The total of the 10 sampled numbers is 4257.

Reduce gap between plot and axis

# Histogram of Sepal.Length by species. expand = TRUE (the default) keeps the
# small padding between the data and the axes; TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
ggplot(data = iris) +
  geom_histogram(
    aes(x = Sepal.Length, fill = Species),
    bins = 10,
    col = "black",
    alpha = 0.6
  ) +
  coord_cartesian(expand = TRUE)

If TRUE, the default, adds a small expansion factor to the limits to ensure that data and axes don’t overlap. If FALSE, limits are taken exactly from the data or xlim/ylim.

# Histogram of Sepal.Length by species with a custom y-axis expansion.
# `alpha` was previously misspelled as `apha`, so the transparency was ignored
# and ggplot2 warned about an unknown parameter.
ggplot(data = iris) +
  geom_histogram(
    aes(x = Sepal.Length, fill = Species),
    bins = 10,
    col = "black",
    alpha = 0.5
  ) +
  scale_y_continuous(
    breaks = seq(-10, 30, by = 5),
    expand = expansion(
      mult = c(-0.1, 1), # lower limit pulled in by 10% of the range, upper limit pushed out by 100%
      add = c(10, 0)     # plus 10 extra units of padding below the bars, none above
    )
  )

ggplot(data=iris)+
  geom_histogram(aes(x=Sepal.Length,fill=Species),
                 bins=10,col="black",alpha=0.5)+
  scale_y_continuous(expand=expansion(add = c(0,5)))+
  scale_x_continuous(expand = expansion(add=c(0,0)))+
  labs(title="Histogram of Sepal-length as per Species",
       x="Sepal-length",
       y="Count")

#Facet#

facet_wrap

ggplot2 in R facet layer is used to split the data up into subsets of the entire data-set and it allows the subsets to be visualized on the same plot.

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species),
                 bins = 10,
                 col = "white") +
  facet_wrap(vars(Species), ncol = 1, scales = "free") # Species variable(vars) wise

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species),
                 bins = 10,
                 col = "white") +
  facet_wrap(vars(Species), ncol =1)

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species),
                 bins = 10,
                 col = "white") +
  facet_wrap(vars(Species), ncol = 1, scales = "free_y")

facet_grid

facet_grid() forms a matrix of panels defined by row and column faceting variables. It is most useful when you have two discrete variables, and all combinations of the variables exist in the data.

ggplot(data=iris)+
  geom_histogram(aes(x=Sepal.Length,fill=Species),
                 bins=10,col="white",alpha=0.5)+
  facet_grid(rows=vars(Species))

Student data is here

# Read the survey workbook through its full path instead of calling setwd(),
# so the working directory of the session is left untouched.
student_file <- file.path("D:/R-Programming/Class-12/Data", "StudentSurveyData.xlsx")
student <- readxl::read_excel(student_file)
str(student)

## tibble [111 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ID               : num [1:111] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : chr [1:111] "Female" "Male" "Male" "Male" ...
##  $ Age              : num [1:111] 20 23 21 21 23 26 21 30 20 21 ...
##  $ Class            : chr [1:111] "Sophomore" "Senior" "Freshman" "Sophomore" ...
##  $ Major            : chr [1:111] "Other" "Management" "Other" "IS" ...
##  $ Grad Intention   : chr [1:111] "Yes" "Yes" "Yes" "Yes" ...
##  $ GPA              : num [1:111] 2.88 3.6 2.5 2.5 2.8 2.34 3 3.1 3.6 3.3 ...
##  $ Employment       : chr [1:111] "Full-Time" "Part-Time" "Part-Time" "Full-Time" ...
##  $ Salary           : num [1:111] 55 30 50 45 45 83 55 85 35 42.5 ...
##  $ Social Networking: num [1:111] 5 4 2 4 7 3 3 1 0 11 ...
##  $ Satisfaction     : num [1:111] 3 4 4 6 4 2 3 2 4 4 ...
##  $ Spending         : num [1:111] 850 860 1100 1100 1000 1200 1000 700 1000 700 ...
##  $ Computer         : chr [1:111] "Laptop" "Desktop" "Laptop" "Tablet" ...
##  $ Text Messages    : num [1:111] 200 50 200 250 100 0 50 300 400 100 ...
##  $ Wealth           : num [1:111] 2 10 70 100 1 5 0.6 1 0.6 1 ...

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Gender))

# here,cols means columns

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))

# here, cols means columns

# The function factor is used to encode a vector as a factor (the terms ‘category’ and ‘enumerated type’ are also used for factors). If argument ordered is TRUE, the factor levels are assumed to be ordered. 

student %>% 
  mutate(Class=factor(Class,levels = c("Freshman","Sophomore", "Junior","Senior"))) %>% 
  ggplot()+
  geom_histogram(aes(x=Spending,fill=Employment),
                 bins=10,col="white")+
  facet_grid(rows=vars(Employment),cols=vars(Class))

#Theme#

##Built-in ggplot2 theme##

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species), 
                 bins = 10, col = "white", alpha = 0.5) +
  facet_wrap(vars(Species), ncol = 1) +
  theme_classic()

ggplot(data=iris)+
  geom_histogram(aes(x=Sepal.Length,fill=Species),
                 bins=10,col="white")+
  facet_wrap(vars(Species),ncol=1)+
  theme_bw()

ggplot(data=iris)+
  geom_histogram(aes(x=Sepal.Length,fill=Species),
                 bins=10,col="black")+
  facet_wrap(vars(Species),ncol=1)+
  theme_minimal()

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  theme_minimal()

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  theme_linedraw()

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  theme_void()

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  theme_test()

ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  ggthemes::theme_hc()

##Other pre-built theme##

# GPA histograms by employment status, faceted by Employment (rows) and
# Class (columns), styled with a ggthemes theme.
library(ggthemes)
ggplot(data=student)+
  geom_histogram(aes(x=GPA,fill=Employment),
                 bins=10,col="black",alpha=0.5)+
  facet_grid(rows=vars(Employment),cols=vars(Class))+
  theme_calc() # Theme similar to the default settings of LibreOffice Calc charts.

##Manually editing themes##

library(ggThemeAssist)

 p <- ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species), bins = 10, col = "white", alpha = 0.5) +
  facet_wrap(vars(Species), ncol = 1) +
  labs(
    title = "Histogram of Sepal Length by Species",
    x = "Sepal Length",
    y = "Frequency",
    fill = "Species",
    subtitle = "Using Facet and Other Customizations",
    caption = "Data: Iris"
  )

# ggThemeAssistGadget(p)is an app

custom_theme <- theme(
  axis.title = element_text(face = "italic"),
  plot.title = element_text(face = "bold", hjust = 0),
  legend.text = element_text(face = "italic"),
  legend.title = element_text(face = "bold.italic"),
  panel.background = element_rect(fill = "white"),
  plot.background = element_rect(fill = "white"),
  legend.position = "bottom",
  legend.direction = "horizontal"
)

# therefore,
p+custom_theme

##Theme set globally## After a theme has been set globally with theme_set(), every plot that does not specify its own theme uses the global theme.

#theme_set(theme_bw())

##Manually change color##

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species), bins = 10, col = "white", alpha = 0.5) +
  facet_wrap(vars(Species), ncol = 1) +
  scale_fill_manual(values = c("setosa" = "#6C1C80", "versicolor" = "#30A19C", "virginica" = "#123B96")) # named-vector ("setosa"="#6C1C80")

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species), bins = 10, col = "white", alpha = 0.5) +
  facet_wrap(vars(Species), ncol = 1) +
  scale_fill_brewer(palette = "Set1")# To get more color palette see the tutorial pdf

ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, fill = Species), bins = 10, col = "white", alpha = 0.5) +
  facet_wrap(vars(Species), ncol = 1) +
  scale_fill_hue(
    l = 80, c = 150,   # adjust luminosity and chroma
    h = c(90, 360)     # adjust range of hues
  )

Density plot

ggplot(data=iris)+
  geom_density(aes(x=Sepal.Length,fill=Species))

ggplot(data=iris)+
  geom_density(aes(x=Sepal.Length,fill=Species))+
  facet_wrap(vars(Species),ncol=1)+
  labs(title="Density Plot of Sepal-length as per Species",
       x="Sepal-length",
       y="Density")

#Histogram + Density plot#

ggplot(data=iris,aes(x=Sepal.Length,fill=Species))+
  geom_density(alpha=0.5,col="white")+
  geom_histogram(aes(y=after_stat(density)),alpha=0.5,bins=10)+
  facet_wrap(vars(Species),ncol=1)

Here, y = after_stat(density) is used because geom_histogram() plots counts on the y-axis by default, whereas geom_density() plots density. To overlay the histogram and the density curve on a common scale, and so avoid misreading the distribution, the histogram is rescaled from counts to density.

# Density-scaled histogram of Sepal.Length with a density curve on top, a
# dashed reference line at the mean, and the mode written as a text annotation.
ggplot(iris, aes(Sepal.Length)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, # the default, made explicit so stat_bin() does not emit its message
    color = "#000000", fill = "#0099F8"
  ) +
  geom_density(color = "#000000", fill = "#0EE100", alpha = 0.5) +
  # The mean is a single constant, so it is passed as a parameter rather than
  # through aes() (which would draw one identical line per row).
  # `linewidth` replaces `size`, deprecated for lines in ggplot2 3.4.0.
  geom_vline(
    xintercept = mean(iris$Sepal.Length),
    color = "#000000", linewidth = 1, linetype = "dashed"
  ) +
  labs(
    title = "Distribution of Sepal Length",
    subtitle = "Made by ggplot2",
    caption = "Source: Iris Data",
    x = "Sepal Length",
    y = "Density"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "blue", size = 16, face = "bold"),
    plot.subtitle = element_text(size = 10),
    plot.caption = element_text(face = "italic")
  ) +
  annotate(
    "text",
    x = 5.9, y = 0.6,
    label = paste0("Mode: ", round(DescTools::Mode(iris$Sepal.Length), 1)),
    hjust = 0
  )

Here, geom_vline() belongs to the same family as geom_abline() and geom_hline(). These geoms add reference lines (sometimes called rules) to a plot, either horizontal, vertical, or diagonal (specified by slope and intercept). They are useful for annotating plots.

Bar plot/ Column plot

ggplot(data=student)+
  geom_bar(aes(x=Computer))

ggplot(student)+
  geom_bar(aes(x=Computer ))+
  coord_flip()

ggplot(data=student)+
  geom_bar(aes(y=Computer))

student["Computer"]

## # A tibble: 111 × 1
##    Computer
##    <chr>   
##  1 Laptop  
##  2 Desktop 
##  3 Laptop  
##  4 Tablet  
##  5 Laptop  
##  6 Desktop 
##  7 Laptop  
##  8 Tablet  
##  9 Laptop  
## 10 Laptop  
## # ℹ 101 more rows

student %>% 
  count(Computer)

## # A tibble: 3 × 2
##   Computer     n
##   <chr>    <int>
## 1 Desktop     20
## 2 Laptop      80
## 3 Tablet      11

# Pre-count the devices, then draw the counts with geom_col().
# ggplot() is called with parentheses, as in the other piped plots of this
# file; a bare `ggplot` on the right-hand side only works with magrittr's %>%.
student %>%
  count(Computer) %>%
  ggplot() +
  geom_col(aes(x = Computer, y = n))

# coord_flip()
# This function is superseded because in many cases, coord_flip() can easily be replaced by swapping the x and y aesthetics, or optionally setting the orientation argument in geom and stat layers.

# Same bar chart with the axes flipped through coord_flip().
# ggplot() is called with parentheses, as in the other piped plots of this file.
student %>%
  count(Computer) %>%
  ggplot() +
  geom_col(aes(x = Computer, y = n)) +
  coord_flip()

# Otherwise,
# Horizontal bars obtained by swapping the x and y aesthetics instead of
# using coord_flip(). ggplot() is called with parentheses, as in the other
# piped plots of this file.
student %>%
  count(Computer) %>%
  ggplot() +
  geom_col(aes(y = Computer, x = n))

student %>% 
  count(Computer) %>% 
  ggplot() +
  geom_bar(aes(x = Computer, y = n), stat = "identity")

#Arranging bars#

###geom_bar modification###

ggplot(data=student)+
  geom_bar(aes(x=Computer))+
  scale_x_discrete(limits=c("Laptop","Desktop","Tablet"))

ggplot(student)+
  geom_bar(aes(x=fct_infreq(Computer)))+
  labs(x="Computer Usage Status")

# Here, fct_infreq() orders the factor levels by the number of observations in each level (largest first)

ggplot(student)+
  geom_bar(aes(x=fct_infreq(Computer) %>% fct_rev()))+
  labs(x="Computer Usage Status",
       y="Frequency",
       title="A simple bar plot")

# fct_rev reverse order of factor levels.

# Bar chart of Computer where fill is set outside aes() as a vector of three
# colours rather than mapped to a variable.
# NOTE(review): this appears to rely on there being exactly three bars and on
# their drawing order; mapping fill = Computer and using scale_fill_manual()
# looks less fragile — confirm intent.
ggplot(student) +
  geom_bar(aes(x = Computer), fill = c("black","bisque3","red")) +
  theme_bw()

ggplot(student) +
  geom_bar(aes(x = Computer, fill = Computer)) +
  scale_fill_manual(values = c("black","bisque3","red"))+
  labs(x="Computer Usage Status",
       y="Frequency",
       title="A simple bar plot")

# scale_fill_manual() function is the part of the 
# scale_manual () functions and these functions allow us 
# to specify our own set of mapping from levels in the data to 
# aesthetic values.

geom_col modification

student %>% 
  count(Computer) %>% 
  ggplot() +
  geom_col(aes(x = Computer, y = n)) + 
  scale_x_discrete(limits = c("Laptop", "Desktop", "Tablet"))

# reorder()- is a generic function. The "default" method treats its first argument as a categorical variable, and reorders its levels based on the values of a second variable, usually numeric. 

student %>% 
  count(Computer) %>% 
ggplot()+
  geom_col(aes(x=reorder(Computer,-n),y=n))

student %>% 
  count(Computer) %>% 
  ggplot(aes(x = reorder(Computer, -n), y = n, label=n)) +
  geom_col() +
  geom_text(vjust=-1, color="black", size=3.5) +
  theme_minimal() +
  ylim(0, 100)

# Here, geom_text() is documented together with geom_label(). Text geoms are useful for labeling plots. They can be used by themselves as scatterplots or in combination with other geoms, for example, for labeling points or for annotating the height of bars. geom_text() adds only text to the plot. geom_label() draws a rectangle behind the text, making it easier to read.

student %>% 
  count(Computer) %>% 
  ggplot(aes(x = reorder(Computer, -n), y = n)) +
  geom_col() +
  geom_text(aes(label=n), vjust=-0.5, color="black", size=3) +
  theme_minimal()

# Here, aes(label=n) means each bar in the bar chart will have its respective count displayed on top.

student %>% 
  count(Computer, Class) %>% 
  ggplot(aes(x = reorder(Computer, -n), y = n)) +
  geom_col() +
  geom_text(aes(label=n), vjust=-0.5, color="black", size=3) +
  theme_minimal() +
  facet_wrap(vars(Class)) +
  ylim(0, 35) +
  labs(x = "Computer Usage")

student %>% 
  count(Computer, Class) %>% 
  ggplot(aes(x = reorder(Computer, -n), y = n)) +   # -n means descending order
  geom_col(fill = "cornflowerblue") +
  geom_text(aes(label=n), vjust=-0.5, color="black", size=3) +
  theme_light() +
  facet_wrap(vars(Class)) +
  ylim(0, 35) +
  labs(x = "Device usage", y = "Freq.", title = "Frequency of Device Usage by Class") + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.text = element_text(colour = 'black'))

#Values on the bars 1#

ggplot(data=student,aes(y=Major))+
  geom_bar()+
  geom_text(aes(x=after_stat(count-1),
                label=after_stat(count)),
                stat="count",
                size=4,col="white")+
  labs(x="Frequency",y=NULL)

ggplot(student, aes(y = Major, fill = Computer)) +
  geom_bar(position = "dodge") +
  geom_text(
    aes(x = after_stat(count + 1), label = after_stat(count)),
    stat = "count",
    size = 3,
    position = position_dodge(1)
  ) +
  labs(x = "Freq.", y  = NULL)

Stacked and Percentage Filled Bar Plot

ggplot(student)+
  geom_bar(aes(x=Class,fill=Employment))

ggplot(student)+
  geom_bar(aes(x=Class,fill=Employment),position="stack")

ggplot(student)+
  geom_bar(aes(x=Class,fill=Employment),position="dodge")

ggplot(student)+
  geom_bar(aes(x=Class,fill=Employment),position="dodge2")

ggplot(student)+
  geom_bar(aes(x=Class,fill=Employment),position="fill")

#Arranging bars#

student %>% 
  mutate(Class = factor(Class, levels = c("Freshman","Sophomore", "Junior","Senior"))) %>% 
  ggplot() +
  geom_bar(aes(x = Class, fill = Employment), position = "fill")

ggplot(student) +
  geom_bar(aes(x = Class, fill = Employment), position = "fill") +
  scale_x_discrete(limits = c("Freshman","Sophomore", "Junior","Senior"))

#Values on bars 2#

ggplot(student, aes(x = Class, fill = Employment)) +
  geom_bar(position = "fill") +
  geom_text(aes(label = after_stat(count)), size = 3,
            stat = "count", position = position_fill(vjust = 0.5))

# Install CGPfunctions only when it is missing. An unconditional
# install.packages() call runs on every knit and warns when the package is
# already loaded.
if (!requireNamespace("CGPfunctions", quietly = TRUE)) {
  install.packages("CGPfunctions")
}

library(CGPfunctions)
library(ggplot2)

CGPfunctions::PlotXTabs2(
  data = student,
  y = Gender,
  x =  Computer, 
  results.subtitle = FALSE, 
  sample.size.label = TRUE, palette = "Set1",
  ggtheme = ggplot2::theme_bw()
) + labs(title = "Stacked Barplot of Device Usage by Gender")

# CGPfunctions is a separate package that builds its plots with ggplot2,
# so it is loaded with library() in the same way as ggplot2.
# Both x and y are categorical variables here, so a chi-square test
# can be conducted; its results are shown as the plot subtitle
# when results.subtitle = TRUE.

CGPfunctions::PlotXTabs2(
  data = student,
  y = Computer,
  x =  Class, 
  results.subtitle = TRUE, 
  sample.size.label = TRUE, palette = "Set3",
  ggtheme = ggplot2::theme_bw()
) + labs(title = "Stacked Barplot of Device Usage by Class")

# Legend customization

## Legend Position

# Scatter plot with colour and size legends placed to the right of the
# panel via the theme.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  labs(
    title = "Scatter Plot of Sepal Length vs Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  theme(legend.position = "right")

# Same scatter plot, but only the colour legend is moved to the bottom,
# using guides() instead of the theme.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  labs(
    title = "Scatter Plot of Sepal Length vs Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  guides(color = guide_legend(position = "bottom"))

# guides() is part of ggplot2. The guide for each scale can be set
# scale-by-scale with the `guide` argument of the scale function, or for
# several scales at once with guides().

# Fully customised colour legend: custom title, placed at the bottom,
# laid out horizontally, with the title to the left of the keys.
# The `position` argument of guide_legend() requires ggplot2 >= 3.5.0; in
# that version `title.position` is deprecated, so the title placement is
# passed through the guide's `theme` argument instead.
ggplot(data = iris) +
  geom_point(aes(
    x = Sepal.Length,
    y = Sepal.Width,
    col = Species,
    size = Petal.Length
  )) +
  labs(
    x = "Sepal Length",
    y = "Sepal Width",
    title = "Scatter Plot of Sepal Length vs Width"
  ) +
  guides(
    color = guide_legend(
      title = "Species Name",
      position = "bottom",
      direction = "horizontal",
      theme = theme(legend.title.position = "left"),
      reverse = FALSE
    )
  )

Hide legend for a specific attribute

# Drop only the colour (Species) legend; the size legend stays.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  guides(color = "none")

# Drop only the size (Petal.Length) legend; the colour legend stays.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  guides(size = "none")

# Drop both legends by naming each aesthetic explicitly.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  guides(color = "none", size = "none")

Hide all legends

# Remove every legend at once through the theme.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(col = Species, size = Petal.Length)) +
  theme(legend.position = "none")

Reordering levels of legend

# Horizontal percentage-filled bars of Gender within Computer; `breaks`
# sets the order in which the Gender keys are listed in the legend.
ggplot(student, aes(y = Computer, fill = Gender)) +
  geom_bar(position = "fill") +
  theme(legend.position = "bottom") +
  scale_fill_discrete(breaks = c("Male", "Female"))

Axis Customization

# install.packages("scales")
# Percentage-filled bars with the y axis labelled as percentages.
# The axis title previously read "Propotion" (typo).
ggplot(student, aes(x = Class, fill = Employment)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion") +
  scale_y_continuous(labels = scales::label_percent())

# Average Spending per Class and Gender, drawn as dodged columns.
# ungroup() removes the grouping that summarize() leaves on the result
# before the data reaches ggplot().
student %>%
  mutate(
    Class = factor(
      Class,
      levels = c("Freshman", "Sophomore", "Junior", "Senior")
    )
  ) %>%
  group_by(Class, Gender) %>%
  summarize(AvgSpending = mean(Spending)) %>%
  ungroup() %>%
  ggplot(aes(x = Gender, y = AvgSpending, fill = Class)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::label_currency(prefix = "BDT ")) +
  theme(legend.position = "bottom")

## `summarise()` has grouped output by 'Class'. You can override using the
## `.groups` argument.

# Same plot without ungroup(): the summarised data is passed to ggplot()
# while it is still grouped by Class.

student %>%
  mutate(
    Class = factor(
      Class,
      levels = c("Freshman", "Sophomore", "Junior", "Senior")
    )
  ) %>%
  group_by(Class, Gender) %>%
  summarize(AvgSpending = mean(Spending)) %>%
  ggplot(aes(x = Gender, y = AvgSpending, fill = Class)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::label_currency(prefix = "BDT ")) +
  theme(legend.position = "bottom")

## `summarise()` has grouped output by 'Class'. You can override using the
## `.groups` argument.

# Box plot

# Box plot of Spending by Class with the raw observations overlaid.
# outlier.shape = NA hides the box plot's own outlier markers: the jitter
# layer already draws every observation, so the outliers would otherwise
# appear twice (once in place, once jittered).
ggplot(data = student, aes(x = Class, y = Spending)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter()

# or, with the mapping given per layer instead of in ggplot():
ggplot(student) +
  geom_boxplot(aes(x = Class, y = Spending), outlier.shape = NA) +
  geom_jitter(aes(x = Class, y = Spending))

# The jitter geom is a convenient shortcut for
# geom_point(position = "jitter"). It adds a small amount of random
# variation to the location of each point, and is a useful way of handling
# overplotting caused by discreteness in smaller datasets.

# Box plot of Sepal.Length by Species with jittered raw points on top.
# outlier.shape = NA suppresses the box plot's outlier markers so that
# outlying observations are not drawn twice (they are already part of the
# jitter layer).
iris %>%
  ggplot(aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter() +
  labs(
    title = "Boxplot of Sepal length and Species",
    x = "Species",
    y = "Sepal length"
  ) +
  theme_bw()

Code for ggpubr:

# ggpubr box plot of Spending by Class (ordered Freshman -> Senior) with
# jittered points, pairwise t-test brackets and an overall ANOVA label
# placed at y = 50.
student %>%
  mutate(
    Class = factor(
      Class,
      levels = c("Freshman", "Sophomore", "Junior", "Senior")
    )
  ) %>%
  ggboxplot(
    x = "Class",
    y = "Spending",
    color = "Class",
    shape = "Class",
    add = "jitter",
    palette = c("#00AFBB", "#E7B800", "#FC4E07", "black")
  ) +
  theme(legend.position = "none") +
  geom_pwc(method = "t_test") +
  stat_compare_means(method = "anova", label.y = 50)

# p2: ggpubr box plot of Sepal.Length by Species with jittered points and
# pairwise t-test comparisons; the legend is hidden.
p2 <- ggboxplot(
  data = iris,
  x = "Species",
  y = "Sepal.Length",
  color = "Species",
  palette = "npg",
  add = "jitter"
) +
  theme(legend.position = "none") +
  geom_pwc(method = "t_test")

p2

# Re-label the pairwise comparisons of p2 with Bonferroni-adjusted
# p-values followed by their significance symbols.
ggadjust_pvalue(
  p2,
  label = "{p.adj.format}{p.adj.signif}",
  p.adjust.method = "bonferroni"
)

# p3: ggpubr box plot of Spending by Class (ordered Freshman -> Senior)
# with jittered points, pairwise t-tests and an overall ANOVA label.
p3 <- student %>%
  mutate(
    Class = factor(
      Class,
      levels = c("Freshman", "Sophomore", "Junior", "Senior")
    )
  ) %>%
  ggboxplot(
    x = "Class",
    y = "Spending",
    color = "Class",
    shape = "Class",
    add = "jitter",
    palette = c("#00AFBB", "#E7B800", "#FC4E07", "black")
  ) +
  theme(legend.position = "none") +
  geom_pwc(method = "t_test") +
  stat_compare_means(method = "anova", label.y = 50)

p3

# Bonferroni-adjusted labels for p3; hide.ns = TRUE is passed so that
# non-significant comparisons are hidden.
ggadjust_pvalue(
  p3,
  label = "{p.adj.format}{p.adj.signif}",
  p.adjust.method = "bonferroni",
  hide.ns = TRUE
)

esquisse add-ins

# install.packages("esquisse")
# library(esquisse)
# Histogram of Sepal.Length, one facet row per Species, with manual fill
# colours, placeholder labels and italic/bold title styling.
ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
  geom_histogram(bins = 30L) +
  scale_fill_manual(
    values = c(
      setosa = "#6C1C80",
      versicolor = "#30A19C",
      virginica = "#123B96"
    )
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    caption = "Caption",
    x = "X label",
    y = "Y label",
    fill = "Fill label"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold.italic"),
    plot.subtitle = element_text(face = "italic"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic")
  ) +
  facet_wrap(vars(Species), ncol = 1)

Combining multiple plots

## patchwork

# Compose three plots with patchwork operators: `|` puts p3 and p2 side by
# side, and `/` stacks that row on top of p.
# NOTE(review): `p` is not defined in this part of the file; presumably it
# is a ggplot object created earlier - confirm.
library(patchwork)
(p3 | p2) /
  p

See more: https://patchwork.data-imaginist.com/articles/patchwork.html

## ggarrange

# ggarrange() is provided by ggpubr (loaded at the top of the file), not
# by gridExtra; gridExtra's counterpart is grid.arrange(). The calls are
# namespace-qualified so they cannot be masked by another package that
# also exports a ggarrange() function.
# Layout: p on the first row, p2 and p3 side by side on the second row.
ggpubr::ggarrange(
  p,
  ggpubr::ggarrange(p2, p3, ncol = 2),
  ncol = 1,
  nrow = 2
)

Integration of Plotly in ggplot2

# Convert an existing ggplot object into an interactive plotly widget.
plotly::ggplotly(p)

# Build a box plot with jittered points, store it as p4, then convert it.
p4 <- ggplot(student, aes(x = Class, y = Spending)) +
  geom_boxplot() +
  geom_jitter()
p4 %>% plotly::ggplotly()

Data Visualization with ggplot and Rmarkdown

Mohammad Sabbir Hossain Shah

2025-03-30 16:43:42.301666