PROGRAM 15

Author

PRACHETAN MS

Objective

Create **multiple histograms** using ggplot2::facet_wrap() to visualize how a variable (e.g., Sepal.Length) is distributed across different groups (e.g., Species) in a built-in R dataset.

 # Load the ggplot2 package
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.5.3

Step 1: Load and Explore the Dataset:

# Make the built-in iris data frame available in the workspace
data("iris")

# Print the first six observations as a quick sanity check
head(iris, n = 6L)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

Step 2: Create grouped histograms using facet_wrap:

# Faceted histograms: one panel of Sepal.Length counts per species
ggplot(data = iris, mapping = aes(x = Sepal.Length)) +
  geom_histogram(
    binwidth = 0.3,
    fill = "skyblue",
    color = "black"
  ) +
  facet_wrap(vars(Species)) +
  labs(
    title = "Distribution of Sepal Length by Species",
    x = "Sepal Length(cm)",
    y = "Frequency"
  ) +
  theme_minimal()

Program

  1. Develop an R function to draw density curves representing the probability density function of a continuous variable, with separate curves for each group, using ggplot2.

Step 1 : Load Required Library

# Load the ggplot2 for plotting
library(ggplot2)
#' Draw per-group density curves for a continuous variable
#'
#' Builds a ggplot2 density plot of one column of `data`, with a separate
#' semi-transparent curve (outline and fill) for every level of a grouping
#' column.
#'
#' @param data A data frame containing both columns.
#' @param continous_var Name of the column mapped to the x axis, as a string.
#'   (The parameter spelling is kept as-is so existing calls keep working.)
#' @param group_var Name of the column mapped to both colour and fill, as a
#'   string.
#' @param fill_colors Optional vector of colours passed as `values` to
#'   `scale_fill_manual()` and `scale_color_manual()`. `NULL` (the default)
#'   keeps the default ggplot2 palette.
#' @return A ggplot object.
plot_density_by_group <- function(data, continous_var, group_var,
                                  fill_colors = NULL) {

  # Check if the specified columns exist
  if (!(continous_var %in% names(data)) || !(group_var %in% names(data))) {
    stop("Invalid column names. Make sure both variables exist in the dataset.")
  }

  # Map the columns by name through the `.data` pronoun; this replaces the
  # deprecated aes_string() interface.
  p <- ggplot(data, aes(x = .data[[continous_var]],
                        color = .data[[group_var]],
                        fill = .data[[group_var]])) +
    geom_density(alpha = 0.4) +
    labs(title = paste("Density plot of", continous_var, "by", group_var),
         x = continous_var,
         y = "Density",
         # Name the legends after the grouping column explicitly so the
         # title does not depend on how the `.data[[...]]` expression is
         # deparsed.
         color = group_var,
         fill = group_var) +
    theme_minimal()

  # Apply the custom palette to both outline and fill if provided
  if (!is.null(fill_colors)) {
    p <- p +
      scale_fill_manual(values = fill_colors) +
      scale_color_manual(values = fill_colors)
  }

  p
}
# Default palette: sepal length densities, one curve per species
plot_density_by_group(
  data = iris,
  continous_var = "Sepal.Length",
  group_var = "Species"
)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.

# Named palette: one colour per iris species
custom_colors <- c(
  setosa = "steelblue",
  versicolor = "forestgreen",
  virginica = "darkorange"
)

# One density plot per measurement, all drawn with the custom palette
plot_density_by_group(
  data = iris, continous_var = "Petal.Length",
  group_var = "Species", fill_colors = custom_colors
)

plot_density_by_group(
  data = iris, continous_var = "Sepal.Length",
  group_var = "Species", fill_colors = custom_colors
)

plot_density_by_group(
  data = iris, continous_var = "Petal.Width",
  group_var = "Species", fill_colors = custom_colors
)

plot_density_by_group(
  data = iris, continous_var = "Sepal.Width",
  group_var = "Species", fill_colors = custom_colors
)

Objective

To generate a basic box plot using ggplot2,enhanced with notches and outliers, and grouped by a categorical variable using an in built dataset in R.

Step 1: Load required package

We use the ggplot2 package for data visualization

#install.packages("ggplot2") # Uncomment if needed
library(ggplot2)

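Step 2: Use a built-in dataset. We will use the built-in iris dataset. This dataset contains 150 observations of four flower measurements (sepal and petal length and width) together with the species of each flower.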

# Attach the iris data and show its first six rows
data("iris")
head(iris, n = 6L)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
# Compact overview of column types and dimensions
str(object = iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Step 3: Create a notched box plot grouped by species. We now create a box plot for Sepal.Length, grouped by Species. We’ll enhance the plot using notches to show the confidence interval around the median.

# Empty canvas: only the axes are set up, no geometry layer yet
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length))

# Notched box plot of sepal length per species; outliers drawn as red dots
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(
    fill = "skyblue",
    alpha = 0.7,
    notch = TRUE,
    notchwidth = 0.6,
    outlier.shape = 16,
    outlier.colour = "red"
  )

# Same notched box plot, now with titles, axis labels and a minimal theme
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(
    fill = "skyblue",
    alpha = 0.7,
    notch = TRUE,
    notchwidth = 0.6,
    outlier.shape = 16,
    outlier.colour = "red"
  ) +
  labs(
    x = "Species",
    y = "Sepal.Length (cm)",
    title = "Sepal length distribution by iris Species",
    subtitle = "Box plot with notches and outlier highlighting"
  ) +
  theme_minimal()

  • Box plot: each box summarizes the distribution of Sepal.Length for a species, showing the interquartile range (IQR), median, and potential outliers.

  • Notches: The notches give a rough 95% confidence interval around the median. If the notches of two boxes do not overlap, the medians are likely to be significantly different.

  • Outliers: points that fall more than 1.5 x IQR beyond the quartiles are considered outliers and shown in red.

  • grouping: the plot groups values based on the categorical variable species,helping compare between groups.

  • Aesthetics: theme_minimal() provides a clean background, while colors and transparency make the plot easier to read.

#Box plots: six values are usually displayed: the lowest value, the lower quartile (Q1), the median (Q2), the upper quartile (Q3), the highest value, and the mean.

##Percentile: the sample 100p-th percentile is a value such that at least 100p% of the observations are at or below this value, and at least 100(1 - p)% are at or above this value.

##The following rule simplifies the calculation of sample percentiles. To calculate the sample 100p-th percentile: 1. Order the n observations from smallest to largest. 2. Determine the product np. If np is not an integer, round it up to the next integer and find the corresponding ordered value. If np is an integer, say k, calculate the mean of the kth and (k+1)st ordered observations.

136 143 147 151 158 160 161 163 165 167 173 174 181 181 185 188 190 205

# Notched box plot of petal length per species.
# The y aesthetic and the title now use Petal.Length so that the plotted
# data matches the "Petal.Length (cm)" axis label (previously the plot
# still showed Sepal.Length under that label).
ggplot(iris, aes(x = Species, y = Petal.Length)) +
  geom_boxplot(
    notch = TRUE,            # notches around the median
    notchwidth = 0.6,
    outlier.colour = "red",  # highlight outliers
    outlier.shape = 16,
    fill = "skyblue",
    alpha = 0.7
  ) +
  labs(
    title = "Petal length distribution by iris Species",
    subtitle = "Box plot with notches and outlier highlighting",
    x = "Species",
    y = "Petal.Length (cm)"
  ) +
  theme_minimal()

# Notched box plot of sepal width per species.
# The y aesthetic and the title now use Sepal.Width so that the plotted
# data matches the "Sepal.Width (cm)" axis label (previously the plot
# still showed Sepal.Length under that label).
ggplot(iris, aes(x = Species, y = Sepal.Width)) +
  geom_boxplot(
    notch = TRUE,            # notches around the median
    notchwidth = 0.6,
    outlier.colour = "red",  # highlight outliers
    outlier.shape = 16,
    fill = "skyblue",
    alpha = 0.7
  ) +
  labs(
    title = "Sepal width distribution by iris Species",
    subtitle = "Box plot with notches and outlier highlighting",
    x = "Species",
    y = "Sepal.Width (cm)"
  ) +
  theme_minimal()

# Notched box plot of petal width per species.
# The y aesthetic and the title now use Petal.Width so that the plotted
# data matches the "Petal.Width (cm)" axis label (previously the plot
# still showed Sepal.Length under that label).
ggplot(iris, aes(x = Species, y = Petal.Width)) +
  geom_boxplot(
    notch = TRUE,            # notches around the median
    notchwidth = 0.6,
    outlier.colour = "red",  # highlight outliers
    outlier.shape = 16,
    fill = "skyblue",
    alpha = 0.7
  ) +
  labs(
    title = "Petal width distribution by iris Species",
    subtitle = "Box plot with notches and outlier highlighting",
    x = "Species",
    y = "Petal.Width (cm)"
  ) +
  theme_minimal()

Objective

To generate a violin plot using ggplot2, grouped by a categorical variable, using a built-in dataset in R.

Step 1: Load required package

We use the ggplot2 package for data visualization

#install.packages("ggplot2") # Uncomment if needed
library(ggplot2)

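Step 2: Use a built-in dataset. We will use the built-in iris dataset. This dataset contains 150 observations of four flower measurements (sepal and petal length and width) together with the species of each flower.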

# Attach the iris data and show its first six rows
data("iris")
head(iris, n = 6L)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
# Compact overview of column types and dimensions
str(object = iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Step 3: Create a Violin Plot

We create a violin plot for Sepal.Length grouped by Species.

# Empty canvas: only the axes are set up, no geometry layer yet
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length))

# Basic violin plot of sepal length per species.
# Only fill and alpha are passed: notch, notchwidth, outlier.colour and
# outlier.shape are reported as unknown parameters by geom_violin() and
# ignored, so they are left out.
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_violin(
    fill = "skyblue",
    alpha = 0.7
  )
Warning in geom_violin(notch = TRUE, notchwidth = 0.6, outlier.colour = "red", : Ignoring unknown parameters: `notch`, `notchwidth`, `outlier.colour`, and
`outlier.shape`

# Labelled violin plot of sepal length per species.
# Subtitle and y-axis label now describe what is actually drawn (a violin
# plot of Sepal.Length) instead of the text copied from the box plot of
# another variable.
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_violin(
    fill = "green", alpha = 0.7) +
  labs(
    title = "Sepal length distribution by iris Species",
    subtitle = "Violin plot",
    x = "Species",
    y = "Sepal.Length (cm)"
  ) +
  theme_minimal()

# Violin plot filled by species, so a legend is drawn for the fill colour
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  geom_violin(mapping = aes(fill = Species), alpha = 0.7) +
  labs(
    x = "Species",
    y = "Sepal Length (cm)",
    fill = "Species",   # legend title
    title = "Sepal Length Distribution by Iris Species",
    subtitle = "Violin Plot with Legend"
  ) +
  theme_minimal()

Write an r program to create multiple dot plots for grouped data comparing the distributions of variables across different categories using ggplot2 lib.

Objective

To create multiple dot plots for grouped data and compare the distribution of tooth length across different supplement types and dosage levels using ggplot2 and dplyr.

Step 1: Load required packages

We use ggplot2 for visualization and dplyr for data manipulation.

#install.packages("ggplot2") # Uncomment if needed
#install.packages("dplyr")  # Uncomment if needed

library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Step 2: Use an inbuilt dataset

We will use the built-in ToothGrowth dataset. It contains:

# Columns of ToothGrowth:
#   len  → Tooth length
#   supp → Supplement type (VC or OJ)
#   dose → Dosage level

# Attach the data and show its first six rows
data("ToothGrowth")
head(ToothGrowth, n = 6L)
   len supp dose
1  4.2   VC  0.5
2 11.5   VC  0.5
3  7.3   VC  0.5
4  5.8   VC  0.5
5  6.4   VC  0.5
6 10.0   VC  0.5
# Compact overview of column types and dimensions
str(object = ToothGrowth)
'data.frame':   60 obs. of  3 variables:
 $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
 $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
 $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...

Step 3: Data preprocessing

Convert categorical variables into factors for better grouping.

# Turn both grouping columns into factors in a single across() call
ToothGrowth <- ToothGrowth %>%
  mutate(across(c(supp, dose), as.factor))

Step 4: Create basic dot plot

We first create a simple dot plot of tooth length grouped by supplement type.

# Empty canvas: supplement type on x, tooth length on y, no layer yet
ggplot(data = ToothGrowth, mapping = aes(x = supp, y = len))

# Centred dot plot of tooth length per supplement; dots are binned along y
ggplot(
  data = ToothGrowth,
  mapping = aes(x = supp, y = len, color = supp)
) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8)
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Step 5: Create multiple dot plots using facets

We now create multiple dot plots based on dosage levels.

# Same dot plot, split into one panel per dosage level
ggplot(
  data = ToothGrowth,
  mapping = aes(x = supp, y = len, color = supp)
) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8) +
  facet_wrap(vars(dose))
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Step 6: Add labels and theme

Enhance the plot with titles and clean styling.

# Faceted dot plot with titles, axis labels and a minimal theme
ggplot(
  data = ToothGrowth,
  mapping = aes(x = supp, y = len, color = supp)
) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8) +
  facet_wrap(vars(dose)) +
  labs(
    x = "Supplement Type",
    y = "Tooth Length",
    title = "Multiple Dot Plots of Tooth Growth",
    subtitle = "Comparison across supplement types and dosage levels"
  ) +
  theme_minimal()
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Develop an R script to calculate and visualize a correlation matrix for a given dataset, with color-coded cells indicating the strength and direction of correlations, using ggplot2’s geom_tile() function.

# Load required Libraries
library(ggplot2)
library(tidyr)
library(dplyr)

Dataset

we use the built-in mtcars dataset

# Show the first six rows of mtcars
head(mtcars, n = 6L)
                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Make the built-in mtcars data available
data("mtcars")

# Pairwise correlations between all columns, then print the matrix
cor_matrix <- cor(x = mtcars)
cor_matrix
            mpg        cyl       disp         hp        drat         wt
mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
            qsec         vs          am       gear        carb
mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000
# Reshape the correlation matrix to long format (Var1, Var2, Freq) so that
# each cell becomes one row that ggplot2 can draw as a tile
cor_df <- cor_matrix |>
  as.table() |>
  as.data.frame()

head(cor_df, n = 6L)
  Var1 Var2       Freq
1  mpg  mpg  1.0000000
2  cyl  mpg -0.8521620
3 disp  mpg -0.8475514
4   hp  mpg -0.7761684
5 drat  mpg  0.6811719
6   wt  mpg -0.8676594

Explanation:

  • cor(mtcars) computes the pairwise correlations between all columns.
  • `as.table()` flattens the matrix into a long-format table.
  • The result has 3 columns: Var1, Var2, and the correlation value (Freq).

## Step 2: Visualize Using ggplot2::geom_tile
# Plain heatmap: one tile per variable pair, filled by correlation value
ggplot(data = cor_df, mapping = aes(x = Var1, y = Var2)) +
  geom_tile(mapping = aes(fill = Freq), color = "white")

# Heatmap with a diverging palette: blue = negative, white = zero,
# red = positive correlation.
ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(color = "white") + # Draw tile borders
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    # Spell the argument out in full; `limit` only worked through partial
    # argument matching.
    limits = c(-1, 1),
    name = "Correlation"
  )

# Final heatmap: diverging palette, rounded correlation values printed in
# every tile, and rotated x-axis labels.
ggplot(cor_df, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile(color = "white") + # Draw tile borders
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    # Spell the argument out in full; `limit` only worked through partial
    # argument matching.
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  geom_text(aes(label = round(Freq, 2)), size = 3) + # Show values
  theme_minimal() +
  labs(
    title = "Correlation Matrix (mtcars)",
    x = "", y = ""
  ) +
  # Rotate variable names on the x axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1))