PROGRAM 8

Author

JAGADISH J M

  1. Develop an R program to quickly explore a given dataset, including categorical analysis using the group by command, and visualize the findings using ggplot2 features
# Load required libraries
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
#' Explore a dataset by group: summary table plus box plot
#'
#' Prints the per-group count, mean, median and standard deviation of
#' `numerical_var`, then prints a box plot of `numerical_var` split (and
#' filled) by `categorical_var`.
#'
#' @param data A data frame containing both columns.
#' @param categorical_var Name (single string) of the grouping column.
#' @param numerical_var Name (single string) of the numeric column.
#' @return The ggplot object, invisibly (it is also printed).
explore_dataset <- function(data, categorical_var, numerical_var) {
  # Fail early with a readable message instead of an obscure tidy-eval error
  stopifnot(
    is.data.frame(data),
    is.character(categorical_var), length(categorical_var) == 1,
    is.character(numerical_var), length(numerical_var) == 1
  )
  missing_cols <- setdiff(c(categorical_var, numerical_var), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Summary statistics per category; column names arrive as strings, so they
  # are looked up with all_of() / the .data pronoun rather than !!sym()
  summary_stats <- data %>%
    group_by(across(all_of(categorical_var))) %>%
    summarise(
      Count = n(),
      Mean = mean(.data[[numerical_var]], na.rm = TRUE),
      Median = median(.data[[numerical_var]], na.rm = TRUE),
      SD = sd(.data[[numerical_var]], na.rm = TRUE),
      .groups = "drop"
    )
  print(summary_stats)

  # Visualization using ggplot2
  p <- ggplot(
    data,
    aes(
      x = .data[[categorical_var]],
      y = .data[[numerical_var]],
      fill = .data[[categorical_var]]
    )
  ) +
    geom_boxplot(outlier.color = "red", alpha = 0.7) +
    theme_minimal() +
    labs(
      title = paste("Distribution of", numerical_var, "by", categorical_var),
      x = categorical_var,
      y = numerical_var,
      # Set the legend title explicitly so it shows the plain column name
      fill = categorical_var
    )

  print(p)
  invisible(p)
}

# Example usage
# Simulated data: three categories, each with 100 normally distributed values
data <- data.frame(
  Category = rep(LETTERS[1:3], each = 100),
  Values = c(
    rnorm(n = 100, mean = 10, sd = 3),
    rnorm(n = 100, mean = 20, sd = 4),
    rnorm(n = 100, mean = 15, sd = 5)
  )
)

# Run the exploration: prints the summary table and draws the box plot
explore_dataset(
  data,
  categorical_var = "Category",
  numerical_var = "Values"
)
# A tibble: 3 × 5
  Category Count  Mean Median    SD
  <chr>    <int> <dbl>  <dbl> <dbl>
1 A          100  9.92   9.86  3.05
2 B          100 19.6   19.3   3.81
3 C          100 15.1   16.0   4.46

  2. Write an R script to create a scatter plot, incorporating categorical analysis through color-coded data points representing different groups, using ggplot2.
# Load required library
library(ggplot2)

# Synthetic example data (swap in your own dataset here):
# three categories with 100 (x, y) pairs each
data <- data.frame(
  Category = rep(LETTERS[1:3], each = 100),
  X_values = c(
    rnorm(n = 100, mean = 5, sd = 2),
    rnorm(n = 100, mean = 10, sd = 3),
    rnorm(n = 100, mean = 15, sd = 4)
  ),
  Y_values = c(
    rnorm(n = 100, mean = 20, sd = 5),
    rnorm(n = 100, mean = 25, sd = 6),
    rnorm(n = 100, mean = 30, sd = 7)
  )
)

# Build the scatter plot layer by layer; point colour encodes the category
p <- ggplot(
  data = data,
  mapping = aes(x = X_values, y = Y_values, color = Category)
)
p <- p + geom_point(size = 3, alpha = 0.7)
p <- p + theme_minimal()
p <- p + labs(
  title = "Scatter Plot with Categorical Analysis",
  x = "X Values",
  y = "Y Values",
  color = "Category"
)
p <- p + scale_color_brewer(palette = "Set2")

# Render the finished plot
print(p)

  3. Implement an R function to generate a line graph depicting the trend of a time-series dataset, with separate lines for each group, utilizing ggplot2’s group aesthetic

# Load required library
library(ggplot2)

#' Plot a time series as a line graph with one line per group
#'
#' Draws `value_var` against `time_var`, with a separate coloured line (plus
#' points) for every level of `group_var`, and prints the plot.
#'
#' @param data A data frame containing the three columns.
#' @param time_var Name (single string) of the time column (x-axis).
#' @param value_var Name (single string) of the value column (y-axis).
#' @param group_var Name (single string) of the grouping column.
#' @return The ggplot object, invisibly (it is also printed).
plot_time_series <- function(data, time_var, value_var, group_var) {
  # All three arguments must be column names; check before plotting so that
  # a typo produces a clear error rather than a failure at render time
  vars <- c(time_var, value_var, group_var)
  stopifnot(is.data.frame(data), is.character(vars), length(vars) == 3)
  missing_cols <- setdiff(vars, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # The .data pronoun replaces the deprecated aes_string()
  p <- ggplot(
    data,
    aes(
      x = .data[[time_var]],
      y = .data[[value_var]],
      color = .data[[group_var]],
      group = .data[[group_var]]
    )
  ) +
    geom_line(linewidth = 1.2) +  # `linewidth` replaces deprecated `size`
    geom_point(size = 3, alpha = 0.7) +  # Points make the trend clearer
    theme_minimal() +
    labs(
      title = paste("Time-Series Trend of", value_var, "by", group_var),
      x = "Time",
      y = "Values",
      color = "Group"
    )

  # Print the plot
  print(p)
  invisible(p)
}

# Example usage
# Sample time-series dataset: three groups, each observed at the same 12 time
# points. Rows are laid out group by group so that each block of 12 simulated
# values (means 50, 60 and 55) belongs to exactly one group.
data <- data.frame(
  Time = rep(1:12, times = 3),  # Time variable (e.g., months), once per group
  Value = c(rnorm(12, 50, 5), rnorm(12, 60, 5), rnorm(12, 55, 5)),  # A, B, C
  Group = rep(c("A", "B", "C"), each = 12)  # Groups (e.g., regions)
)

# Call the function to generate the line graph
plot_time_series(data, "Time", "Value", "Group")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

  4. Develop a script in R to produce a bar graph displaying the frequency distribution of categorical data in a given dataset, grouped by a specific variable, using ggplot2.

    # Install ggplot2 only when it is missing: reinstalling a package that is
    # already in use is refused with a warning and slows down every run
    if (!requireNamespace("ggplot2", quietly = TRUE)) {
      install.packages("ggplot2")
    }
    Warning: package 'ggplot2' is in use and will not be installed
    # Load ggplot2 library
    library(ggplot2)
    # Load the ggplot2 library
    library(ggplot2)
# Load required library
library(ggplot2)

#' Bar graph of the frequency distribution of a categorical variable
#'
#' Counts the rows for each level of `categorical_var` and prints a bar
#' chart. When `group_var` is given, bars are filled by that variable and
#' placed side by side (dodged).
#'
#' @param data A data frame.
#' @param categorical_var Name (single string) of the column on the x-axis.
#' @param group_var Optional name (single string) of the column used for the
#'   bar fill; `NULL` (the default) draws an ungrouped chart.
#' @return The ggplot object, invisibly (it is also printed).
generate_bar_graph <- function(data, categorical_var, group_var = NULL) {
  has_group <- !is.null(group_var)
  stopifnot(
    is.data.frame(data),
    is.character(categorical_var), length(categorical_var) == 1,
    !has_group || (is.character(group_var) && length(group_var) == 1)
  )
  missing_cols <- setdiff(c(categorical_var, group_var), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # The .data pronoun replaces the deprecated aes_string(); the fill mapping
  # and the "by <group>" title suffix are only added when a group is supplied
  if (has_group) {
    mapping <- aes(x = .data[[categorical_var]], fill = .data[[group_var]])
    plot_title <- paste(
      "Frequency Distribution of", categorical_var, "by", group_var
    )
  } else {
    mapping <- aes(x = .data[[categorical_var]])
    plot_title <- paste("Frequency Distribution of", categorical_var)
  }

  p <- ggplot(data, mapping) +
    geom_bar(position = "dodge", color = "black", alpha = 0.8) +
    theme_minimal() +
    labs(
      title = plot_title,
      x = categorical_var,
      y = "Frequency"
    )
  if (has_group) {
    p <- p + labs(fill = "Group")
  }

  print(p)
  invisible(p)
}

# Example usage
# Random sample data: 300 rows, each assigned a category and a group
data <- data.frame(
  Category = sample(LETTERS[1:3], size = 300, replace = TRUE),
  Group = sample(c("X", "Y"), size = 300, replace = TRUE)
)

# Draw the grouped frequency bar chart
generate_bar_graph(
  data,
  categorical_var = "Category",
  group_var = "Group"
)

5. Implement an R program to create a histogram illustrating the distribution of a continuous variable, with overlays of density curves for each group, using ggplot2

Step 1: Load Required Library

{r}

library(ggplot2)

# Load the necessary library
library(ggplot2)

Step 2: Explore the Inbuilt Dataset

# Use the built-in 'iris' dataset (150 observations of 5 variables)
# 'Petal.Length' is a continuous (numeric) variable
# 'Species' is a categorical grouping variable (a factor with 3 levels)

# Print a compact overview of every column and its type
str(iris)  # Shows the structure of the dataset
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris) # View the first few rows of the data
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

Step 3: Create Histogram with Group-wise Density Curves

Step 3.1: Initialize the ggplot with aesthetic mappings

# Base plot object built from the iris data:
# Petal.Length goes on the x-axis and Species (the grouping variable) sets
# the fill colour

p <- ggplot(
  iris,
  mapping = aes(x = Petal.Length, fill = Species)
)
p

Explanation:

This initializes the plot and tells ggplot to map:

Petal.Length (continuous variable) to the x-axis

Species (categorical) to fill aesthetic to distinguish groups

Step 3.2: Add Histogram Layer

# Add a histogram layer scaled to density rather than raw counts
# after_stat(density) replaces the deprecated `..density..` notation

p <- p + geom_histogram(
  aes(y = after_stat(density)),
  alpha = 0.4,            # Set transparency
  position = "identity",  # Overlay the group histograms (no stacking)
  bins = 30               # Number of bins
)
p
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.

Explanation:

aes(y=..density..) normalizes the histogram to density

alpha= 0.4 makes bars semi-transparent so overlaps are visible

position = "identity" overlays the histograms of the different groups on top of one another instead of stacking them

bins =30 controls histogram resolution

Step 3.3: Add Density Curve Layer

# Overlay density curves for each group
# The plot-level mapping fill = Species would otherwise be inherited here and
# draw opaque filled areas over the histogram, so the fill is switched off
# and only the outline curves are drawn

p <- p +
  geom_density(
    aes(color = Species),  # Line color by group
    fill = NA,             # Curves only; keep the histogram visible
    linewidth = 1.2        # Line thickness (`size` is deprecated for lines)
  )

Explanation: This overlays smooth density curves for each species using color. The aes(color = Species) ensures each curve is colored by group.

Step 3.4: Add Labels and Theme

# Label the plot (title and both axes), then switch to the minimal theme

p <- p +
  labs(
    title = "Distribution of Petal Length with Group-wise Density Curves",
    x = "Petal Length",
    y = "Density"
  )
p <- p + theme_minimal()

p

Explanation:

labs() adds a title and axis labels

theme_minimal() applies a clean, modern plot style

Step 3.5: Display the Plot

Summary

Used built-in iris dataset

Visualized Petal.Length as histogram

Grouped and color-coded by Species

Overlaid group-wise density curves for better interpretation

  6. Write an R script to construct a box plot showcasing the distribution of a continuous variable, grouped by a categorical variable, using ggplot2’s fill aesthetic.
library(ggplot2)
# Load required library
library(ggplot2)

#' Bar graph of the frequency distribution of a categorical variable
#'
#' Counts the rows for each level of `categorical_var` and prints a bar
#' chart. When `group_var` is given, bars are filled by that variable and
#' placed side by side (dodged).
#'
#' @param data A data frame.
#' @param categorical_var Name (single string) of the column on the x-axis.
#' @param group_var Optional name (single string) of the column used for the
#'   bar fill; `NULL` (the default) draws an ungrouped chart.
#' @return The ggplot object, invisibly (it is also printed).
generate_bar_graph <- function(data, categorical_var, group_var = NULL) {
  has_group <- !is.null(group_var)
  stopifnot(
    is.data.frame(data),
    is.character(categorical_var), length(categorical_var) == 1,
    !has_group || (is.character(group_var) && length(group_var) == 1)
  )
  missing_cols <- setdiff(c(categorical_var, group_var), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # The .data pronoun replaces the deprecated aes_string(); the fill mapping
  # and the "by <group>" title suffix are only added when a group is supplied
  if (has_group) {
    mapping <- aes(x = .data[[categorical_var]], fill = .data[[group_var]])
    plot_title <- paste(
      "Frequency Distribution of", categorical_var, "by", group_var
    )
  } else {
    mapping <- aes(x = .data[[categorical_var]])
    plot_title <- paste("Frequency Distribution of", categorical_var)
  }

  p <- ggplot(data, mapping) +
    geom_bar(position = "dodge", color = "black", alpha = 0.8) +
    theme_minimal() +
    labs(
      title = plot_title,
      x = categorical_var,
      y = "Frequency"
    )
  if (has_group) {
    p <- p + labs(fill = "Group")
  }

  print(p)
  invisible(p)
}

#' Box plot of a continuous variable grouped by a categorical variable
#'
#' Prints one box per level of `categorical_var`, filled by that same
#' variable. The legend is hidden because the x-axis already names the groups.
#'
#' @param data A data frame containing both columns.
#' @param continuous_var Name (single string) of the numeric column (y-axis).
#' @param categorical_var Name (single string) of the grouping column
#'   (x-axis and fill).
#' @return The ggplot object, invisibly (it is also printed).
generate_box_plot <- function(data, continuous_var, categorical_var) {
  stopifnot(
    is.data.frame(data),
    is.character(continuous_var), length(continuous_var) == 1,
    is.character(categorical_var), length(categorical_var) == 1
  )
  missing_cols <- setdiff(c(continuous_var, categorical_var), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # The .data pronoun replaces the deprecated aes_string()
  p <- ggplot(
    data,
    aes(
      x = .data[[categorical_var]],
      y = .data[[continuous_var]],
      fill = .data[[categorical_var]]
    )
  ) +
    geom_boxplot(alpha = 0.7) +
    theme_minimal() +
    labs(
      title = paste("Distribution of", continuous_var, "by", categorical_var),
      x = categorical_var,
      y = continuous_var
    ) +
    theme(legend.position = "none")

  print(p)
  invisible(p)
}

# Example usage
# Random sample data: 300 rows with a category, a group and a numeric value
# (the first 150 values are drawn around 50, the last 150 around 60)
data <- data.frame(
  Category = sample(LETTERS[1:3], size = 300, replace = TRUE),
  Group = sample(c("X", "Y"), size = 300, replace = TRUE),
  Value = c(
    rnorm(n = 150, mean = 50, sd = 10),
    rnorm(n = 150, mean = 60, sd = 15)
  )
)

# Frequency bar chart of Category, split by Group
generate_bar_graph(
  data,
  categorical_var = "Category",
  group_var = "Group"
)

# Box plot of Value for each Category
generate_box_plot(
  data,
  continuous_var = "Value",
  categorical_var = "Category"
)

  7. Develop a function in R to plot a function curve based on a mathematical equation provided as input, with different curve styles for each group, using ggplot2.

    library(ggplot2)
    
    #' Plot the curve of a mathematical expression, one styled line per group
    #'
    #' Evaluates `equation` on 500 evenly spaced x-values across `x_range` and
    #' prints a line plot in which every entry of `groups` gets its own colour
    #' and line type. The same equation is used for every group.
    #'
    #' @param equation A single string holding an R expression in `x`,
    #'   e.g. `"sin(x)"`. It is parsed and evaluated as R code, so it must
    #'   never come from untrusted input.
    #' @param x_range Numeric vector of length 2: lower and upper x limits.
    #' @param groups Vector of group labels; one curve is drawn per label.
    #' @return The ggplot object, invisibly (it is also printed).
    plot_function_curve <- function(equation, x_range = c(-10, 10), groups = c("Group A", "Group B")) {
      # Validate up front so mistakes surface as clear messages
      if (!is.character(equation) || length(equation) != 1 || is.na(equation)) {
        stop("`equation` must be a single character string", call. = FALSE)
      }
      if (!is.numeric(x_range) || length(x_range) != 2 ||
            !all(is.finite(x_range))) {
        stop("`x_range` must be two finite numbers", call. = FALSE)
      }

      # Generate x-values
      x_values <- seq(x_range[1], x_range[2], length.out = 500)

      # Create data frame: the full x grid repeated once per group
      data <- data.frame(
        x = rep(x_values, length(groups)),
        Group = rep(groups, each = length(x_values))
      )

      # Apply the mathematical equation to the x-values.
      # SECURITY NOTE: eval(parse()) executes arbitrary R code from the string.
      y_values <- eval(parse(text = equation), envir = list(x = data$x))
      # A constant such as "5" yields one value and is recycled; any other
      # length cannot be matched to the rows
      if (!(length(y_values) %in% c(1L, nrow(data)))) {
        stop(
          "`equation` must evaluate to one value per x-value (or a constant)",
          call. = FALSE
        )
      }
      data$y <- y_values

      # Create the plot
      p <- ggplot(data, aes(x = x, y = y, color = Group, linetype = Group)) +
        geom_line(linewidth = 1.2) +  # `linewidth` replaces deprecated `size`
        theme_minimal() +
        labs(
          title = paste("Function Curve for:", equation),
          x = "X",
          y = "Y",
          color = "Group",
          linetype = "Group"
        )

      print(p)
      invisible(p)
    }
    
    # Example Usage
    # Plotting sine function with two groups: only the equation is supplied, so
    # the defaults apply (x from -10 to 10, groups "Group A" and "Group B")
    plot_function_curve("sin(x)")