Iris Dataset Analysis

Author

Sammy Kyalo

Published

August 10, 2025

Iris Dataset Analysis

This notebook analyzes the famous Iris dataset using Observable JS within Quarto, featuring interactive visualizations and statistical summaries.

Introduction

The Iris dataset contains measurements of iris flowers from three species: setosa, versicolor, and virginica. Each observation includes sepal length, sepal width, petal length, and petal width measurements.

Data Loading and Setup

d3 = require("d3@7")
Plot = require("@observablehq/plot@0.6")
Inputs = require("@observablehq/inputs@0.10")

// Fetch the Iris measurements as CSV. d3.autoType is applied to every row so
// that fields are converted from text to inferred JavaScript types.
data = d3.csv(
  "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv",
  d3.autoType
)

Interactive Controls

Use the checkboxes below to choose which iris species are included in the analysis:

viewof species_filter = Inputs.checkbox(
  ["setosa", "versicolor", "virginica"],
  {
    value: ["setosa", "versicolor", "virginica"], 
    label: "Select Species to Include:",
    format: x => x.charAt(0).toUpperCase() + x.slice(1)
  }
)

// Rows whose species is currently checked in species_filter.
filtered_data = data.filter(({ species }) => species_filter.includes(species))

// Status line: number of checked species and number of rows that survived the filter
html`<p><strong>Currently analyzing:</strong> ${species_filter.length} species (${filtered_data.length} observations)</p>`

Summary Statistics

Overall Summary

Summary statistics for all numeric variables in the selected data:

// Measurement columns that the summary statistics are computed for.
numeric_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

// Calculate overall summary statistics: one row per measurement column, with
// every statistic formatted to two decimals.
//
// When no species is checked, filtered_data is empty and d3.min / d3.max /
// d3.mean / d3.median / d3.deviation all return undefined (d3.deviation also
// does so for fewer than two values). Calling .toFixed on undefined used to
// throw and break this cell and the table below, so statistics that cannot be
// computed are reported as "N/A" instead.
overall_summary = numeric_columns.map(col => {
  const value = d => d[col];
  const fmt = stat => (Number.isFinite(stat) ? stat.toFixed(2) : "N/A");
  return {
    // "sepal_length" -> "Sepal Length"
    Column: col.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()),
    Min: fmt(d3.min(filtered_data, value)),
    Max: fmt(d3.max(filtered_data, value)),
    Mean: fmt(d3.mean(filtered_data, value)),
    Median: fmt(d3.median(filtered_data, value)),
    "Std Dev": fmt(d3.deviation(filtered_data, value))
  };
})

// Render overall_summary as a table, giving each column a display title and
// an explicit width.
Inputs.table(overall_summary, {
  width: { Column: 150, Min: 80, Max: 80, Mean: 80, Median: 80, "Std Dev": 100 },
  header: {
    Column: "Variable",
    Min: "Minimum",
    Max: "Maximum",
    Mean: "Mean",
    Median: "Median",
    "Std Dev": "Std Deviation"
  }
})

Summary by Species

Mean values for each measurement grouped by species:

// One summary row per selected species: capitalized name, row count, and the
// mean of each measurement formatted to two decimals.
grouped_data = [...d3.group(filtered_data, (row) => row.species)].map(([name, rows]) => {
  // Mean of one measurement column within this species, as a 2-decimal string.
  const avg = (field) => d3.mean(rows, (row) => row[field]).toFixed(2);
  return {
    Species: name.charAt(0).toUpperCase() + name.slice(1),
    Count: rows.length,
    "Sepal Length": avg("sepal_length"),
    "Sepal Width": avg("sepal_width"),
    "Petal Length": avg("petal_length"),
    "Petal Width": avg("petal_width")
  };
})

// Render grouped_data as a table; the header map supplies the display title
// for each column.
Inputs.table(grouped_data, {
  header: {
    Species: "Species",
    Count: "N",
    "Sepal Length": "Mean Sepal Length (cm)",
    "Sepal Width": "Mean Sepal Width (cm)",
    "Petal Length": "Mean Petal Length (cm)",
    "Petal Width": "Mean Petal Width (cm)"
  }
})

Data Visualizations

Scatterplot: Sepal Measurements

Interactive scatterplot showing the relationship between sepal length and width, colored by species:

// Scatterplot of sepal length (x) against sepal width (y), one dot per row of
// filtered_data, filled by species.
Plot.plot({
  title: "Iris Sepal Measurements by Species",
  subtitle: `Based on ${filtered_data.length} observations`,
  width: 650,
  height: 450,
  marginLeft: 70,
  marginBottom: 70,
  grid: true,
  x: { label: "Sepal Length (cm)", nice: true },
  y: { label: "Sepal Width (cm)", nice: true },
  // The color domain is listed explicitly rather than inferred from the
  // currently selected rows.
  color: {
    scheme: "category10",
    domain: ["setosa", "versicolor", "virginica"],
    legend: true
  },
  marks: [
    Plot.dot(filtered_data, {
      x: "sepal_length",
      y: "sepal_width",
      r: 6,
      fill: "species",
      stroke: "white",
      strokeWidth: 1,
      tip: true
    })
  ]
})

Relationship between sepal length and width by species

Histogram: Sepal Length Distribution

Distribution of sepal length measurements across the selected species:

// Histogram of sepal_length: rows of filtered_data are binned along x and
// counted, with bars filled by species.
Plot.plot({
  title: "Distribution of Sepal Length by Species",
  subtitle: "Stacked histogram showing frequency of measurements",
  width: 650,
  height: 400,
  marginLeft: 70,
  marginBottom: 70,
  x: { label: "Sepal Length (cm)", nice: true },
  y: { label: "Frequency", nice: true },
  // The color domain is listed explicitly rather than inferred from the
  // currently selected rows.
  color: {
    scheme: "category10",
    domain: ["setosa", "versicolor", "virginica"],
    legend: true
  },
  marks: [
    Plot.rectY(
      filtered_data,
      Plot.binX(
        { y: "count" },
        { x: "sepal_length", thresholds: 15, fill: "species", tip: true }
      )
    )
  ]
})

Distribution of sepal length by species

Boxplot: Sepal Length by Species

Boxplot showing the distribution and quartiles of sepal length for each species:

// Box plot of sepal_length with one box per species on the x axis.
Plot.plot({
  title: "Sepal Length Distribution by Species",
  subtitle: "Box plots showing median, quartiles, and outliers",
  width: 650,
  height: 400,
  marginLeft: 70,
  marginBottom: 70,
  x: {
    label: "Species",
    // Capitalize the species name shown on each tick.
    tickFormat: (name) => name.slice(0, 1).toUpperCase() + name.slice(1)
  },
  y: { label: "Sepal Length (cm)", grid: true, nice: true },
  // The color domain is listed explicitly rather than inferred from the
  // currently selected rows.
  color: {
    scheme: "category10",
    domain: ["setosa", "versicolor", "virginica"],
    legend: true
  },
  marks: [
    Plot.boxY(filtered_data, {
      x: "species",
      y: "sepal_length",
      fill: "species",
      stroke: "black",
      strokeWidth: 1.5
    })
  ]
})

Sepal length distribution by species showing quartiles and outliers

Petal vs Sepal Comparison

Petal length plotted against sepal length, with a linear regression line fitted for each species:

// Sepal length (x) against petal length (y): dots for each row of
// filtered_data, with linear regression lines stroked by species drawn on top.
Plot.plot({
  title: "Petal vs Sepal Length with Trend Lines",
  subtitle: "Linear regression lines show species-specific relationships",
  width: 650,
  height: 450,
  marginLeft: 70,
  marginBottom: 70,
  grid: true,
  x: { label: "Sepal Length (cm)", nice: true },
  y: { label: "Petal Length (cm)", nice: true },
  // The color domain is listed explicitly rather than inferred from the
  // currently selected rows.
  color: {
    scheme: "category10",
    domain: ["setosa", "versicolor", "virginica"],
    legend: true
  },
  // Mark order is kept: dots first, regression lines second.
  marks: [
    Plot.dot(filtered_data, {
      x: "sepal_length",
      y: "petal_length",
      r: 6,
      fill: "species",
      stroke: "white",
      strokeWidth: 1,
      tip: true
    }),
    Plot.linearRegressionY(filtered_data, {
      x: "sepal_length",
      y: "petal_length",
      stroke: "species",
      strokeWidth: 2
    })
  ]
})

Comparison of petal length vs sepal length by species

Correlation Analysis

Correlation matrix showing relationships between all numeric variables:

// Pearson correlation for every ordered pair of measurement columns, as a flat
// list of {x, y, correlation} records (x is the first column of the pair, y the
// second). d3.cross walks the pairs with the first column varying slowest.
correlation_data = d3.cross(numeric_columns, numeric_columns, (firstCol, secondCol) => {
  // "sepal_length" -> "Sepal Length"
  const toLabel = (key) => key.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());

  const firstMean = d3.mean(filtered_data, (d) => d[firstCol]);
  const secondMean = d3.mean(filtered_data, (d) => d[secondCol]);

  // Sum of cross products of the deviations, and each column's sum of squares.
  const crossProducts = d3.sum(
    filtered_data,
    (d) => (d[firstCol] - firstMean) * (d[secondCol] - secondMean)
  );
  const firstSquares = d3.sum(filtered_data, (d) => (d[firstCol] - firstMean) ** 2);
  const secondSquares = d3.sum(filtered_data, (d) => (d[secondCol] - secondMean) ** 2);

  return {
    x: toLabel(firstCol),
    y: toLabel(secondCol),
    correlation: crossProducts / Math.sqrt(firstSquares * secondSquares)
  };
})

// Heatmap of correlation_data: one colored cell per column pair with the
// coefficient printed on top of it.
Plot.plot({
  title: "Correlation Matrix of Iris Measurements",
  subtitle: "Values range from -1 (negative correlation) to +1 (positive correlation)",
  width: 500,
  height: 400,
  marginLeft: 100,
  marginBottom: 100,
  x: { label: null, tickRotate: 45 },
  y: { label: null },
  color: {
    scheme: "RdBu",
    symmetric: true,
    label: "Correlation coefficient",
    legend: true
  },
  marks: [
    Plot.cell(correlation_data, {
      x: "x",
      y: "y",
      fill: "correlation",
      tip: true
    }),
    Plot.text(correlation_data, {
      x: "x",
      y: "y",
      text: (d) => d.correlation.toFixed(2),
      // White text when the coefficient's magnitude exceeds 0.5, black otherwise.
      fill: (d) => (d.correlation > 0.5 || d.correlation < -0.5 ? "white" : "black"),
      fontSize: 12,
      fontWeight: "bold"
    })
  ]
})

Correlation matrix of iris measurements

Key Findings

// Number of selected observations per species.
species_counts = d3.rollup(filtered_data, v => v.length, d => d.species);

// Species with the greatest mean sepal length, or null when no rows are
// selected. d3.greatest returns undefined for an empty input, so the result is
// checked before indexing; previously an empty selection threw here. The means
// are computed with a single rollup rather than re-filtering filtered_data once
// per species.
max_sepal_species = (() => {
  const mean_sepal_length = d3.rollup(
    filtered_data,
    rows => d3.mean(rows, row => row.sepal_length),
    d => d.species
  );
  const top = d3.greatest(mean_sepal_length, ([, mean]) => mean);
  return top === undefined ? null : top[0];
})();

// Sentence describing the petal length / petal width correlation for the
// CURRENT selection. The coefficient is read from correlation_data instead of
// being hard-coded, because it changes with the species filter. It is NaN when
// it cannot be computed (e.g. nothing selected), which is reported explicitly.
petal_correlation_note = (() => {
  const r = correlation_data.find(
    d => d.x === "Petal Length" && d.y === "Petal Width"
  ).correlation;
  if (!Number.isFinite(r)) return "Not available for the current selection";
  const strength = Math.abs(r) >= 0.7 ? "highly" : Math.abs(r) >= 0.4 ? "moderately" : "weakly";
  return `Petal length and width are ${strength} correlated (r ≈ ${r.toFixed(2)})`;
})();

html`
<div style="background-color: #e8f4fd; padding: 20px; border-radius: 8px; margin: 20px 0;">
  <h3>Key Observations:</h3>
  <ul>
    <li><strong>Dataset:</strong> Currently analyzing ${filtered_data.length} observations across ${species_filter.length} species</li>
    <li><strong>Largest Sepals:</strong> ${max_sepal_species === null
      ? "No species selected"
      : `${max_sepal_species.charAt(0).toUpperCase() + max_sepal_species.slice(1)} generally has the longest sepals`}</li>
    <li><strong>Petal Correlation:</strong> ${petal_correlation_note}</li>
    <li><strong>Species Separation:</strong> The three species show distinct clustering in measurement space</li>
  </ul>
</div>
`

Conclusion

This analysis demonstrates the clear morphological differences between iris species. The interactive visualizations reveal distinct patterns in sepal and petal measurements that could be used for species classification. The strong correlations between certain measurements suggest redundancy in some features, which could inform dimensionality reduction strategies in machine learning applications.