d3 = require("d3@7")
Plot = require("@observablehq/plot@0.6")
Inputs = require("@observablehq/inputs@0.10")
// Load the Iris dataset
data = d3.csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv", d3.autoType)Iris Dataset Analysis
Iris Dataset Analysis
This notebook analyzes the famous Iris dataset using Observable JS within Quarto, featuring interactive visualizations and statistical summaries.
Introduction
The Iris dataset contains measurements of iris flowers from three species: setosa, versicolor, and virginica. Each observation includes sepal length, sepal width, petal length, and petal width measurements.
Data Loading and Setup
Interactive Controls
Use the checkboxes below to choose which iris species are included in the analysis:
viewof species_filter = Inputs.checkbox(
["setosa", "versicolor", "virginica"],
{
value: ["setosa", "versicolor", "virginica"],
label: "Select Species to Include:",
format: x => x.charAt(0).toUpperCase() + x.slice(1)
}
)filtered_data = data.filter(d => species_filter.includes(d.species))
// Display current selection info
html`<p><strong>Currently analyzing:</strong> ${species_filter.length} species (${filtered_data.length} observations)</p>`Summary Statistics
Overall Summary
Summary statistics for all numeric variables in the selected data:
numeric_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
// Calculate overall summary statistics
overall_summary = numeric_columns.map(col => ({
Column: col.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()),
Min: d3.min(filtered_data, d => d[col]).toFixed(2),
Max: d3.max(filtered_data, d => d[col]).toFixed(2),
Mean: d3.mean(filtered_data, d => d[col]).toFixed(2),
Median: d3.median(filtered_data, d => d[col]).toFixed(2),
"Std Dev": d3.deviation(filtered_data, d => d[col]).toFixed(2)
}))
// Display summary table
Inputs.table(overall_summary, {
header: {
Column: "Variable",
Min: "Minimum",
Max: "Maximum",
Mean: "Mean",
Median: "Median",
"Std Dev": "Std Deviation"
},
width: {
Column: 150,
Min: 80,
Max: 80,
Mean: 80,
Median: 80,
"Std Dev": 100
}
})Summary by Species
Mean values for each measurement grouped by species:
grouped_data = Array.from(
d3.group(filtered_data, d => d.species),
([species, values]) => ({
Species: species.charAt(0).toUpperCase() + species.slice(1),
Count: values.length,
"Sepal Length": d3.mean(values, d => d.sepal_length).toFixed(2),
"Sepal Width": d3.mean(values, d => d.sepal_width).toFixed(2),
"Petal Length": d3.mean(values, d => d.petal_length).toFixed(2),
"Petal Width": d3.mean(values, d => d.petal_width).toFixed(2)
})
)
// Display grouped summary table
Inputs.table(grouped_data, {
header: {
Species: "Species",
Count: "N",
"Sepal Length": "Mean Sepal Length (cm)",
"Sepal Width": "Mean Sepal Width (cm)",
"Petal Length": "Mean Petal Length (cm)",
"Petal Width": "Mean Petal Width (cm)"
}
})Data Visualizations
Scatterplot: Sepal Measurements
Interactive scatterplot showing the relationship between sepal length and width, colored by species:
Plot.plot({
marks: [
Plot.dot(filtered_data, {
x: "sepal_length",
y: "sepal_width",
fill: "species",
r: 6,
stroke: "white",
strokeWidth: 1,
tip: true
})
],
color: {
legend: true,
scheme: "category10",
domain: ["setosa", "versicolor", "virginica"]
},
grid: true,
x: {
label: "Sepal Length (cm)",
nice: true
},
y: {
label: "Sepal Width (cm)",
nice: true
},
title: "Iris Sepal Measurements by Species",
subtitle: `Based on ${filtered_data.length} observations`,
height: 450,
width: 650,
marginLeft: 70,
marginBottom: 70
})Relationship between sepal length and width by species
Histogram: Sepal Length Distribution
Distribution of sepal length measurements across the selected species:
Plot.plot({
marks: [
Plot.rectY(filtered_data,
Plot.binX(
{y: "count"},
{
x: "sepal_length",
fill: "species",
thresholds: 15,
tip: true
}
)
)
],
color: {
legend: true,
scheme: "category10",
domain: ["setosa", "versicolor", "virginica"]
},
x: {
label: "Sepal Length (cm)",
nice: true
},
y: {
label: "Frequency",
nice: true
},
title: "Distribution of Sepal Length by Species",
subtitle: "Stacked histogram showing frequency of measurements",
height: 400,
width: 650,
marginLeft: 70,
marginBottom: 70
})Distribution of sepal length by species
Boxplot: Sepal Length by Species
Boxplot showing the distribution and quartiles of sepal length for each species:
Plot.plot({
marks: [
Plot.boxY(filtered_data, {
x: "species",
y: "sepal_length",
fill: "species",
stroke: "black",
strokeWidth: 1.5
})
],
color: {
legend: true,
scheme: "category10",
domain: ["setosa", "versicolor", "virginica"]
},
x: {
label: "Species",
tickFormat: d => d.charAt(0).toUpperCase() + d.slice(1)
},
y: {
label: "Sepal Length (cm)",
nice: true,
grid: true
},
title: "Sepal Length Distribution by Species",
subtitle: "Box plots showing median, quartiles, and outliers",
height: 400,
width: 650,
marginLeft: 70,
marginBottom: 70
})Sepal length distribution by species showing quartiles and outliers
Petal vs Sepal Comparison
Petal length plotted against sepal length, with a linear regression line for each species:
Plot.plot({
marks: [
Plot.dot(filtered_data, {
x: "sepal_length",
y: "petal_length",
fill: "species",
r: 6,
stroke: "white",
strokeWidth: 1,
tip: true
}),
Plot.linearRegressionY(filtered_data, {
x: "sepal_length",
y: "petal_length",
stroke: "species",
strokeWidth: 2
})
],
color: {
legend: true,
scheme: "category10",
domain: ["setosa", "versicolor", "virginica"]
},
grid: true,
x: {
label: "Sepal Length (cm)",
nice: true
},
y: {
label: "Petal Length (cm)",
nice: true
},
title: "Petal vs Sepal Length with Trend Lines",
subtitle: "Linear regression lines show species-specific relationships",
height: 450,
width: 650,
marginLeft: 70,
marginBottom: 70
})Comparison of petal length vs sepal length by species
Correlation Analysis
Pearson correlation matrix showing the pairwise linear relationships between all numeric variables:
correlation_data = numeric_columns.flatMap(col1 =>
numeric_columns.map(col2 => {
const values1 = filtered_data.map(d => d[col1]);
const values2 = filtered_data.map(d => d[col2]);
const mean1 = d3.mean(values1);
const mean2 = d3.mean(values2);
const correlation = d3.sum(values1, (v1, i) => (v1 - mean1) * (values2[i] - mean2)) /
Math.sqrt(d3.sum(values1, v1 => (v1 - mean1) ** 2) *
d3.sum(values2, v2 => (v2 - mean2) ** 2));
return {
x: col1.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()),
y: col2.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()),
correlation: correlation
};
})
)
Plot.plot({
marks: [
Plot.cell(correlation_data, {
x: "x",
y: "y",
fill: "correlation",
tip: true
}),
Plot.text(correlation_data, {
x: "x",
y: "y",
text: d => d.correlation.toFixed(2),
fill: d => Math.abs(d.correlation) > 0.5 ? "white" : "black",
fontSize: 12,
fontWeight: "bold"
})
],
color: {
scheme: "RdBu",
symmetric: true,
legend: true,
label: "Correlation coefficient"
},
x: {
label: null,
tickRotate: 45
},
y: {
label: null
},
title: "Correlation Matrix of Iris Measurements",
subtitle: "Values range from -1 (negative correlation) to +1 (positive correlation)",
height: 400,
width: 500,
marginLeft: 100,
marginBottom: 100
})Correlation matrix of iris measurements
Key Findings
Conclusion
This analysis demonstrates the clear morphological differences between iris species. The interactive visualizations reveal distinct patterns in sepal and petal measurements that could be used for species classification. The strong correlations between certain measurements suggest redundancy in some features, which could inform dimensionality reduction strategies in machine learning applications.