Visualization is not just about making pretty pictures; it is a
critical tool for Exploratory Data Analysis (EDA),
assumption checking, and communicating complex research findings. In
this module, we utilize the ggplot2 grammar of
graphics.
We will generate a complex dataset representing a Clinical Trial with 500 patients.
# Simulated data ----
# Fix the RNG seed so every run produces the same simulated trial.
set.seed(555)
n <- 500

# 1. Patient Data (Cross-sectional)
# One row per patient. The columns are drawn in the order listed, so
# reordering them changes the random stream and therefore the values.
data_clinical <- data.frame(
  ID = seq_len(n),
  Age = round(rnorm(n, mean = 55, sd = 12)),
  BMI = round(rnorm(n, mean = 28, sd = 5), 1),
  Cholesterol = round(rnorm(n, mean = 200, sd = 40)),
  # Skewed variable (e.g., Biomarker X)
  Biomarker = rlnorm(n, meanlog = 1, sdlog = 0.8),
  # Count variable (Hospital Visits)
  Visits = rpois(n, lambda = 2),
  # Categorical Variables
  Gender = sample(c("Male", "Female"), n, replace = TRUE),
  Treatment = sample(c("Placebo", "Low Dose", "High Dose"), n, replace = TRUE),
  Outcome = sample(
    c("Recovered", "Stable", "Deteriorated"), n,
    replace = TRUE, prob = c(0.4, 0.4, 0.2)
  ),
  Hospital = sample(c("General", "St. Mary's", "University"), n, replace = TRUE)
)

# 2. Time Series Data (Longitudinal) for Line plots
# One row per month and treatment group. Each group's scores are sorted in
# decreasing order so the series falls over time; a lower minimum for the
# uniform draw gives a steeper fall.
time_seq <- 1:12
data_longitudinal <- data.frame(
  Month = rep(time_seq, times = 3),
  Group = rep(c("Placebo", "Low Dose", "High Dose"), each = length(time_seq)),
  Avg_Pain_Score = c(
    sort(runif(length(time_seq), 5, 8), decreasing = TRUE), # Placebo (slow drop)
    sort(runif(length(time_seq), 3, 8), decreasing = TRUE), # Low Dose
    sort(runif(length(time_seq), 1, 8), decreasing = TRUE)  # High Dose (fast drop)
  )
)

# This section covers techniques for variables measured on a continuous scale
# (e.g., BMI, Age) or discrete counts (e.g., Visits).
# Continuous-variable plots (1-18) ----
# NOTE(review): these plots need ggplot2, dplyr (%>%, select), ggridges
# (geom_density_ridges) and ggcorrplot attached, and geom_hex() needs the
# hexbin package installed. No library() calls are visible in this section --
# presumably they live in a setup section not shown here; confirm.

# Distribution of a single variable

ggplot(data_clinical, aes(x = Cholesterol)) +
  geom_histogram(binwidth = 10, fill = "#69b3a2", color = "white") +
  labs(
    title = "1. Histogram of Cholesterol",
    subtitle = "Checking for Normality"
  )

ggplot(data_clinical, aes(x = Cholesterol)) +
  geom_density(fill = "skyblue", alpha = 0.5) +
  labs(title = "2. Density Plot")

# `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0).
ggplot(data_clinical, aes(x = Cholesterol, color = Gender)) +
  geom_freqpoly(binwidth = 10, linewidth = 1) +
  labs(title = "3. Frequency Polygon by Gender")

# stat = "bin" turns the area geom into a filled binned-count curve.
ggplot(data_clinical, aes(x = Age)) +
  geom_area(stat = "bin", fill = "lightcoral", alpha = 0.6) +
  labs(title = "4. Area Plot of Age")

ggplot(data_clinical, aes(x = BMI)) +
  geom_density() +
  geom_rug(alpha = 0.5) +
  labs(title = "5. Density with Rug Plot")

ggplot(data_clinical, aes(x = BMI)) +
  stat_ecdf(geom = "step", color = "blue") +
  labs(title = "6. ECDF of BMI", y = "Cumulative Probability")

ggplot(data_clinical, aes(sample = Cholesterol)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "7. Q-Q Plot (Normality Check)")

# Continuous variable compared across groups

ggplot(data_clinical, aes(x = Treatment, y = BMI, fill = Treatment)) +
  geom_boxplot() +
  labs(title = "8. Boxplot of BMI by Treatment")

ggplot(data_clinical, aes(x = Treatment, y = BMI, fill = Treatment)) +
  geom_violin(trim = FALSE) +
  labs(title = "9. Violin Plot")

ggplot(data_clinical, aes(x = Treatment, y = BMI)) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "10. Jitter Plot")

ggplot(data_clinical, aes(x = Treatment, y = BMI)) +
  geom_violin(alpha = 0.3) +
  geom_jitter(width = 0.1, alpha = 0.3) +
  labs(title = "11. Combined Violin and Jitter")

ggplot(data_clinical, aes(x = Cholesterol, y = Outcome, fill = Outcome)) +
  geom_density_ridges(alpha = 0.7) +
  labs(title = "12. Ridgeline Plot")

# Relationship between two continuous variables

ggplot(data_clinical, aes(x = Age, y = Cholesterol)) +
  geom_point(alpha = 0.6) +
  labs(title = "13. Scatter Plot: Age vs Cholesterol")

ggplot(data_clinical, aes(x = Age, y = Cholesterol)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "14. Scatter with Linear Regression")

ggplot(data_clinical, aes(x = Age, y = BMI, size = Biomarker)) +
  geom_point(alpha = 0.5, color = "purple") +
  labs(title = "15. Bubble Chart (Size = Biomarker)")

ggplot(data_clinical, aes(x = Age, y = Cholesterol)) +
  geom_hex(bins = 20) +
  scale_fill_viridis_c() +
  labs(title = "16. Hexbin Plot (Density of Points)")

ggplot(data_clinical, aes(x = Age, y = Cholesterol)) +
  geom_density_2d() +
  labs(title = "17. 2D Contour Plot")

# Pairwise correlations of the numeric columns
corr_matrix <- data_clinical %>%
  select(Age, BMI, Cholesterol, Biomarker, Visits) %>%
  cor()
ggcorrplot(
  corr_matrix,
  lab = TRUE,
  type = "lower",
  title = "18. Correlation Matrix"
)

# This section covers techniques for nominal (e.g., Gender) and ordinal
# (e.g., Outcome) data.
# Categorical-variable plots (21-31) ----
# NOTE(review): plot numbering jumps from 18 to 21 in the titles; they are
# left as-is because the titles are part of the rendered output.

# Frequency of a single category

ggplot(data_clinical, aes(x = Outcome)) +
  geom_bar(fill = "steelblue") +
  labs(title = "21. Simple Bar Chart")

ggplot(data_clinical, aes(x = Outcome)) +
  geom_bar(fill = "steelblue") +
  coord_flip() +
  labs(title = "22. Horizontal Bar Chart")

# Two categorical variables: the same counts with three bar positions

ggplot(data_clinical, aes(x = Treatment, fill = Outcome)) +
  geom_bar(position = "stack") +
  labs(title = "23. Stacked Bar Chart")

ggplot(data_clinical, aes(x = Treatment, fill = Outcome)) +
  geom_bar(position = "dodge") +
  labs(title = "24. Grouped Bar Chart")

ggplot(data_clinical, aes(x = Treatment, fill = Outcome)) +
  geom_bar(position = "fill") +
  labs(y = "Proportion", title = "25. 100% Stacked Bar Chart")

# Pre-aggregated counts per hospital (columns: Hospital, n), reused by
# plots 26-29.
data_summary <- data_clinical %>%
  count(Hospital)

# The stem runs from 0 up to the count; x is inherited from the plot mapping.
ggplot(data_summary, aes(x = Hospital, y = n)) +
  geom_segment(aes(xend = Hospital, y = 0, yend = n), color = "grey") +
  geom_point(size = 4, color = "orange") +
  labs(title = "26. Lollipop Chart")

ggplot(data_summary, aes(x = n, y = reorder(Hospital, n))) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "27. Cleveland Dot Plot", y = "Hospital")

# A pie chart is a single stacked bar drawn in polar coordinates.
ggplot(data_summary, aes(x = "", y = n, fill = Hospital)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  labs(title = "28. Pie Chart")

# Placing the bar at x = 2 and starting the axis at 0.5 leaves the hole.
ggplot(data_summary, aes(x = 2, y = n, fill = Hospital)) +
  geom_col(width = 1) +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(title = "29. Donut Chart")

ggplot(data_clinical, aes(x = Outcome, fill = Treatment)) +
  geom_bar() +
  coord_polar() +
  labs(title = "30. Rose Plot")

# Counts for every Treatment x Outcome combination (columns: Treatment,
# Outcome, n).
table_data <- data_clinical %>%
  count(Treatment, Outcome)

ggplot(table_data, aes(x = Treatment, y = Outcome, fill = n)) +
  geom_tile() +
  geom_text(aes(label = n), color = "white") +
  labs(title = "31. Categorical Heatmap")

# A true treemap needs the treemapify package, simulated here with basic tiles.
# Simplified representation using tiles
# Tile area and fill both encode the cell count, matching the subtitle.
# sqrt() makes the tile AREA (width * height) proportional to n; the most
# frequent cell fills its whole grid slot. Mapping the categorical columns
# directly keeps the category names on the axes.
ggplot(table_data, aes(x = Treatment, y = Outcome, fill = n)) +
  geom_tile(aes(width = sqrt(n / max(n)), height = sqrt(n / max(n)))) +
  labs(
    title = "32. Tile Map (Treemap Alternative)",
    subtitle = "Size/Color = Frequency"
  )

# Illustrative scores for two groups, reshaped to one row per
# Metric x Group pair.
dumb_data <- data.frame(
  Metric = c("Pain", "Mobility", "Sleep"),
  Placebo = c(6, 4, 5),
  Drug = c(3, 7, 8)
) %>%
  pivot_longer(
    cols = c("Placebo", "Drug"),
    names_to = "Group",
    values_to = "Score"
  )

# The grey line joins the two group scores of each metric.
ggplot(dumb_data, aes(x = Score, y = Metric)) +
  geom_line(aes(group = Metric), color = "grey") +
  geom_point(aes(color = Group), size = 3) +
  labs(title = "33. Dumbbell Plot (Effect Size)")

# Simulated via Polar Coordinates in ggplot
radar_data <- data.frame(
  Metric = c("Physical", "Mental", "Social", "Pain", "General"),
  Score = c(80, 60, 90, 40, 70)
)
# Fix the axis order to the row order. As a plain character column the axis
# would be sorted alphabetically while geom_polygon() joins vertices in row
# order, which produces a self-crossing shape.
radar_data$Metric <- factor(radar_data$Metric, levels = radar_data$Metric)

# The outline is drawn by the polygon itself so it closes back to the first
# point; a separate geom_line() leaves the last-to-first edge open.
ggplot(radar_data, aes(x = Metric, y = Score, group = 1)) +
  geom_polygon(fill = "blue", alpha = 0.2, color = "blue") +
  coord_polar() +
  labs(title = "34. Radar Chart Profile")

likert_data <- data.frame(
  Question = c("Q1", "Q2", "Q3"),
  Score = c(-20, 15, -5) # Net Promoter Score or similar
)

# Fill by sign so positive and negative bars get different colours.
ggplot(likert_data, aes(x = Question, y = Score, fill = Score > 0)) +
  geom_col() +
  coord_flip() +
  labs(title = "35. Diverging Bar Chart")

# In this module, we explored the versatility of R and ggplot2 for data
# visualization.
#   1. Continuous plots (Histograms, Boxplots, Scatterplots) help us
#      understand distribution and correlation.
#   2. Categorical plots (Bars, Lollipops, Heatmaps) help us understand
#      frequency and proportions.
Assignment: Select the built-in mtcars dataset in R. Produce a report
containing:
1. A histogram of mpg.
2. A scatter plot of hp vs wt, colored by cyl.
3. A boxplot of mpg grouped by gear.

End of Module V