1) Load dataset and create derived fields
get_rmd_dir <- function() {
# knitr::current_input() works during knitting
p <- knitr::current_input()
if (!is.null(p) && nzchar(p)) return(normalizePath(dirname(p)))
# fallback (interactive)
tryCatch(normalizePath(getwd()), error = function(e) getwd())
}
rmd_dir <- get_rmd_dir()
candidate_paths <- c(
file.path(rmd_dir, "CASchools.csv"),
file.path(getwd(), "CASchools.csv")
)
csv_path <- candidate_paths[file.exists(candidate_paths)][1]
if (is.na(csv_path) || !nzchar(csv_path)) {
stop(
"CASchools.csv not found.\n",
"Fix: Put CASchools.csv in the SAME folder as this .Rmd file, or edit the csv_path line.\n",
"Searched:\n- ", paste(candidate_paths, collapse = "\n- "),
call. = FALSE
)
}
# Load CSV
df <- read.csv(csv_path, stringsAsFactors = FALSE)
# Create overall test score (testscr) from subject scores (english/read/math)
df$testscr <- rowMeans(df[, c("english","read","math")], na.rm = TRUE)
# Student-teacher ratio
df$str <- df$students / df$teachers
# Convert to factors (helpful for grouping/faceting)
df$grades <- as.factor(df$grades)
df$county <- as.factor(df$county)
# Quick checks
cat("Loaded:", csv_path, "\n")
## Loaded: /Users/gbeazer/02_School/DATA 640 Cloud Data Visualization/R/week4/CASchools.csv
dim(df)
## [1] 420 17
head(df)
## rownames district school county grades students
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195
## 2 2 61499 Manzanita Elementary Butte KK-08 240
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335
## 6 6 62042 Burrel Union Elementary Fresno KK-08 137
## teachers calworks lunch computer expenditure income english read
## 1 10.90 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6
## 2 11.15 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5
## 3 82.90 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3
## 4 14.00 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9
## 5 71.50 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8
## 6 6.40 12.3188 86.9565 25 5580.147 10.415000 12.408759 605.7
## math testscr str
## 1 690.0 460.5333 17.88991
## 2 661.9 442.3278 21.52466
## 3 650.9 439.0667 18.69723
## 4 643.5 431.8000 17.35714
## 5 639.9 431.8526 18.67133
## 6 605.4 407.8363 21.40625
str(df)
## 'data.frame': 420 obs. of 17 variables:
## $ rownames : int 1 2 3 4 5 6 7 8 9 10 ...
## $ district : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ school : chr "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
## $ grades : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
## $ students : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers : num 10.9 11.1 82.9 14 71.5 ...
## $ calworks : num 0.51 15.42 55.03 36.48 33.11 ...
## $ lunch : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer : int 67 101 169 85 171 25 28 66 35 0 ...
## $ expenditure: num 6385 5099 5502 7102 5236 ...
## $ income : num 22.69 9.82 8.98 8.98 9.08 ...
## $ english : num 0 4.58 30 0 13.86 ...
## $ read : num 692 660 636 652 642 ...
## $ math : num 690 662 651 644 640 ...
## $ testscr : num 461 442 439 432 432 ...
## $ str : num 17.9 21.5 18.7 17.4 18.7 ...
2) Exploratory Data Analysis (EDA)
summary(df)
## rownames district school county
## Min. : 1.0 Min. :61382 Length:420 Sonoma : 29
## 1st Qu.:105.8 1st Qu.:64308 Class :character Kern : 27
## Median :210.5 Median :67760 Mode :character Los Angeles: 27
## Mean :210.5 Mean :67473 Tulare : 24
## 3rd Qu.:315.2 3rd Qu.:70419 San Diego : 21
## Max. :420.0 Max. :75440 Santa Clara: 20
## (Other) :272
## grades students teachers calworks
## KK-06: 61 Min. : 81.0 Min. : 4.85 Min. : 0.000
## KK-08:359 1st Qu.: 379.0 1st Qu.: 19.66 1st Qu.: 4.395
## Median : 950.5 Median : 48.56 Median :10.520
## Mean : 2628.8 Mean : 129.07 Mean :13.246
## 3rd Qu.: 3008.0 3rd Qu.: 146.35 3rd Qu.:18.981
## Max. :27176.0 Max. :1429.00 Max. :78.994
##
## lunch computer expenditure income
## Min. : 0.00 Min. : 0.0 Min. :3926 Min. : 5.335
## 1st Qu.: 23.28 1st Qu.: 46.0 1st Qu.:4906 1st Qu.:10.639
## Median : 41.75 Median : 117.5 Median :5215 Median :13.728
## Mean : 44.71 Mean : 303.4 Mean :5312 Mean :15.317
## 3rd Qu.: 66.86 3rd Qu.: 375.2 3rd Qu.:5601 3rd Qu.:17.629
## Max. :100.00 Max. :3324.0 Max. :7712 Max. :55.328
##
## english read math testscr
## Min. : 0.000 Min. :604.5 Min. :605.4 Min. :407.8
## 1st Qu.: 1.941 1st Qu.:640.4 1st Qu.:639.4 1st Qu.:434.4
## Median : 8.778 Median :655.8 Median :652.4 Median :440.6
## Mean :15.768 Mean :655.0 Mean :653.3 Mean :441.4
## 3rd Qu.:22.970 3rd Qu.:668.7 3rd Qu.:665.8 3rd Qu.:447.8
## Max. :85.540 Max. :704.0 Max. :709.5 Max. :472.7
##
## str
## Min. :14.00
## 1st Qu.:18.58
## Median :19.72
## Mean :19.64
## 3rd Qu.:20.87
## Max. :25.80
##
missing_counts <- sapply(df, function(x) sum(is.na(x)))
missing_counts[missing_counts > 0]
## named integer(0)
2.1 Distribution of overall test scores
# Improved histogram: better bin width + mean/median reference lines
mean_ts <- mean(df$testscr, na.rm = TRUE)
med_ts <- median(df$testscr, na.rm = TRUE)
binw <- 2 # ~2-point bins works well for this score range
ggplot(df, aes(x = testscr)) +
geom_histogram(binwidth = binw) +
geom_density(aes(y = after_stat(count) * binw), linewidth = 0.6) +
geom_vline(xintercept = mean_ts, linetype = "dashed") +
geom_vline(xintercept = med_ts, linetype = "dotted") +
labs(
title = "Distribution of Overall Test Scores (testscr)",
subtitle = "Histogram with density overlay; dashed = mean, dotted = median",
x = "Overall Test Score (Avg of English/Read/Math)",
y = "Number of districts"
) +
theme_minimal()

2.2 Income distribution
# Show raw income and log-income side-by-side (log helps with skew)
p_income <- ggplot(df, aes(x = income)) +
geom_histogram(bins = 30) +
labs(
title = "Income distribution (raw)",
x = "District income",
y = "Count"
) +
theme_minimal()
p_logincome <- ggplot(df, aes(x = log(income))) +
geom_histogram(bins = 30) +
labs(
title = "Income distribution (log-transformed)",
x = "log(Income)",
y = "Count"
) +
theme_minimal()
p_income

p_logincome

3) Advanced Visualizations
3.1 Layered plot (points + regression + CI)
ggplot(df, aes(x = income, y = testscr)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE) +
scale_y_continuous(limits = range(df$testscr, na.rm = TRUE)) +
labs(
title = "Test Score vs Income (Layered Visualization)",
subtitle = "Points + linear regression + confidence interval",
x = "Income",
y = "Overall Test Score (testscr)"
) +
theme_minimal()

3.2 Facets (small multiples) by grade range
ggplot(df, aes(x = str, y = testscr)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ grades) +
labs(
title = "Test Score vs Student-Teacher Ratio (Faceted)",
subtitle = "Small multiples by grade range",
x = "Student-Teacher Ratio (students/teachers)",
y = "Overall Test Score (testscr)"
) +
theme_minimal()

3.3 Multivariate plot (GGally ggpairs)
vars <- c("testscr","income","str","expenditure","lunch","calworks","computer")
vars <- vars[vars %in% names(df)] # safety
GGally::ggpairs(df[, vars], aes(alpha = 0.35)) + theme_minimal()

3.4 Correlation heatmap (numeric variables)
num_df <- df[, sapply(df, is.numeric)]
corr <- cor(num_df, use = "pairwise.complete.obs")
ggcorrplot(
corr,
type = "lower",
lab = TRUE,
lab_size = 2.5
) +
labs(title = "Correlation Heatmap (Numeric Variables)")

3.6 Parallel coordinates (GGally ggparcoord)
# Sample to keep the parallel coordinates readable
d_sample <- df %>%
sample_n(250) %>%
select(testscr, income, str, expenditure, lunch, calworks, computer, grades)
p_par <- GGally::ggparcoord(
data = d_sample,
columns = 1:7,
groupColumn = 8,
scale = "uniminmax",
alphaLines = 0.35
) +
labs(
title = "Parallel Coordinates (CASchools Sample)",
subtitle = "Multivariate comparison by grade range (scaled to [0,1])",
x = "Variables",
y = "Scaled value"
) +
theme_minimal()
p_par

3.7 Interactive visualization (Plotly with filtering) + save
HTML
# What is interactive?
# This section creates a Plotly widget. In the HTML output, you can hover, zoom, pan,
# and use the dropdown to filter grade ranges.
#
# Word output cannot embed the interactive widget. That is why we also save a separate
# HTML file (W4_CASchools_InteractivePlot.html).
# Rich hover text
df$hover <- paste0(
"School: ", df$school,
"<br>County: ", df$county,
"<br>Grades: ", df$grades,
"<br>Income: ", round(df$income, 2),
"<br>Test score: ", round(df$testscr, 1),
"<br>Students: ", df$students,
"<br>STR: ", round(df$str, 2),
"<br>Lunch (%): ", round(df$lunch, 1),
"<br>Expenditure: ", round(df$expenditure, 0)
)
grade_levels <- levels(df$grades)
p_int <- plot_ly()
# One trace per grade range (enables dropdown filtering)
for (g in grade_levels) {
d_g <- dplyr::filter(df, grades == g)
p_int <- add_trace(
p_int,
data = d_g,
x = ~income,
y = ~testscr,
type = "scatter",
mode = "markers",
name = paste("Grades", g),
text = ~hover,
hoverinfo = "text",
marker = list(
sizemode = "area",
sizeref = 2 * max(d_g$students) / (30^2),
size = ~students
)
)
}
# Dropdown buttons: "All" + one button per grade range
buttons <- list(
list(
method = "update",
args = list(list(visible = rep(TRUE, length(grade_levels)))),
label = "All grades"
)
)
for (i in seq_along(grade_levels)) {
vis <- rep(FALSE, length(grade_levels))
vis[i] <- TRUE
buttons[[length(buttons) + 1]] <- list(
method = "update",
args = list(list(visible = vis)),
label = paste("Only", grade_levels[i])
)
}
p_int <- layout(
p_int,
title = "Interactive: Test Score vs Income (filterable by grade range)",
xaxis = list(title = "Income"),
yaxis = list(title = "Overall Test Score (testscr)"),
updatemenus = list(list(
type = "dropdown",
x = 0.02, y = 1.12,
buttons = buttons
)),
legend = list(orientation = "h", x = 0, y = -0.15)
)
# Save interactive HTML (submit this alongside the Word doc if allowed)
saveWidget(p_int, "W4_CASchools_InteractivePlot.html", selfcontained = TRUE)
p_int