1) Load dataset and create derived fields
2) Exploratory Data Analysis (EDA)
- 2.1 Distribution of overall test scores
- 2.2 Income distribution
3) Advanced Visualizations
4) Quick numeric support for insights

1) Load dataset and create derived fields

get_rmd_dir <- function() {
  # knitr::current_input() works during knitting
  p <- knitr::current_input()
  if (!is.null(p) && nzchar(p)) return(normalizePath(dirname(p)))
  # fallback (interactive)
  tryCatch(normalizePath(getwd()), error = function(e) getwd())
}

rmd_dir <- get_rmd_dir()

candidate_paths <- c(
  file.path(rmd_dir, "CASchools.csv"),
  file.path(getwd(), "CASchools.csv")
)

csv_path <- candidate_paths[file.exists(candidate_paths)][1]

if (is.na(csv_path) || !nzchar(csv_path)) {
  stop(
    "CASchools.csv not found.\n",
    "Fix: Put CASchools.csv in the SAME folder as this .Rmd file, or edit the csv_path line.\n",
    "Searched:\n- ", paste(candidate_paths, collapse = "\n- "),
    call. = FALSE
  )
}

# Load CSV
df <- read.csv(csv_path, stringsAsFactors = FALSE)

# Create overall test score (testscr) from subject scores (english/read/math)
df$testscr <- rowMeans(df[, c("english","read","math")], na.rm = TRUE)

# Student-teacher ratio
df$str <- df$students / df$teachers

# Convert to factors (helpful for grouping/faceting)
df$grades <- as.factor(df$grades)
df$county <- as.factor(df$county)

# Quick checks
cat("Loaded:", csv_path, "\n")

## Loaded: /Users/gbeazer/02_School/DATA 640 Cloud Data Visualization/R/week4/CASchools.csv

dim(df)

## [1] 420  17

head(df)

##   rownames district                          school  county grades students
## 1        1    75119              Sunol Glen Unified Alameda  KK-08      195
## 2        2    61499            Manzanita Elementary   Butte  KK-08      240
## 3        3    61549     Thermalito Union Elementary   Butte  KK-08     1550
## 4        4    61457 Golden Feather Union Elementary   Butte  KK-08      243
## 5        5    61523        Palermo Union Elementary   Butte  KK-08     1335
## 6        6    62042         Burrel Union Elementary  Fresno  KK-08      137
##   teachers calworks   lunch computer expenditure    income   english  read
## 1    10.90   0.5102  2.0408       67    6384.911 22.690001  0.000000 691.6
## 2    11.15  15.4167 47.9167      101    5099.381  9.824000  4.583333 660.5
## 3    82.90  55.0323 76.3226      169    5501.955  8.978000 30.000002 636.3
## 4    14.00  36.4754 77.0492       85    7101.831  8.978000  0.000000 651.9
## 5    71.50  33.1086 78.4270      171    5235.988  9.080333 13.857677 641.8
## 6     6.40  12.3188 86.9565       25    5580.147 10.415000 12.408759 605.7
##    math  testscr      str
## 1 690.0 460.5333 17.88991
## 2 661.9 442.3278 21.52466
## 3 650.9 439.0667 18.69723
## 4 643.5 431.8000 17.35714
## 5 639.9 431.8526 18.67133
## 6 605.4 407.8363 21.40625

str(df)

## 'data.frame':    420 obs. of  17 variables:
##  $ rownames   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ district   : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ school     : chr  "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
##  $ county     : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ grades     : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ students   : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers   : num  10.9 11.1 82.9 14 71.5 ...
##  $ calworks   : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ lunch      : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer   : int  67 101 169 85 171 25 28 66 35 0 ...
##  $ expenditure: num  6385 5099 5502 7102 5236 ...
##  $ income     : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ english    : num  0 4.58 30 0 13.86 ...
##  $ read       : num  692 660 636 652 642 ...
##  $ math       : num  690 662 651 644 640 ...
##  $ testscr    : num  461 442 439 432 432 ...
##  $ str        : num  17.9 21.5 18.7 17.4 18.7 ...

2) Exploratory Data Analysis (EDA)

summary(df)

##     rownames        district        school                  county   
##  Min.   :  1.0   Min.   :61382   Length:420         Sonoma     : 29  
##  1st Qu.:105.8   1st Qu.:64308   Class :character   Kern       : 27  
##  Median :210.5   Median :67760   Mode  :character   Los Angeles: 27  
##  Mean   :210.5   Mean   :67473                      Tulare     : 24  
##  3rd Qu.:315.2   3rd Qu.:70419                      San Diego  : 21  
##  Max.   :420.0   Max.   :75440                      Santa Clara: 20  
##                                                     (Other)    :272  
##    grades       students          teachers          calworks     
##  KK-06: 61   Min.   :   81.0   Min.   :   4.85   Min.   : 0.000  
##  KK-08:359   1st Qu.:  379.0   1st Qu.:  19.66   1st Qu.: 4.395  
##              Median :  950.5   Median :  48.56   Median :10.520  
##              Mean   : 2628.8   Mean   : 129.07   Mean   :13.246  
##              3rd Qu.: 3008.0   3rd Qu.: 146.35   3rd Qu.:18.981  
##              Max.   :27176.0   Max.   :1429.00   Max.   :78.994  
##                                                                  
##      lunch           computer       expenditure       income      
##  Min.   :  0.00   Min.   :   0.0   Min.   :3926   Min.   : 5.335  
##  1st Qu.: 23.28   1st Qu.:  46.0   1st Qu.:4906   1st Qu.:10.639  
##  Median : 41.75   Median : 117.5   Median :5215   Median :13.728  
##  Mean   : 44.71   Mean   : 303.4   Mean   :5312   Mean   :15.317  
##  3rd Qu.: 66.86   3rd Qu.: 375.2   3rd Qu.:5601   3rd Qu.:17.629  
##  Max.   :100.00   Max.   :3324.0   Max.   :7712   Max.   :55.328  
##                                                                   
##     english            read            math          testscr     
##  Min.   : 0.000   Min.   :604.5   Min.   :605.4   Min.   :407.8  
##  1st Qu.: 1.941   1st Qu.:640.4   1st Qu.:639.4   1st Qu.:434.4  
##  Median : 8.778   Median :655.8   Median :652.4   Median :440.6  
##  Mean   :15.768   Mean   :655.0   Mean   :653.3   Mean   :441.4  
##  3rd Qu.:22.970   3rd Qu.:668.7   3rd Qu.:665.8   3rd Qu.:447.8  
##  Max.   :85.540   Max.   :704.0   Max.   :709.5   Max.   :472.7  
##                                                                  
##       str       
##  Min.   :14.00  
##  1st Qu.:18.58  
##  Median :19.72  
##  Mean   :19.64  
##  3rd Qu.:20.87  
##  Max.   :25.80  
##

missing_counts <- sapply(df, function(x) sum(is.na(x)))
missing_counts[missing_counts > 0]

## named integer(0)

2.1 Distribution of overall test scores

# Improved histogram: better bin width + mean/median reference lines
mean_ts <- mean(df$testscr, na.rm = TRUE)
med_ts  <- median(df$testscr, na.rm = TRUE)
binw    <- 2  # ~2-point bins works well for this score range

ggplot(df, aes(x = testscr)) +
  geom_histogram(binwidth = binw) +
  geom_density(aes(y = after_stat(count) * binw), linewidth = 0.6) +
  geom_vline(xintercept = mean_ts, linetype = "dashed") +
  geom_vline(xintercept = med_ts,  linetype = "dotted") +
  labs(
    title = "Distribution of Overall Test Scores (testscr)",
    subtitle = "Histogram with density overlay; dashed = mean, dotted = median",
    x = "Overall Test Score (Avg of English/Read/Math)",
    y = "Number of districts"
  ) +
  theme_minimal()

2.2 Income distribution

# Show raw income and log-income side-by-side (log helps with skew)
p_income <- ggplot(df, aes(x = income)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Income distribution (raw)",
    x = "District income",
    y = "Count"
  ) +
  theme_minimal()

p_logincome <- ggplot(df, aes(x = log(income))) +
  geom_histogram(bins = 30) +
  labs(
    title = "Income distribution (log-transformed)",
    x = "log(Income)",
    y = "Count"
  ) +
  theme_minimal()

p_income

p_logincome

3) Advanced Visualizations

3.1 Layered plot (points + regression + CI)

ggplot(df, aes(x = income, y = testscr)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_y_continuous(limits = range(df$testscr, na.rm = TRUE)) +
  labs(
    title = "Test Score vs Income (Layered Visualization)",
    subtitle = "Points + linear regression + confidence interval",
    x = "Income",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.2 Facets (small multiples) by grade range

ggplot(df, aes(x = str, y = testscr)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ grades) +
  labs(
    title = "Test Score vs Student-Teacher Ratio (Faceted)",
    subtitle = "Small multiples by grade range",
    x = "Student-Teacher Ratio (students/teachers)",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.3 Multivariate plot (GGally ggpairs)

vars <- c("testscr","income","str","expenditure","lunch","calworks","computer")
vars <- vars[vars %in% names(df)]  # safety
GGally::ggpairs(df[, vars], aes(alpha = 0.35)) + theme_minimal()

3.4 Correlation heatmap (numeric variables)

num_df <- df[, sapply(df, is.numeric)]
corr <- cor(num_df, use = "pairwise.complete.obs")

ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 2.5
) +
  labs(title = "Correlation Heatmap (Numeric Variables)")

3.5 Statistical transformation (log income)

ggplot(df, aes(x = log(income), y = testscr)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Test Score vs log(Income)",
    subtitle = "Log transform reduces skewness and can stabilize the relationship",
    x = "log(Income)",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.6 Parallel coordinates (GGally ggparcoord)

# Sample to keep the parallel coordinates readable
d_sample <- df %>%
  sample_n(250) %>%
  select(testscr, income, str, expenditure, lunch, calworks, computer, grades)

p_par <- GGally::ggparcoord(
  data = d_sample,
  columns = 1:7,
  groupColumn = 8,
  scale = "uniminmax",
  alphaLines = 0.35
) +
  labs(
    title = "Parallel Coordinates (CASchools Sample)",
    subtitle = "Multivariate comparison by grade range (scaled to [0,1])",
    x = "Variables",
    y = "Scaled value"
  ) +
  theme_minimal()

p_par

3.7 Interactive visualization (Plotly with filtering) + save HTML

# What is interactive?
# This section creates a Plotly widget. In the HTML output, you can hover, zoom, pan,
# and use the dropdown to filter grade ranges.
#
# Word output cannot embed the interactive widget. That is why we also save a separate
# HTML file (W4_CASchools_InteractivePlot.html).

# Rich hover text
df$hover <- paste0(
  "School: ", df$school,
  "<br>County: ", df$county,
  "<br>Grades: ", df$grades,
  "<br>Income: ", round(df$income, 2),
  "<br>Test score: ", round(df$testscr, 1),
  "<br>Students: ", df$students,
  "<br>STR: ", round(df$str, 2),
  "<br>Lunch (%): ", round(df$lunch, 1),
  "<br>Expenditure: ", round(df$expenditure, 0)
)

grade_levels <- levels(df$grades)

p_int <- plot_ly()

# One trace per grade range (enables dropdown filtering)
for (g in grade_levels) {
  d_g <- dplyr::filter(df, grades == g)
  p_int <- add_trace(
    p_int,
    data = d_g,
    x = ~income,
    y = ~testscr,
    type = "scatter",
    mode = "markers",
    name = paste("Grades", g),
    text = ~hover,
    hoverinfo = "text",
    marker = list(
      sizemode = "area",
      sizeref = 2 * max(d_g$students) / (30^2),
      size = ~students
    )
  )
}

# Dropdown buttons: "All" + one button per grade range
buttons <- list(
  list(
    method = "update",
    args = list(list(visible = rep(TRUE, length(grade_levels)))),
    label = "All grades"
  )
)

for (i in seq_along(grade_levels)) {
  vis <- rep(FALSE, length(grade_levels))
  vis[i] <- TRUE
  buttons[[length(buttons) + 1]] <- list(
    method = "update",
    args = list(list(visible = vis)),
    label = paste("Only", grade_levels[i])
  )
}

p_int <- layout(
  p_int,
  title = "Interactive: Test Score vs Income (filterable by grade range)",
  xaxis = list(title = "Income"),
  yaxis = list(title = "Overall Test Score (testscr)"),
  updatemenus = list(list(
    type = "dropdown",
    x = 0.02, y = 1.12,
    buttons = buttons
  )),
  legend = list(orientation = "h", x = 0, y = -0.15)
)

# Save interactive HTML (submit this alongside the Word doc if allowed)
saveWidget(p_int, "W4_CASchools_InteractivePlot.html", selfcontained = TRUE)

p_int

4) Quick numeric support for insights

cor(df$income, df$testscr, use = "complete.obs")

## [1] 0.7220618

cor(df$str, df$testscr, use = "complete.obs")

## [1] -0.1742427

Week 4 Assignment: Advanced Data Visualization in R (CASchools) - Germaine Beazer

Germaine Beazer

2026-02-01

1) Load dataset and create derived fields

2) Exploratory Data Analysis (EDA)

2.1 Distribution of overall test scores

2.2 Income distribution

3) Advanced Visualizations

3.1 Layered plot (points + regression + CI)

3.2 Facets (small multiples) by grade range

3.3 Multivariate plot (GGally ggpairs)

3.4 Correlation heatmap (numeric variables)

3.5 Statistical transformation (log income)

3.6 Parallel coordinates (GGally ggparcoord)

3.7 Interactive visualization (Plotly with filtering) + save HTML

4) Quick numeric support for insights