1) Load dataset and create derived fields

get_rmd_dir <- function() {
  # knitr::current_input() works during knitting
  p <- knitr::current_input()
  if (!is.null(p) && nzchar(p)) return(normalizePath(dirname(p)))
  # fallback (interactive)
  tryCatch(normalizePath(getwd()), error = function(e) getwd())
}

rmd_dir <- get_rmd_dir()

candidate_paths <- c(
  file.path(rmd_dir, "CASchools.csv"),
  file.path(getwd(), "CASchools.csv")
)

csv_path <- candidate_paths[file.exists(candidate_paths)][1]

if (is.na(csv_path) || !nzchar(csv_path)) {
  stop(
    "CASchools.csv not found.\n",
    "Fix: Put CASchools.csv in the SAME folder as this .Rmd file, or edit the csv_path line.\n",
    "Searched:\n- ", paste(candidate_paths, collapse = "\n- "),
    call. = FALSE
  )
}

# Load CSV
df <- read.csv(csv_path, stringsAsFactors = FALSE)

# Create overall test score (testscr) from subject scores (english/read/math)
df$testscr <- rowMeans(df[, c("english","read","math")], na.rm = TRUE)

# Student-teacher ratio
df$str <- df$students / df$teachers

# Convert to factors (helpful for grouping/faceting)
df$grades <- as.factor(df$grades)
df$county <- as.factor(df$county)

# Quick checks
cat("Loaded:", csv_path, "\n")
## Loaded: /Users/gbeazer/02_School/DATA 640 Cloud Data Visualization/R/week4/CASchools.csv
dim(df)
## [1] 420  17
head(df)
##   rownames district                          school  county grades students
## 1        1    75119              Sunol Glen Unified Alameda  KK-08      195
## 2        2    61499            Manzanita Elementary   Butte  KK-08      240
## 3        3    61549     Thermalito Union Elementary   Butte  KK-08     1550
## 4        4    61457 Golden Feather Union Elementary   Butte  KK-08      243
## 5        5    61523        Palermo Union Elementary   Butte  KK-08     1335
## 6        6    62042         Burrel Union Elementary  Fresno  KK-08      137
##   teachers calworks   lunch computer expenditure    income   english  read
## 1    10.90   0.5102  2.0408       67    6384.911 22.690001  0.000000 691.6
## 2    11.15  15.4167 47.9167      101    5099.381  9.824000  4.583333 660.5
## 3    82.90  55.0323 76.3226      169    5501.955  8.978000 30.000002 636.3
## 4    14.00  36.4754 77.0492       85    7101.831  8.978000  0.000000 651.9
## 5    71.50  33.1086 78.4270      171    5235.988  9.080333 13.857677 641.8
## 6     6.40  12.3188 86.9565       25    5580.147 10.415000 12.408759 605.7
##    math  testscr      str
## 1 690.0 460.5333 17.88991
## 2 661.9 442.3278 21.52466
## 3 650.9 439.0667 18.69723
## 4 643.5 431.8000 17.35714
## 5 639.9 431.8526 18.67133
## 6 605.4 407.8363 21.40625
str(df)
## 'data.frame':    420 obs. of  17 variables:
##  $ rownames   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ district   : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ school     : chr  "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
##  $ county     : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ grades     : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ students   : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers   : num  10.9 11.1 82.9 14 71.5 ...
##  $ calworks   : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ lunch      : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer   : int  67 101 169 85 171 25 28 66 35 0 ...
##  $ expenditure: num  6385 5099 5502 7102 5236 ...
##  $ income     : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ english    : num  0 4.58 30 0 13.86 ...
##  $ read       : num  692 660 636 652 642 ...
##  $ math       : num  690 662 651 644 640 ...
##  $ testscr    : num  461 442 439 432 432 ...
##  $ str        : num  17.9 21.5 18.7 17.4 18.7 ...

2) Exploratory Data Analysis (EDA)

summary(df)
##     rownames        district        school                  county   
##  Min.   :  1.0   Min.   :61382   Length:420         Sonoma     : 29  
##  1st Qu.:105.8   1st Qu.:64308   Class :character   Kern       : 27  
##  Median :210.5   Median :67760   Mode  :character   Los Angeles: 27  
##  Mean   :210.5   Mean   :67473                      Tulare     : 24  
##  3rd Qu.:315.2   3rd Qu.:70419                      San Diego  : 21  
##  Max.   :420.0   Max.   :75440                      Santa Clara: 20  
##                                                     (Other)    :272  
##    grades       students          teachers          calworks     
##  KK-06: 61   Min.   :   81.0   Min.   :   4.85   Min.   : 0.000  
##  KK-08:359   1st Qu.:  379.0   1st Qu.:  19.66   1st Qu.: 4.395  
##              Median :  950.5   Median :  48.56   Median :10.520  
##              Mean   : 2628.8   Mean   : 129.07   Mean   :13.246  
##              3rd Qu.: 3008.0   3rd Qu.: 146.35   3rd Qu.:18.981  
##              Max.   :27176.0   Max.   :1429.00   Max.   :78.994  
##                                                                  
##      lunch           computer       expenditure       income      
##  Min.   :  0.00   Min.   :   0.0   Min.   :3926   Min.   : 5.335  
##  1st Qu.: 23.28   1st Qu.:  46.0   1st Qu.:4906   1st Qu.:10.639  
##  Median : 41.75   Median : 117.5   Median :5215   Median :13.728  
##  Mean   : 44.71   Mean   : 303.4   Mean   :5312   Mean   :15.317  
##  3rd Qu.: 66.86   3rd Qu.: 375.2   3rd Qu.:5601   3rd Qu.:17.629  
##  Max.   :100.00   Max.   :3324.0   Max.   :7712   Max.   :55.328  
##                                                                   
##     english            read            math          testscr     
##  Min.   : 0.000   Min.   :604.5   Min.   :605.4   Min.   :407.8  
##  1st Qu.: 1.941   1st Qu.:640.4   1st Qu.:639.4   1st Qu.:434.4  
##  Median : 8.778   Median :655.8   Median :652.4   Median :440.6  
##  Mean   :15.768   Mean   :655.0   Mean   :653.3   Mean   :441.4  
##  3rd Qu.:22.970   3rd Qu.:668.7   3rd Qu.:665.8   3rd Qu.:447.8  
##  Max.   :85.540   Max.   :704.0   Max.   :709.5   Max.   :472.7  
##                                                                  
##       str       
##  Min.   :14.00  
##  1st Qu.:18.58  
##  Median :19.72  
##  Mean   :19.64  
##  3rd Qu.:20.87  
##  Max.   :25.80  
## 
missing_counts <- sapply(df, function(x) sum(is.na(x)))
missing_counts[missing_counts > 0]
## named integer(0)

2.1 Distribution of overall test scores

# Improved histogram: better bin width + mean/median reference lines
mean_ts <- mean(df$testscr, na.rm = TRUE)
med_ts  <- median(df$testscr, na.rm = TRUE)
binw    <- 2  # ~2-point bins works well for this score range

ggplot(df, aes(x = testscr)) +
  geom_histogram(binwidth = binw) +
  geom_density(aes(y = after_stat(count) * binw), linewidth = 0.6) +
  geom_vline(xintercept = mean_ts, linetype = "dashed") +
  geom_vline(xintercept = med_ts,  linetype = "dotted") +
  labs(
    title = "Distribution of Overall Test Scores (testscr)",
    subtitle = "Histogram with density overlay; dashed = mean, dotted = median",
    x = "Overall Test Score (Avg of English/Read/Math)",
    y = "Number of districts"
  ) +
  theme_minimal()

2.2 Income distribution

# Show raw income and log-income side-by-side (log helps with skew)
p_income <- ggplot(df, aes(x = income)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Income distribution (raw)",
    x = "District income",
    y = "Count"
  ) +
  theme_minimal()

p_logincome <- ggplot(df, aes(x = log(income))) +
  geom_histogram(bins = 30) +
  labs(
    title = "Income distribution (log-transformed)",
    x = "log(Income)",
    y = "Count"
  ) +
  theme_minimal()

p_income

p_logincome

3) Advanced Visualizations

3.1 Layered plot (points + regression + CI)

ggplot(df, aes(x = income, y = testscr)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_y_continuous(limits = range(df$testscr, na.rm = TRUE)) +
  labs(
    title = "Test Score vs Income (Layered Visualization)",
    subtitle = "Points + linear regression + confidence interval",
    x = "Income",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.2 Facets (small multiples) by grade range

ggplot(df, aes(x = str, y = testscr)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ grades) +
  labs(
    title = "Test Score vs Student-Teacher Ratio (Faceted)",
    subtitle = "Small multiples by grade range",
    x = "Student-Teacher Ratio (students/teachers)",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.3 Multivariate plot (GGally ggpairs)

vars <- c("testscr","income","str","expenditure","lunch","calworks","computer")
vars <- vars[vars %in% names(df)]  # safety
GGally::ggpairs(df[, vars], aes(alpha = 0.35)) + theme_minimal()

3.4 Correlation heatmap (numeric variables)

num_df <- df[, sapply(df, is.numeric)]
corr <- cor(num_df, use = "pairwise.complete.obs")

ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 2.5
) +
  labs(title = "Correlation Heatmap (Numeric Variables)")

3.5 Statistical transformation (log income)

ggplot(df, aes(x = log(income), y = testscr)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Test Score vs log(Income)",
    subtitle = "Log transform reduces skewness and can stabilize the relationship",
    x = "log(Income)",
    y = "Overall Test Score (testscr)"
  ) +
  theme_minimal()

3.6 Parallel coordinates (GGally ggparcoord)

# Sample to keep the parallel coordinates readable
d_sample <- df %>%
  sample_n(250) %>%
  select(testscr, income, str, expenditure, lunch, calworks, computer, grades)

p_par <- GGally::ggparcoord(
  data = d_sample,
  columns = 1:7,
  groupColumn = 8,
  scale = "uniminmax",
  alphaLines = 0.35
) +
  labs(
    title = "Parallel Coordinates (CASchools Sample)",
    subtitle = "Multivariate comparison by grade range (scaled to [0,1])",
    x = "Variables",
    y = "Scaled value"
  ) +
  theme_minimal()

p_par

3.7 Interactive visualization (Plotly with filtering) + save HTML

# What is interactive?
# This section creates a Plotly widget. In the HTML output, you can hover, zoom, pan,
# and use the dropdown to filter grade ranges.
#
# Word output cannot embed the interactive widget. That is why we also save a separate
# HTML file (W4_CASchools_InteractivePlot.html).

# Rich hover text
df$hover <- paste0(
  "School: ", df$school,
  "<br>County: ", df$county,
  "<br>Grades: ", df$grades,
  "<br>Income: ", round(df$income, 2),
  "<br>Test score: ", round(df$testscr, 1),
  "<br>Students: ", df$students,
  "<br>STR: ", round(df$str, 2),
  "<br>Lunch (%): ", round(df$lunch, 1),
  "<br>Expenditure: ", round(df$expenditure, 0)
)

grade_levels <- levels(df$grades)

p_int <- plot_ly()

# One trace per grade range (enables dropdown filtering)
for (g in grade_levels) {
  d_g <- dplyr::filter(df, grades == g)
  p_int <- add_trace(
    p_int,
    data = d_g,
    x = ~income,
    y = ~testscr,
    type = "scatter",
    mode = "markers",
    name = paste("Grades", g),
    text = ~hover,
    hoverinfo = "text",
    marker = list(
      sizemode = "area",
      sizeref = 2 * max(d_g$students) / (30^2),
      size = ~students
    )
  )
}

# Dropdown buttons: "All" + one button per grade range
buttons <- list(
  list(
    method = "update",
    args = list(list(visible = rep(TRUE, length(grade_levels)))),
    label = "All grades"
  )
)

for (i in seq_along(grade_levels)) {
  vis <- rep(FALSE, length(grade_levels))
  vis[i] <- TRUE
  buttons[[length(buttons) + 1]] <- list(
    method = "update",
    args = list(list(visible = vis)),
    label = paste("Only", grade_levels[i])
  )
}

p_int <- layout(
  p_int,
  title = "Interactive: Test Score vs Income (filterable by grade range)",
  xaxis = list(title = "Income"),
  yaxis = list(title = "Overall Test Score (testscr)"),
  updatemenus = list(list(
    type = "dropdown",
    x = 0.02, y = 1.12,
    buttons = buttons
  )),
  legend = list(orientation = "h", x = 0, y = -0.15)
)

# Save interactive HTML (submit this alongside the Word doc if allowed)
saveWidget(p_int, "W4_CASchools_InteractivePlot.html", selfcontained = TRUE)

p_int

4) Quick numeric support for insights

cor(df$income, df$testscr, use = "complete.obs")
## [1] 0.7220618
cor(df$str, df$testscr, use = "complete.obs")
## [1] -0.1742427