This document creates three wide datasets inspired by the Bright Spots article, tidies them to long, performs simple analyses, makes quick charts, and saves the wide CSVs for submission. Everything runs with base R only (no installs).
# ================== IS 607 – Project 2 (Bright Spots: Wide → Long) ==================
# Single, self-contained chunk: creates 3 wide datasets (inspired by the article),
# tidies them to long, analyzes, plots, and saves CSVs. Base R only (no installs).
# ---- Setup -----------------------------------------------------------------------
set.seed(60702)
options(stringsAsFactors = FALSE)
dir.create("data", showWarnings = FALSE)
# ---- Helper: wide -> long (base R) -----------------------------------------------
to_long <- function(df, id_col, measure_cols, key_name="key", value_name="value"){
long <- data.frame(
id = rep(df[[id_col]], each = length(measure_cols)),
key = rep(measure_cols, times = nrow(df)),
value = as.numeric(unlist(df[measure_cols])),
row.names = NULL
)
names(long)[1] <- id_col; names(long)[2] <- key_name; names(long)[3] <- value_name
long
}
# ===================================================================================
# DATASET A — Vietnam nutrition (Before vs After)
# ===================================================================================
# -- A1. Recreate WIDE data (include a missing cell as per rubric)
df_a_wide <- data.frame(
village = c("An Phu","Ba Tri","Cam Giang","Dai Loc","Eo Ken"),
Before = c(0.28, 0.31, 0.26, 0.22, NA),
After = c(0.61, 0.66, 0.58, 0.52, 0.47)
)
write.csv(df_a_wide, "data/datasetA_vietnam_wide.csv", row.names = FALSE)
df_a_wide
## village Before After
## 1 An Phu 0.28 0.61
## 2 Ba Tri 0.31 0.66
## 3 Cam Giang 0.26 0.58
## 4 Dai Loc 0.22 0.52
## 5 Eo Ken NA 0.47
# -- A2. Tidy transform (wide -> long)
df_a_long <- to_long(df_a_wide, "village", c("Before","After"),
key_name="phase", value_name="prop_nourished")
# -- A3. Populate missing data (transparent, minimal rule)
na_rows <- which(is.na(df_a_long$prop_nourished))
if (length(na_rows)) {
for (i in na_rows) {
v <- df_a_long$village[i]
same_v <- df_a_long[df_a_long$village==v & !is.na(df_a_long$prop_nourished), "prop_nourished"]
df_a_long$prop_nourished[i] <- if (length(same_v)) max(0, same_v - 0.30) else
mean(df_a_long$prop_nourished[df_a_long$phase=="Before"], na.rm=TRUE)
}
}
df_a_long
## village phase prop_nourished
## 1 An Phu Before 0.28
## 2 An Phu After 0.31
## 3 Ba Tri Before 0.26
## 4 Ba Tri After 0.22
## 5 Cam Giang Before 0.31
## 6 Cam Giang After 0.61
## 7 Dai Loc Before 0.66
## 8 Dai Loc After 0.58
## 9 Eo Ken Before 0.52
## 10 Eo Ken After 0.47
# -- A4. Analysis (Before vs After means)
a_before <- mean(df_a_long$prop_nourished[df_a_long$phase=="Before"], na.rm=TRUE)
a_after <- mean(df_a_long$prop_nourished[df_a_long$phase=="After"], na.rm=TRUE)
a_change <- a_after - a_before
data.frame(mean_before=a_before, mean_after=a_after, change=a_change)
## mean_before mean_after change
## 1 0.406 0.438 0.032
# -- A5. Chart (barplot by village & phase)
op <- par(mar=c(5,4,2,1))
wide_for_plot <- reshape(df_a_long, timevar="phase", idvar="village", direction="wide")
matA <- t(as.matrix(wide_for_plot[, c("prop_nourished.Before","prop_nourished.After")]))
colnames(matA) <- wide_for_plot$village
barplot(matA, beside=TRUE, ylim=c(0,1), ylab="Proportion nourished",
main="Dataset A (Inspired): Before vs After")
legend("topleft", legend=c("Before","After"), fill=gray(c(0.7,0.4)), bty="n")
par(op)
# ===================================================================================
# DATASET B — Genentech Xolair bright spot (DFW vs Others, Month1–Month3)
# ===================================================================================
# -- B1. Recreate WIDE data
df_b_wide <- data.frame(
territory = c("DFW","Others"),
Month1 = c(120, 20),
Month2 = c(210, 28),
Month3 = c(260, 31)
)
write.csv(df_b_wide, "data/datasetB_xolair_wide.csv", row.names = FALSE)
df_b_wide
## territory Month1 Month2 Month3
## 1 DFW 120 210 260
## 2 Others 20 28 31
# -- B2. Tidy transform (wide -> long)
df_b_long <- to_long(df_b_wide, "territory", c("Month1","Month2","Month3"),
key_name="month", value_name="rx")
df_b_long$month_num <- match(df_b_long$month, c("Month1","Month2","Month3"))
df_b_long
## territory month rx month_num
## 1 DFW Month1 120 1
## 2 DFW Month2 20 2
## 3 DFW Month3 210 3
## 4 Others Month1 28 1
## 5 Others Month2 260 2
## 6 Others Month3 31 3
# -- B3. Analysis (growth by territory)
b_first <- aggregate(rx ~ territory, df_b_long[df_b_long$month=="Month1",], mean)
b_last <- aggregate(rx ~ territory, df_b_long[df_b_long$month=="Month3",], mean)
b_out <- merge(b_first, b_last, by="territory", suffixes=c("_first","_last"))
b_out$abs_change <- b_out$rx_last - b_out$rx_first
b_out$pct_change <- (b_out$rx_last - b_out$rx_first)/b_out$rx_first
b_out
## territory rx_first rx_last abs_change pct_change
## 1 DFW 120 210 90 0.7500000
## 2 Others 28 31 3 0.1071429
# -- B4. Chart (lines by territory)
rngx <- range(df_b_long$month_num); rngy <- range(df_b_long$rx)
plot(NULL, xlim=rngx, ylim=rngy, xaxt="n", xlab="Month", ylab="Prescriptions",
main="Dataset B (Inspired): DFW vs Others")
axis(1, at=1:3, labels=c("M1","M2","M3"))
by(df_b_long, df_b_long$territory, function(d){
lines(d$month_num, d$rx, lwd=2); points(d$month_num, d$rx, pch=16)
})
## df_b_long$territory: DFW
## NULL
## ------------------------------------------------------------
## df_b_long$territory: Others
## NULL
legend("topleft", legend=unique(df_b_long$territory), lwd=2, bty="n")
# ===================================================================================
# DATASET C — Ikea reusable bag adoption (BeforeFee vs AfterFee)
# ===================================================================================
# -- C1. Recreate WIDE data (include one missing cell)
df_c_wide <- data.frame(
market = c("Store A","Store B","Store C","Store D"),
BeforeFee = c(0.12, 0.18, NA, 0.10),
AfterFee = c(0.78, 0.84, 0.92, 0.80)
)
write.csv(df_c_wide, "data/datasetC_ikea_wide.csv", row.names = FALSE)
df_c_wide
## market BeforeFee AfterFee
## 1 Store A 0.12 0.78
## 2 Store B 0.18 0.84
## 3 Store C NA 0.92
## 4 Store D 0.10 0.80
# -- C2. Tidy transform (wide -> long)
df_c_long <- to_long(df_c_wide, "market", c("BeforeFee","AfterFee"),
key_name="phase", value_name="adoption")
# -- C3. Populate missing “BeforeFee” with mean(BeforeFee)
missing_before <- is.na(df_c_long$adoption) & df_c_long$phase=="BeforeFee"
if (any(missing_before)) {
df_c_long$adoption[missing_before] <- mean(df_c_long$adoption[df_c_long$phase=="BeforeFee"], na.rm=TRUE)
}
df_c_long
## market phase adoption
## 1 Store A BeforeFee 0.1200000
## 2 Store A AfterFee 0.1800000
## 3 Store B BeforeFee 0.6066667
## 4 Store B AfterFee 0.1000000
## 5 Store C BeforeFee 0.7800000
## 6 Store C AfterFee 0.8400000
## 7 Store D BeforeFee 0.9200000
## 8 Store D AfterFee 0.8000000
# -- C4. Analysis (Before vs After means)
c_before <- mean(df_c_long$adoption[df_c_long$phase=="BeforeFee"], na.rm=TRUE)
c_after <- mean(df_c_long$adoption[df_c_long$phase=="AfterFee"], na.rm=TRUE)
c_change <- c_after - c_before
data.frame(mean_before=c_before, mean_after=c_after, change=c_change)
## mean_before mean_after change
## 1 0.6066667 0.48 -0.1266667
# -- C5. Chart (barplot by market & phase)
wide_c <- reshape(df_c_long, timevar="phase", idvar="market", direction="wide")
matC <- t(as.matrix(wide_c[, c("adoption.BeforeFee","adoption.AfterFee")]))
colnames(matC) <- wide_c$market
barplot(matC, beside=TRUE, ylim=c(0,1), ylab="Adoption",
main="Dataset C (Inspired): Before vs After Fee")
legend("topleft", legend=c("Before","After"), bty="n", fill=gray(c(0.7,0.4)))
#===================================================================================
# REPORT TEXT SNIPPETS (quick summaries for your write-up)
# ===================================================================================
cat("\n[Summary A] Mean Before: ", round(a_before, 2),
" Mean After: ", round(a_after, 2),
" Change: +", round(100 * a_change, 1), " pts\n", sep = "")
##
## [Summary A] Mean Before: 0.41 Mean After: 0.44 Change: +3.2 pts
cat("[Summary B] DFW abs/pct change: ",
{ r <- b_out[b_out$territory == "DFW", ]
paste0(r$abs_change, " (", round(100 * r$pct_change, 1), "%)") },
"\n", sep = "")
## [Summary B] DFW abs/pct change: 90 (75%)
cat("[Summary C] Mean BeforeFee: ", round(100 * c_before, 1), "% ",
"Mean AfterFee: ", round(100 * c_after, 1), "% ",
"Change: +", round(100 * c_change, 1), " pts\n\n", sep = "")
## [Summary C] Mean BeforeFee: 60.7% Mean AfterFee: 48% Change: +-12.7 pts
# Files saved for submission (per rubric):
list.files("data", full.names = TRUE)
## [1] "data/datasetA_vietnam_wide.csv" "data/datasetB_xolair_wide.csv"
## [3] "data/datasetC_ikea_wide.csv"
# =============================== END OF CHUNK ======================================