Project 2 — Overview

This document creates three wide datasets inspired by the Bright Spots article, tidies them to long, performs simple analyses, makes quick charts, and saves the wide CSVs for submission. Everything runs with base R only (no installs).

# ================== IS 607 – Project 2 (Bright Spots: Wide → Long) ==================
# Single, self-contained chunk: creates 3 wide datasets (inspired by the article),
# tidies them to long, analyzes, plots, and saves CSVs. Base R only (no installs).

# ---- Setup -----------------------------------------------------------------------
set.seed(60702)
options(stringsAsFactors = FALSE)
dir.create("data", showWarnings = FALSE)

# ---- Helper: wide -> long (base R) -----------------------------------------------
to_long <- function(df, id_col, measure_cols, key_name="key", value_name="value"){
  long <- data.frame(
    id = rep(df[[id_col]], each = length(measure_cols)),
    key = rep(measure_cols, times = nrow(df)),
    value = as.numeric(unlist(df[measure_cols])),
    row.names = NULL
  )
  names(long)[1] <- id_col; names(long)[2] <- key_name; names(long)[3] <- value_name
  long
}

# ===================================================================================
# DATASET A — Vietnam nutrition (Before vs After)
# ===================================================================================

# -- A1. Recreate WIDE data (include a missing cell as per rubric)
df_a_wide <- data.frame(
  village = c("An Phu","Ba Tri","Cam Giang","Dai Loc","Eo Ken"),
  Before  = c(0.28, 0.31, 0.26, 0.22, NA),
  After   = c(0.61, 0.66, 0.58, 0.52, 0.47)
)
write.csv(df_a_wide, "data/datasetA_vietnam_wide.csv", row.names = FALSE)
df_a_wide
##     village Before After
## 1    An Phu   0.28  0.61
## 2    Ba Tri   0.31  0.66
## 3 Cam Giang   0.26  0.58
## 4   Dai Loc   0.22  0.52
## 5    Eo Ken     NA  0.47
 # -- A2. Tidy transform (wide -> long)
df_a_long <- to_long(df_a_wide, "village", c("Before","After"),
                     key_name="phase", value_name="prop_nourished")

# -- A3. Populate missing data (transparent, minimal rule)
na_rows <- which(is.na(df_a_long$prop_nourished))
if (length(na_rows)) {
  for (i in na_rows) {
    v <- df_a_long$village[i]
    same_v <- df_a_long[df_a_long$village==v & !is.na(df_a_long$prop_nourished), "prop_nourished"]
    df_a_long$prop_nourished[i] <- if (length(same_v)) max(0, same_v - 0.30) else
      mean(df_a_long$prop_nourished[df_a_long$phase=="Before"], na.rm=TRUE)
  }
}
df_a_long
##      village  phase prop_nourished
## 1     An Phu Before           0.28
## 2     An Phu  After           0.31
## 3     Ba Tri Before           0.26
## 4     Ba Tri  After           0.22
## 5  Cam Giang Before           0.31
## 6  Cam Giang  After           0.61
## 7    Dai Loc Before           0.66
## 8    Dai Loc  After           0.58
## 9     Eo Ken Before           0.52
## 10    Eo Ken  After           0.47
# -- A4. Analysis (Before vs After means)
a_before <- mean(df_a_long$prop_nourished[df_a_long$phase=="Before"], na.rm=TRUE)
a_after  <- mean(df_a_long$prop_nourished[df_a_long$phase=="After"],  na.rm=TRUE)
a_change <- a_after - a_before
data.frame(mean_before=a_before, mean_after=a_after, change=a_change)
##   mean_before mean_after change
## 1       0.406      0.438  0.032
# -- A5. Chart (barplot by village & phase)
op <- par(mar=c(5,4,2,1))
wide_for_plot <- reshape(df_a_long, timevar="phase", idvar="village", direction="wide")
matA <- t(as.matrix(wide_for_plot[, c("prop_nourished.Before","prop_nourished.After")]))
colnames(matA) <- wide_for_plot$village
barplot(matA, beside=TRUE, ylim=c(0,1), ylab="Proportion nourished",
        main="Dataset A (Inspired): Before vs After")
legend("topleft", legend=c("Before","After"), fill=gray(c(0.7,0.4)), bty="n")

par(op)

# ===================================================================================
# DATASET B — Genentech Xolair bright spot (DFW vs Others, Month1–Month3)
# ===================================================================================

# -- B1. Recreate WIDE data
df_b_wide <- data.frame(
  territory = c("DFW","Others"),
  Month1 = c(120, 20),
  Month2 = c(210, 28),
  Month3 = c(260, 31)
)
write.csv(df_b_wide, "data/datasetB_xolair_wide.csv", row.names = FALSE)
df_b_wide
##   territory Month1 Month2 Month3
## 1       DFW    120    210    260
## 2    Others     20     28     31
# -- B2. Tidy transform (wide -> long)
df_b_long <- to_long(df_b_wide, "territory", c("Month1","Month2","Month3"),
                     key_name="month", value_name="rx")
df_b_long$month_num <- match(df_b_long$month, c("Month1","Month2","Month3"))
df_b_long
##   territory  month  rx month_num
## 1       DFW Month1 120         1
## 2       DFW Month2  20         2
## 3       DFW Month3 210         3
## 4    Others Month1  28         1
## 5    Others Month2 260         2
## 6    Others Month3  31         3
# -- B3. Analysis (growth by territory)
b_first <- aggregate(rx ~ territory, df_b_long[df_b_long$month=="Month1",], mean)
b_last  <- aggregate(rx ~ territory, df_b_long[df_b_long$month=="Month3",], mean)
b_out <- merge(b_first, b_last, by="territory", suffixes=c("_first","_last"))
b_out$abs_change <- b_out$rx_last - b_out$rx_first
b_out$pct_change <- (b_out$rx_last - b_out$rx_first)/b_out$rx_first
b_out
##   territory rx_first rx_last abs_change pct_change
## 1       DFW      120     210         90  0.7500000
## 2    Others       28      31          3  0.1071429
# -- B4. Chart (lines by territory)
rngx <- range(df_b_long$month_num); rngy <- range(df_b_long$rx)
plot(NULL, xlim=rngx, ylim=rngy, xaxt="n", xlab="Month", ylab="Prescriptions",
     main="Dataset B (Inspired): DFW vs Others")
axis(1, at=1:3, labels=c("M1","M2","M3"))
by(df_b_long, df_b_long$territory, function(d){
  lines(d$month_num, d$rx, lwd=2); points(d$month_num, d$rx, pch=16)
})
## df_b_long$territory: DFW
## NULL
## ------------------------------------------------------------ 
## df_b_long$territory: Others
## NULL
legend("topleft", legend=unique(df_b_long$territory), lwd=2, bty="n")

# ===================================================================================
# DATASET C — Ikea reusable bag adoption (BeforeFee vs AfterFee)
# ===================================================================================

# -- C1. Recreate WIDE data (include one missing cell)
df_c_wide <- data.frame(
  market   = c("Store A","Store B","Store C","Store D"),
  BeforeFee = c(0.12, 0.18, NA,   0.10),
  AfterFee  = c(0.78, 0.84, 0.92, 0.80)
)
write.csv(df_c_wide, "data/datasetC_ikea_wide.csv", row.names = FALSE)
df_c_wide
##    market BeforeFee AfterFee
## 1 Store A      0.12     0.78
## 2 Store B      0.18     0.84
## 3 Store C        NA     0.92
## 4 Store D      0.10     0.80
# -- C2. Tidy transform (wide -> long)
df_c_long <- to_long(df_c_wide, "market", c("BeforeFee","AfterFee"),
                     key_name="phase", value_name="adoption")

# -- C3. Populate missing “BeforeFee” with mean(BeforeFee)
missing_before <- is.na(df_c_long$adoption) & df_c_long$phase=="BeforeFee"
if (any(missing_before)) {
  df_c_long$adoption[missing_before] <- mean(df_c_long$adoption[df_c_long$phase=="BeforeFee"], na.rm=TRUE)
}
df_c_long
##    market     phase  adoption
## 1 Store A BeforeFee 0.1200000
## 2 Store A  AfterFee 0.1800000
## 3 Store B BeforeFee 0.6066667
## 4 Store B  AfterFee 0.1000000
## 5 Store C BeforeFee 0.7800000
## 6 Store C  AfterFee 0.8400000
## 7 Store D BeforeFee 0.9200000
## 8 Store D  AfterFee 0.8000000
# -- C4. Analysis (Before vs After means)
c_before <- mean(df_c_long$adoption[df_c_long$phase=="BeforeFee"], na.rm=TRUE)
c_after  <- mean(df_c_long$adoption[df_c_long$phase=="AfterFee"],  na.rm=TRUE)
c_change <- c_after - c_before
data.frame(mean_before=c_before, mean_after=c_after, change=c_change)
##   mean_before mean_after     change
## 1   0.6066667       0.48 -0.1266667
# -- C5. Chart (barplot by market & phase)
wide_c <- reshape(df_c_long, timevar="phase", idvar="market", direction="wide")
matC <- t(as.matrix(wide_c[, c("adoption.BeforeFee","adoption.AfterFee")]))
colnames(matC) <- wide_c$market
barplot(matC, beside=TRUE, ylim=c(0,1), ylab="Adoption",
        main="Dataset C (Inspired): Before vs After Fee")
legend("topleft", legend=c("Before","After"), bty="n", fill=gray(c(0.7,0.4)))

#===================================================================================
# REPORT TEXT SNIPPETS (quick summaries for your write-up)
# ===================================================================================
cat("\n[Summary A] Mean Before: ", round(a_before, 2),
    "  Mean After: ", round(a_after, 2),
    "  Change: +", round(100 * a_change, 1), " pts\n", sep = "")
## 
## [Summary A] Mean Before: 0.41  Mean After: 0.44  Change: +3.2 pts
cat("[Summary B] DFW abs/pct change: ",
    { r <- b_out[b_out$territory == "DFW", ]
    paste0(r$abs_change, " (", round(100 * r$pct_change, 1), "%)") },
    "\n", sep = "")
## [Summary B] DFW abs/pct change: 90 (75%)
cat("[Summary C] Mean BeforeFee: ", round(100 * c_before, 1), "%  ",
   "Mean AfterFee: ", round(100 * c_after, 1), "%  ",
    "Change: +", round(100 * c_change, 1), " pts\n\n", sep = "")
## [Summary C] Mean BeforeFee: 60.7%  Mean AfterFee: 48%  Change: +-12.7 pts
    # Files saved for submission (per rubric):
    list.files("data", full.names = TRUE)
## [1] "data/datasetA_vietnam_wide.csv" "data/datasetB_xolair_wide.csv" 
## [3] "data/datasetC_ikea_wide.csv"
    # =============================== END OF CHUNK ======================================

Conclusion