# Make data frame
pbs <- PBS
# Define function to give mean value of target col based on key values
tsmean <- function(ts, target, keycols, keyvals) {
stopifnot(length(keycols) == length(keyvals))
filtered <- ts
for (i in seq_along(keycols)) {
filtered <- filtered[filtered[[keycols[i]]] == keyvals[i], ]
}
return (mean(filtered[[target]], na.rm = TRUE))
}
# Define set of key values (cartesian product of ATC1 and ATC2)
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)
feats <- data.frame()
# Loop through key values, finding mean of target
rown <- 1
for (i in 1:2) {
for (j in 1:2) {
for (k in 1:15) {
for (l in 1:84) {
if (nrow(pbs[pbs$Concession == key1[i] & pbs$Type == key2[j] & pbs$ATC1 == key3[k] & pbs$ATC2 == key4[l], ]) > 0) {
feats[rown, 1] <- key1[i]
feats[rown, 2] <- key2[j]
feats[rown, 3] <- key3[k]
feats[rown, 4] <- key4[l]
feats[rown, 5] <- tsmean(
pbs,
"Scripts",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
feats[rown, 6] <- tsmean(
pbs,
"Cost",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
rown <- rown + 1
}
}
}
}
}
# Clean mean dataframe, return keys with largest mean
colnames(feats) <- c("Concession","Type", "ATC1", "ATC2", "Mean scripts", "Mean cost")
feats <- na.omit(feats)Chapter 4 Homework
1.1. Write function to find mean of a time series, plot series with greatest mean
Series with greatest mean Scripts:
feats[feats$`Mean scripts` == max(feats$`Mean scripts`), ] Concession Type ATC1 ATC2 Mean scripts Mean cost
46 Concessional Co-payments J J01 831727.1 7725842
Series with greatest mean Cost:
feats[feats$`Mean cost` == max(feats$`Mean cost`), ] Concession Type ATC1 ATC2 Mean scripts Mean cost
26 Concessional Co-payments C C10 448340.4 24845501
I’m not exactly sure which series to plot, the one with maximal mean Scripts or the one with maximal mean Cost. I’m going to assume Cost, but just know that the defined function that I made can do any numeric column you want:
filtered <- pbs[pbs$Concession == "Concessional" & pbs$Type == "Co-payments" & pbs$ATC1 == "C" & pbs$ATC2 == "C10", ]
filtered |> autoplot(.vars = Cost) +
labs(title = "Cost ($AUD) over Time (with maximal mean)") +
theme_bw()Above is a plot of the time series with the greatest average cost. This series is the one with Concession == "Concessional", Type == "Co-payments, ATC1 == "C", and ATC2 == "C10".
1.2. Do the same for standard deviation, plot series with lowest
# Define function to give standard deviation of target col based on key values
tsstdev <- function(ts, target, keycols, keyvals) {
stopifnot(length(keycols) == length(keyvals))
mean <- tsmean(ts, target, keycols, keyvals)
filtered <- ts
for (i in seq_along(keycols)) {
filtered <- filtered[filtered[[keycols[i]]] == keyvals[i], ]
}
return (sd(filtered[[target]], na.rm = TRUE))
}
# Define set of key values (cartesian product of ATC1 and ATC2)
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)
# Loop through key values, finding stdev of target
rown <- 1
for (i in 1:2) {
for (j in 1:2) {
for (k in 1:15) {
for (l in 1:84) {
if (nrow(pbs[pbs$Concession == key1[i] & pbs$Type == key2[j] & pbs$ATC1 == key3[k] & pbs$ATC2 == key4[l], ]) > 0) {
feats[rown, 7] <- tsstdev(
pbs,
"Scripts",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
feats[rown, 8] <- tsstdev(
pbs,
"Cost",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
rown <- rown + 1
}
}
}
}
}
# Clean feats dataframe
colnames(feats)[7] <- "StDev. scripts"
colnames(feats)[8] <- "StDev. cost"
feats <- na.omit(feats)Series with lowest standard deviation of Scripts:
feats[feats$`StDev. scripts` == min(feats$`StDev. scripts`), ] Concession Type ATC1 ATC2 Mean scripts Mean cost StDev. scripts
238 General Co-payments R R 0 0 0
243 General Co-payments S S 0 0 0
StDev. cost
238 0
243 0
Series with lowest standard deviation of Cost:
feats[feats$`StDev. scripts` == min(feats$`StDev. scripts`), ] Concession Type ATC1 ATC2 Mean scripts Mean cost StDev. scripts
238 General Co-payments R R 0 0 0
243 General Co-payments S S 0 0 0
StDev. cost
238 0
243 0
filtered <- pbs[pbs$Concession == "General" & pbs$Type == "Co-payments" & pbs$ATC1 == "R" & pbs$ATC2 == "R", ]
filtered |> autoplot(.vars = Cost) +
labs(title = "Cost ($AUD) over Time (with minimal StDev.)") +
theme_bw()This is unfortunate, the series with the lowest standard deviation of Cost (the one with Concession == "General", Type == "Co-payments, ATC1 == "R", and ATC2 == "R") also has no nonzero data! Maybe this intentional, maybe not. If we impose a “nonzero” restriction on the data, looking only at series with whole observations in all 216 months:
# Filter out only "complete" time series
wholes <- pbs |>
group_by(Concession, Type, ATC1, ATC2) |>
filter(n() > 200) |>
ungroup()
# More looping
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)
feats2 <- data.frame()
# Loop through key values, finding stdev of target
rown <- 1
for (i in 1:2) {
for (j in 1:2) {
for (k in 1:15) {
for (l in 1:84) {
if (nrow(wholes[wholes$Concession == key1[i] & wholes$Type == key2[j] & wholes$ATC1 == key3[k] & wholes$ATC2 == key4[l], ]) > 0) {
feats2[rown, 1] <- key1[i]
feats2[rown, 2] <- key2[j]
feats2[rown, 3] <- key3[k]
feats2[rown, 4] <- key4[l]
feats2[rown, 5] <- tsstdev(
wholes,
"Scripts",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
feats2[rown, 6] <- tsstdev(
wholes,
"Cost",
c("Concession", "Type", "ATC1", "ATC2"),
c(key1[i], key2[j], key3[k], key4[l])
)
rown <- rown + 1
}
}
}
}
}
# Clean feats dataframe
colnames(feats2)[5] <- "StDev. scripts"
colnames(feats2)[6] <- "StDev. cost"
feats2 <- na.omit(feats2)Series with lowest standard deviation of Cost:
feats2[feats2$`StDev. scripts` == min(feats2$`StDev. scripts`), ] V1 V2 V3 V4 StDev. scripts StDev. cost
232 General Co-payments S S02 0.426898 15.63282
filtered <- wholes[wholes$Concession == "General" & wholes$Type == "Co-payments" & wholes$ATC1 == "S" & wholes$ATC2 == "S02", ]
filtered |> autoplot(.vars = Cost) +
labs(title = "Cost ($AUD) over Time (with minimal StDev.)") +
theme_bw()Upon further examining this time series (with Concession == "General", Type == "Co-payments, ATC1 == "S", and ATC2 == "S02"), most of the values for Cost are just 0:
[1] 3.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[11] 2.00 1.00 0.00 0.00 0.00 8.91 0.00 0.00 0.00 0.00
[21] 0.00 10.59 0.00 6.89 0.00 11.85 0.00 0.00 0.00 0.00
[31] 0.00 38.53 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[41] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[51] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.57 0.00 0.00
[61] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.19 0.00 0.00
[71] 1.19 0.00 7.79 0.00 0.00 0.00 4.62 0.00 0.00 0.00
[81] 0.00 0.00 0.00 1.24 0.00 0.00 96.34 0.00 0.00 0.00
[91] 4.34 8.08 0.00 4.04 0.00 0.00 0.00 0.00 0.00 0.00
[101] 0.00 0.99 0.99 0.83 0.00 0.00 0.00 0.00 0.53 0.00
[111] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 10.80
[121] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.98 0.00
[131] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[141] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[151] 0.00 0.00 0.00 2.96 0.00 0.00 0.00 0.00 0.00 0.00
[161] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
[171] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 0.00
[181] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 22.00 0.00
[191] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 12.00 11.00 196.00
[201] 0.00 0.00 1.00 11.00
Not sure why, but it is almost certainly explainable with more knowledge of the data.