Chapter 4 Homework

Author

Griffin Lessinger

1.1. Write function to find mean of a time series, plot series with greatest mean

# Make data frame
pbs <- PBS

# Define function to give mean value of target col based on key values
tsmean <- function(ts, target, keycols, keyvals) {
  stopifnot(length(keycols) == length(keyvals))

  filtered <- ts
  for (i in seq_along(keycols)) {
    filtered <- filtered[filtered[[keycols[i]]] == keyvals[i], ]
  }
  
  return (mean(filtered[[target]], na.rm = TRUE))
}

# Define set of key values (cartesian product of ATC1 and ATC2)
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)
feats <- data.frame()

# Loop through key values, finding mean of target
rown <- 1
for (i in 1:2) {
  for (j in 1:2) {
    for (k in 1:15) {
      for (l in 1:84) {
        
        if (nrow(pbs[pbs$Concession == key1[i] & pbs$Type == key2[j] & pbs$ATC1 == key3[k] & pbs$ATC2 == key4[l], ]) > 0) {
          feats[rown, 1] <- key1[i]
          feats[rown, 2] <- key2[j]
          feats[rown, 3] <- key3[k]
          feats[rown, 4] <- key4[l]
          
          feats[rown, 5] <- tsmean(
            pbs,
            "Scripts",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          feats[rown, 6] <- tsmean(
            pbs,
            "Cost",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          
          rown <- rown + 1
        }
      }
    }
  }
}

# Clean mean dataframe, return keys with largest mean
colnames(feats) <- c("Concession","Type", "ATC1", "ATC2", "Mean scripts", "Mean cost")
feats <- na.omit(feats)

Series with greatest mean Scripts:

feats[feats$`Mean scripts` == max(feats$`Mean scripts`), ]
     Concession        Type ATC1 ATC2 Mean scripts Mean cost
46 Concessional Co-payments    J  J01     831727.1   7725842

Series with greatest mean Cost:

feats[feats$`Mean cost` == max(feats$`Mean cost`), ]
     Concession        Type ATC1 ATC2 Mean scripts Mean cost
26 Concessional Co-payments    C  C10     448340.4  24845501

I’m not exactly sure which series to plot, the one with maximal mean Scripts or the one with maximal mean Cost. I’m going to assume Cost, but just know that the defined function that I made can do any numeric column you want:

filtered <- pbs[pbs$Concession == "Concessional" & pbs$Type == "Co-payments" & pbs$ATC1 == "C" & pbs$ATC2 == "C10", ]

filtered |> autoplot(.vars = Cost) + 
  labs(title = "Cost ($AUD) over Time (with maximal mean)") +
  theme_bw()

Above is a plot of the time series with the greatest average cost. This series is the one with Concession == "Concessional", Type == "Co-payments, ATC1 == "C", and ATC2 == "C10".

1.2. Do the same for standard deviation, plot series with lowest

# Define function to give standard deviation of target col based on key values
tsstdev <- function(ts, target, keycols, keyvals) {
  stopifnot(length(keycols) == length(keyvals))
  
  mean <- tsmean(ts, target, keycols, keyvals)
  
  filtered <- ts
  for (i in seq_along(keycols)) {
    filtered <- filtered[filtered[[keycols[i]]] == keyvals[i], ]
  }
  return (sd(filtered[[target]], na.rm = TRUE))
}

# Define set of key values (cartesian product of ATC1 and ATC2)
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)

# Loop through key values, finding stdev of target
rown <- 1
for (i in 1:2) {
  for (j in 1:2) {
    for (k in 1:15) {
      for (l in 1:84) {
        
        if (nrow(pbs[pbs$Concession == key1[i] & pbs$Type == key2[j] & pbs$ATC1 == key3[k] & pbs$ATC2 == key4[l], ]) > 0) {
          feats[rown, 7] <- tsstdev(
            pbs,
            "Scripts",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          feats[rown, 8] <- tsstdev(
            pbs,
            "Cost",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          rown <- rown + 1
        }
      }
    }
  }
}

# Clean feats dataframe
colnames(feats)[7] <- "StDev. scripts"
colnames(feats)[8] <- "StDev. cost"
feats <- na.omit(feats)

Series with lowest standard deviation of Scripts:

feats[feats$`StDev. scripts` == min(feats$`StDev. scripts`), ]
    Concession        Type ATC1 ATC2 Mean scripts Mean cost StDev. scripts
238    General Co-payments    R    R            0         0              0
243    General Co-payments    S    S            0         0              0
    StDev. cost
238           0
243           0

Series with lowest standard deviation of Cost:

feats[feats$`StDev. scripts` == min(feats$`StDev. scripts`), ]
    Concession        Type ATC1 ATC2 Mean scripts Mean cost StDev. scripts
238    General Co-payments    R    R            0         0              0
243    General Co-payments    S    S            0         0              0
    StDev. cost
238           0
243           0
filtered <- pbs[pbs$Concession == "General" & pbs$Type == "Co-payments" & pbs$ATC1 == "R" & pbs$ATC2 == "R", ]

filtered |> autoplot(.vars = Cost) + 
  labs(title = "Cost ($AUD) over Time (with minimal StDev.)") +
  theme_bw()

This is unfortunate, the series with the lowest standard deviation of Cost (the one with Concession == "General", Type == "Co-payments, ATC1 == "R", and ATC2 == "R") also has no nonzero data! Maybe this intentional, maybe not. If we impose a “nonzero” restriction on the data, looking only at series with whole observations in all 216 months:

# Filter out only "complete" time series
wholes <- pbs |>
  group_by(Concession, Type, ATC1, ATC2) |>
  filter(n() > 200) |>
  ungroup()

# More looping
key1 <- unique(pbs$Concession)
key2 <- unique(pbs$Type)
key3 <- unique(pbs$ATC1)
key4 <- unique(pbs$ATC2)
feats2 <- data.frame()

# Loop through key values, finding stdev of target
rown <- 1
for (i in 1:2) {
  for (j in 1:2) {
    for (k in 1:15) {
      for (l in 1:84) {
        
        if (nrow(wholes[wholes$Concession == key1[i] & wholes$Type == key2[j] & wholes$ATC1 == key3[k] & wholes$ATC2 == key4[l], ]) > 0) {
          feats2[rown, 1] <- key1[i]
          feats2[rown, 2] <- key2[j]
          feats2[rown, 3] <- key3[k]
          feats2[rown, 4] <- key4[l]
          
          feats2[rown, 5] <- tsstdev(
            wholes,
            "Scripts",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          feats2[rown, 6] <- tsstdev(
            wholes,
            "Cost",
            c("Concession", "Type", "ATC1", "ATC2"),
            c(key1[i], key2[j], key3[k], key4[l])
          )
          rown <- rown + 1
        }
      }
    }
  }
}

# Clean feats dataframe
colnames(feats2)[5] <- "StDev. scripts"
colnames(feats2)[6] <- "StDev. cost"
feats2 <- na.omit(feats2)

Series with lowest standard deviation of Cost:

feats2[feats2$`StDev. scripts` == min(feats2$`StDev. scripts`), ]
         V1          V2 V3  V4 StDev. scripts StDev. cost
232 General Co-payments  S S02       0.426898    15.63282
filtered <- wholes[wholes$Concession == "General" & wholes$Type == "Co-payments" & wholes$ATC1 == "S" & wholes$ATC2 == "S02", ]

filtered |> autoplot(.vars = Cost) + 
  labs(title = "Cost ($AUD) over Time (with minimal StDev.)") +
  theme_bw()

Upon further examining this time series (with Concession == "General", Type == "Co-payments, ATC1 == "S", and ATC2 == "S02"), most of the values for Cost are just 0:

  [1]   3.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
 [11]   2.00   1.00   0.00   0.00   0.00   8.91   0.00   0.00   0.00   0.00
 [21]   0.00  10.59   0.00   6.89   0.00  11.85   0.00   0.00   0.00   0.00
 [31]   0.00  38.53   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
 [41]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
 [51]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   1.57   0.00   0.00
 [61]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   1.19   0.00   0.00
 [71]   1.19   0.00   7.79   0.00   0.00   0.00   4.62   0.00   0.00   0.00
 [81]   0.00   0.00   0.00   1.24   0.00   0.00  96.34   0.00   0.00   0.00
 [91]   4.34   8.08   0.00   4.04   0.00   0.00   0.00   0.00   0.00   0.00
[101]   0.00   0.99   0.99   0.83   0.00   0.00   0.00   0.00   0.53   0.00
[111]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00  10.80
[121]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.98   0.00
[131]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
[141]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
[151]   0.00   0.00   0.00   2.96   0.00   0.00   0.00   0.00   0.00   0.00
[161]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
[171]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   3.00   0.00
[181]   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00  22.00   0.00
[191]   0.00   0.00   0.00   0.00   0.00   0.00   0.00  12.00  11.00 196.00
[201]   0.00   0.00   1.00  11.00

Not sure why, but it is almost certainly explainable with more knowledge of the data.