Read in the Proposal Data

#Read in the Proposed Phasing Schedule
proposedPhasingSchedule <- read.csv2(
  file = "ProposedPhasingSchedule.csv",
  header = TRUE,
  sep = ","
)

head(proposedPhasingSchedule)
##     Housing Total TPV Transfer.out Transfer.in Attrition Remain Year
## 1 Tidewater   618 173           83           0        36    327 2019
## 2     Young   746   0            0          60        60    746 2019
## 3   Calvert   310  36           17          39        22    274 2019
## 4 Tidewater   327 124           75           0        20    109 2020
## 5     Young   746   0            0          60        60    746 2020
## 6   Calvert   274   0            0          22        22    274 2020

Read in the 2009 Study Data

#Read in the 2009 Attrition Study
attritionRates2009raw <- read.csv2(
  file = "LIPHAttritionRate2009Cohort.csv",
  header = TRUE,
  sep = ",",
  na.strings = "N/A"
)

head(attritionRates2009raw)
##                Stat X2009  X2010  X2011  X2012  X2013  X2014  X2015
## 1 StillLivingInLIPH   353    323    271    251    226    199    171
## 2         Attrition    NA     30     52     20     25     27     28
## 3     AttritionRate    NA 0.0850 0.1610 0.0738 0.0996 0.1195 0.1410

Clean the Proposal Data for Yearly Attrition Rate

#library for manipulating data frames
require(dplyr)

#clean the data to get the attrition rate each year
projectedAttritionRates = proposedPhasingSchedule %>%
  group_by(Year) %>% #group for statistics across all housing
    mutate(TotalAll = sum(Total)) %>% #get total across all housing
    mutate(AttritionAll = sum(Attrition)) #get attrition across all housing

head(projectedAttritionRates)
## # A tibble: 6 x 10
## # Groups:   Year [2]
##     Housing Total   TPV Transfer.out Transfer.in Attrition Remain  Year
##      <fctr> <int> <int>        <int>       <int>     <int>  <int> <int>
## 1 Tidewater   618   173           83           0        36    327  2019
## 2     Young   746     0            0          60        60    746  2019
## 3   Calvert   310    36           17          39        22    274  2019
## 4 Tidewater   327   124           75           0        20    109  2020
## 5     Young   746     0            0          60        60    746  2020
## 6   Calvert   274     0            0          22        22    274  2020
## # ... with 2 more variables: TotalAll <int>, AttritionAll <int>
projectedAttritionRates <- projectedAttritionRates %>%
    distinct(Year, TotalAll, AttritionAll) %>% #remove duplicates
  ungroup() %>% #we have the statistics we need
  mutate(AttritionRatePercentage = round(AttritionAll/TotalAll * 100)) %>% #calc attrition rate
  select(Year, TotalAll, AttritionAll, AttritionRatePercentage) #we only need these columns

head(projectedAttritionRates)
## # A tibble: 6 x 4
##    Year TotalAll AttritionAll AttritionRatePercentage
##   <int>    <int>        <int>                   <dbl>
## 1  2019     1674          118                       7
## 2  2020     1347          102                       8
## 3  2021     1129           70                       6
## 4  2022      852           49                       6
## 5  2023      600           29                       5
## 6  2024      386            5                       1

Plot the Proposal Yearly Attrition Rate

#library for plotting data
require(ggplot2)

#plot the attrition rate each year
ggplot(projectedAttritionRates, aes(x = Year, y = AttritionRatePercentage)) + 
  geom_line(size = 2) + 
  labs(title = "Attrition Rate, Year-to-Year ")

Clean the 2009 Study Data for Yearly Attrition Rate

#library for changing the shape of the data
require(reshape2)

#go from column per year to one column for year
attritionRates2009 <- attritionRates2009raw %>% 
  melt(id.vars = c("Stat")) %>% #combine all columns except stat
  rename(Year = variable) #change the variable column to year

head(attritionRates2009)
##                Stat  Year  value
## 1 StillLivingInLIPH X2009    353
## 2         Attrition X2009   <NA>
## 3     AttritionRate X2009   <NA>
## 4 StillLivingInLIPH X2010    323
## 5         Attrition X2010     30
## 6     AttritionRate X2010 0.0850
#have one column for each measurement each year
attritionRates2009 <- attritionRates2009 %>%
  dcast(Year ~ Stat) #make a column for each type of Stat that drives Year

head(attritionRates2009)
##    Year Attrition AttritionRate StillLivingInLIPH
## 1 X2009      <NA>          <NA>               353
## 2 X2010        30        0.0850               323
## 3 X2011        52        0.1610               271
## 4 X2012        20        0.0738               251
## 5 X2013        25        0.0996               226
## 6 X2014        27        0.1195               199
#clean to get the attrition rate each year
attritionRates2009 <- attritionRates2009 %>%
  filter(!is.na((AttritionRate))) %>% #remove the column there was no attrition
  mutate(Year = substring(Year, 2)) %>% #get rid of the X in front of the year
  mutate(Year = as.numeric(Year)) %>% #convert Year to a number
  mutate(AttritionRate = as.numeric(AttritionRate)) %>% #convert attrition rate to a number
  select(Year, AttritionRate) #get only year and attrition rate

head(attritionRates2009)
##   Year AttritionRate
## 1 2010        0.0850
## 2 2011        0.1610
## 3 2012        0.0738
## 4 2013        0.0996
## 5 2014        0.1195
## 6 2015        0.1410

Combine Proposal Data and 2009 Study Data

#standardize the 2009 study columns so it can be combined
attritionRates2009 <- attritionRates2009 %>% 
  mutate(AttritionRatePercentage = round(AttritionRate * 100)) %>% #round the attrition rate
  mutate(YearSinceVoucher = Year - 2009) %>% #just count years instead of the specific year
  mutate(DataSet = "2009Study") %>% #tag these findings as coming from the 2009 study
  select(YearSinceVoucher, AttritionRatePercentage, DataSet) #only use these columns

#standardize the proposal columns so it can be combined
projectedAttritionRates <- projectedAttritionRates %>%
  mutate(YearSinceVoucher = Year - 2018) %>% #just count years instead of the specific year
  mutate(DataSet = "ProjectedAttritionRate") %>% #tag this as coming from the proposal
  select(YearSinceVoucher, AttritionRatePercentage, DataSet) #only use these columns

head(attritionRates2009)
##   YearSinceVoucher AttritionRatePercentage   DataSet
## 1                1                       8 2009Study
## 2                2                      16 2009Study
## 3                3                       7 2009Study
## 4                4                      10 2009Study
## 5                5                      12 2009Study
## 6                6                      14 2009Study
head(projectedAttritionRates)
## # A tibble: 6 x 3
##   YearSinceVoucher AttritionRatePercentage                DataSet
##              <dbl>                   <dbl>                  <chr>
## 1                1                       7 ProjectedAttritionRate
## 2                2                       8 ProjectedAttritionRate
## 3                3                       6 ProjectedAttritionRate
## 4                4                       6 ProjectedAttritionRate
## 5                5                       5 ProjectedAttritionRate
## 6                6                       1 ProjectedAttritionRate
#combined the dataframes into one
attritionRatesCompared = rbind(attritionRates2009, projectedAttritionRates)

head(attritionRatesCompared)
##   YearSinceVoucher AttritionRatePercentage   DataSet
## 1                1                       8 2009Study
## 2                2                      16 2009Study
## 3                3                       7 2009Study
## 4                4                      10 2009Study
## 5                5                      12 2009Study
## 6                6                      14 2009Study

Plot Proposal and 2009 Study Yearly Attrition Rate

#plot the attrition rates each year, using color for different data sets
ggplot(
    attritionRatesCompared, 
    aes(x = YearSinceVoucher, y = AttritionRatePercentage, color = DataSet)
  ) + 
  geom_line(size = 2) + 
  labs(title = "Attrition Rates Compared Year-to-Year ")

Get a Linear Regression of 2009 Study Yearly Attrition Rate

#create a linear regression of the 2009 attrition rates
attritionRates2009lm <- lm(AttritionRatePercentage ~ YearSinceVoucher, attritionRates2009)

summary(attritionRates2009lm)
## 
## Call:
## lm(formula = AttritionRatePercentage ~ YearSinceVoucher, data = attritionRates2009)
## 
## Residuals:
##        1        2        3        4        5        6 
## -1.66667  5.73333 -3.86667 -1.46667 -0.06667  1.33333 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        9.0667     3.4374   2.638   0.0577 .
## YearSinceVoucher   0.6000     0.8826   0.680   0.5339  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.692 on 4 degrees of freedom
## Multiple R-squared:  0.1036, Adjusted R-squared:  -0.1205 
## F-statistic: 0.4621 on 1 and 4 DF,  p-value: 0.5339

Add 2009 Study Regression Data

# library for explicitly creating data frames
require(tibble)

#create a data frame with the predicted values from the linear regression
averagedAttritionRates2009 <- tibble(
  YearSinceVoucher = attritionRates2009$YearSinceVoucher, #year, from the 2009 data
  AttritionRatePercentage = predict(attritionRates2009lm, attritionRates2009), #predicted value
  DataSet = "2009Study", #tag as coming from the 2009 study
  DataType = "Averaged" #tag as a regression, not the actual data
)

#add the regression values to our dataframe
attritionRatesCompared <- attritionRatesCompared %>%
  mutate(DataType = "Literal") %>% #tag all data not from the regression as Literal
  rbind(averagedAttritionRates2009) %>% #add the regression data
  mutate(DataType = factor(DataType, levels = c("Literal", "Averaged"))) #reverse label order

head(attritionRatesCompared)
##   YearSinceVoucher AttritionRatePercentage   DataSet DataType
## 1                1                       8 2009Study  Literal
## 2                2                      16 2009Study  Literal
## 3                3                       7 2009Study  Literal
## 4                4                      10 2009Study  Literal
## 5                5                      12 2009Study  Literal
## 6                6                      14 2009Study  Literal

Plot Proposal Attrition Rate, 2009 Study Attrition Rate, and 2009 Study Regression Attrition Rate

#plot the attrition rates from the proposal and the study and the regression
ggplot(
    attritionRatesCompared, 
    aes(
      x = YearSinceVoucher, 
      y = AttritionRatePercentage, 
      color = DataSet, 
      linetype = DataType
    )
  ) + 
  geom_line(size = 2) + 
  labs(
    title = "Attrition Rates Compared", 
    x = "Years with Voucher", 
    y = "Attrition Rate (Percentage)"
  ) +
  scale_color_manual(labels = c("2009 Study", "Proposal Estimation"), values = c("red", "blue")) + 
  scale_linetype_discrete(guide = FALSE)