MEDA Sequence Visualization Workflow

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

This document describes the workflow within R for creating the sequential visualizations shown in the Final Presentation, on Tableau at https://public.tableau.com/profile/cbergman#!/vizhome/bundled_programs1/Sheet1. The goal was to visualize each client’s time sequence for enrolling in programs and making financial assessments and see when clients had “successes”.

First load the data, originally from 12 Excel workbooks, loaded into .Rdata file. This visualization uses 2 demographic tables for clients and households (72 columns), 5 program tables for 5 counseling programs offered to clients, 1 financial assessment table with (102 columns, all columns repeated each date client is assessed.

setwd("C:/Users/bergmc/Documents/cb/volunteer/DataKind/MedaSF/data/CleanedData/DataDive_CleanData")
load("DataKind_DataDive_MEDA_032715_v2_2015-03-26_clean.Rdata")
names(data)

##  [1] "Schema of Record Relationships"  "Clients inc. Demographics"      
##  [3] "Client Household Demographics"   "Universal Financial Assessments"
##  [5] "All Services Provided"           "Staff Time 1-1 Coaching Only"   
##  [7] "BDP Case Records"                "HOP Case Records"               
##  [9] "FINCAP Case Records"             "Tax Case Records"               
## [11] "Workforce Case Records"          "Other Outcomes and Milestones"  
## [13] "Job Placements"

Next, create a big matrix of all client records in all programs. Then look across the whole matrix and count client duplicate entries, which in theory should give the subset of clients who enrolled in more than 1 program. Nonprofits call this “bundling”. One of MEDA’s questions was, “Are participants more successful if they enroll in more than 1 program?”

# ##########################
# Program Data - keep only Contact.ID, Date.of.Most.Recent.Direct.Service, Program.Name, 
#  <any fields you need to calculate success>, success - calculated, success.date - calculated
# Using Enrollment.Date makes more sense but too many missing dates 
# ##########################
# read BDP program data
bdp <- data[[7]]
bdp <- bdp[, c(1, 10, 3, 18)]
names(bdp)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "Startup.Status.Date"

nrow(bdp)

## [1] 1244

# read HOP program data
hop <- data[[8]]
hop <- hop[, c(1, 11, 3, 17, 19)]
names(hop)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "Homeownership.Success.Statuses"    
## [5] "Homeownership.Status.Date"

nrow(hop)

## [1] 2651

# read FINCAP program data
fin <- data[[9]]
fin <- fin[, c(1, 11, 2)]
names(fin)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"

nrow(fin)

## [1] 3119

# read TAX program data
tax <- data[[10]]
tax <- tax[, c(1, 9, 2)]
names(tax)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"

nrow(tax)

## [1] 6667

# read WORKforce program data
work <- data[[11]]
work <- work[, c(1, 9, 2, 18, 19)]
names(work)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "Number.of.Job.Placements"          
## [5] "Most.Recent.Start.Date"

nrow(work)

## [1] 906

Now add the calculated successes and success dates.

#bdp
bdp$success = c(rep("2", nrow(bdp))) #char vector
for(i in 1:nrow(bdp)){
   if(is.na(bdp$Startup.Status.Date[i]) == T){
       bdp$success[i] = "0"
   } else{
       bdp$success[i] = "B"
   }
}
table(bdp$success)  #88/1244 = 7% successes

## 
##    0    B 
## 1156   88

# drop intermediate vars, keep only success
bdp <- bdp[-c(4)]
# add success date 
bdp$success.date = bdp$Date.of.Most.Recent.Direct.Service
names(bdp)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "success"                           
## [5] "success.date"

#hop
hop$success = c(rep("2", nrow(hop))) #char vector
hop$success.date <- as.Date(hop$Date.of.Most.Recent.Direct.Service, origin="1899-12-30")
# convert dates
hop$Homeownership.Status.Date <- as.Date(hop$Homeownership.Status.Date, origin="1899-12-30")
hop$Date.of.Most.Recent.Direct.Service <- as.Date(hop$Date.of.Most.Recent.Direct.Service, origin="1899-12-30")
for(i in 1:nrow(hop)){
   if(hop$Homeownership.Success.Statuses[i] == "Success"){
       hop$success[i] = "H"
       hop$success.date[i] = hop$Homeownership.Status.Date[i]
   } else{
       hop$success[i] = "0"
       hop$success.date[i] = hop$Date.of.Most.Recent.Direct.Service[i]
   }
}
table(hop$success)  #481/2651 = 18% successes

## 
##    0    H 
## 2170  481

# drop intermediate vars, keep only success
hop <- hop[-c(4,5)]
names(hop)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "success"                           
## [5] "success.date"

#work
work$success = c(rep("2", nrow(work)))
work$success.date <- as.Date(work$Date.of.Most.Recent.Direct.Service, origin="1899-12-30")
# convert dates
work$Most.Recent.Start.Date <- as.Date(work$Most.Recent.Start.Date, origin="1899-12-30")
work$Date.of.Most.Recent.Direct.Service <- as.Date(work$Date.of.Most.Recent.Direct.Service, origin="1899-12-30")
for(i in 1:nrow(work)){
   if(work$Number.of.Job.Placements[i] >= 1 ){
       work$success[i] = "W"
       work$success.date[i] = work$Most.Recent.Start.Date[i]
   } else{
       work$success[i] = "0"
       work$success.date[i] = work$Date.of.Most.Recent.Direct.Service[i]
   }
}
table(work$success)  #211/906 = 23% successes

## 
##   0   W 
## 695 211

# drop intermediate vars, keep only success
work <- work[-c(4,5)]
names(work)

## [1] "Contact.ID"                        
## [2] "Date.of.Most.Recent.Direct.Service"
## [3] "Program.Name"                      
## [4] "success"                           
## [5] "success.date"

# pad tax, allways success and success.date = Date.of.Most.Recent.Direct.Service
tax <- transform(tax, success = "1")
tax$success.date = tax$Date.of.Most.Recent.Direct.Service
# pad fin, never success (use assessments instead) and success.date = Date.of.Most.Recent.Direct.Service
fin <- transform(fin, success = "0")
fin$success.date = fin$Date.of.Most.Recent.Direct.Service

Next, bind together into big matrix.

allp <- rbind(as.matrix(hop), as.matrix(bdp))
allp <- rbind(as.matrix(allp), as.matrix(fin))
allp <- rbind(as.matrix(allp), as.matrix(tax))
allp <- rbind(as.matrix(allp), as.matrix(work))
allp <- data.frame(allp)
sapply(allp, mode)

Check for data problems. First check for missing values. Previously I checked each program and saw approx 1000 missing Enrollment dates. Now that I’m using Most Recent Activity Date, it looks like only 22 missing dates.

sapply(allp, function(x) sum(is.na(x)))

##                         Contact.ID Date.of.Most.Recent.Direct.Service 
##                                  0                                  0 
##                       Program.Name                            success 
##                                  0                                  0 
##                       success.date 
##                                 22

Check for duplicate clients by program. Adding them up, about 90 duplicate client entries. Don’t know if that’s accidents such as couple are entered 2x for same program enrollment, bad data, or maybe some people do re-enroll after some time has passed.

ndup <- data.frame(table(bdp$Contact.ID))
bdpidx <- ndup[ndup$Freq > 1,]
ndup <- data.frame(table(hop$Contact.ID))
hopidx <- ndup[ndup$Freq > 1,]
ndup <- data.frame(table(fin$Contact.ID))
finidx <- ndup[ndup$Freq > 1,]
ndup <- data.frame(table(tax$Contact.ID))
workidx <- ndup[ndup$Freq > 1,]
ndup <- data.frame(table(work$Contact.ID))
taxidx <- ndup[ndup$Freq > 1,]
ndup <- data.frame(table(allp$Contact.ID))
nrow(bdpidx)

## [1] 11

nrow(hopidx)

## [1] 21

nrow(finidx)

## [1] 22

nrow(workidx)

## [1] 7

nrow(taxidx)

## [1] 38

Knowing your data errors, go ahead and count # clients who bundled more than 3 programs. Answer is 51.

ndup <- data.frame(table(allp$Contact.ID))
allpidx <- ndup[ndup$Freq > 3,] #high bundles are > 3 since 3 is just "single bundle"
allpidx <- allpidx[with(allpidx, order(-Freq)), ]
nrow(allpidx)

## [1] 51

# limit visualization data to just bundled clients
vis <- allp[allp$Contact.ID %in% allpidx[, c(1)], ]
nrow(vis)

## [1] 210

Next, create a reduced financial assessment. Each financial assessment, by client and date, is given a set of D, I, S, C (Debt, Income, Savings, Credit) measures. Get rid of the 102 columns in that table and replace them with just 4 DISC success indicators. This code uses package dplyr and sits at https://www.dropbox.com/home/DataDive_MEDA/merged_outcome_code?preview=reduced_fin_assess.R.

setwd("C:/Users/bergmc/Documents/cb/volunteer/DataKind/MedaSF/data/CleanedData/DataDive_CleanData")
red = read.csv(file="merged_outcome_datasets/fin_assess_reduced.csv", header=TRUE, sep=",");
# Read off some basic stats
nrow(unique(red[c("Contact.ID")]))  #Unique clients = 1783

## [1] 1783

aggregate(Contact.ID ~ D, data = red, FUN = function(x) length(unique(x))) #D 297/1783 = 15%

##       D Contact.ID
## 1 FALSE       1783
## 2  TRUE        297

aggregate(Contact.ID ~ I, data = red, FUN = function(x) length(unique(x))) #I 109/1783 = 6%

##       I Contact.ID
## 1 FALSE       1783
## 2  TRUE        109

aggregate(Contact.ID ~ S, data = red, FUN = function(x) length(unique(x))) #S 890/1783 = 45%

##                      S Contact.ID
## 1                FALSE        747
## 2 no expenses recorded        327
## 3                 TRUE        890

aggregate(Contact.ID ~ C, data = red, FUN = function(x) length(unique(x))) #C 901/1783 = 46%

##       C Contact.ID
## 1 FALSE        636
## 2  TRUE        901

sapply(red, mode)

##               X      Contact.ID           Order Assessment.Date 
##       "numeric"       "numeric"       "numeric"       "numeric" 
##               S               D               I               C 
##       "numeric"       "logical"       "logical"       "logical"

# notice csv lost leading 00's in front of Contact.ID and dates
red$Contact.ID <- formatC(as.numeric(red$Contact.ID), format='f',width=7, digits=0, flag=0)
red$Assessment.Date <- as.Date(red$Assessment.Date, origin="1899-12-30")
red$Program.Name <- formatC(as.character(red$Program.Name))

Now you’re ready to add in these financial assessments into your big programs matrix. The result matrix will have an entry for every client in every program, whether or not they succeeded in that program and assessments with DISC successes by client and date. Subset result matrix to only those who bundled.

# ##########################
# Create fn for column reordering
# ##########################
movetolast <- function(data, move) {
  data[c(setdiff(names(data), move), move)]
}
red <- transform(red, Program.Name = "")
colnames(red)[4] <- "Date.of.Most.Recent.Direct.Service"
red <- red[ -c(1)] #drop column X
# pad with no success and success.date = Date.of.Most.Recent.Direct.Service
red <- transform(red, success = "0")
red$success.date = red$Date.of.Most.Recent.Direct.Service
red <- movetolast(red, c("success", "success.date", "Order", "D", "I", "S", "C")) #reorder columns
vis <- transform(vis, Order = NA)
vis <- transform(vis, D = NA)
vis <- transform(vis, I = NA)
vis <- transform(vis, S = NA)
vis <- transform(vis, C = NA)
# double check your column names
names(red)
names(vis)
# limit to just clients who bundled
#red <- red[red$Contact.ID %in% allpidx[, c(1)], ]
red_temp <- red[red$Contact.ID %in% vis[, c(1)], ]
nrow(red_temp)
# append keeping all financial assessments for clients who bundled
vis <- rbind(as.matrix(vis), as.matrix(red_temp))
# convert vis to data frame
vis <- data.frame(vis, stringsAsFactors = FALSE)

Another aggregation step is to take the 2 demographics tables and merge them by client and Household ID. This code sits at https://www.dropbox.com/home/DataDive_MEDA/merged_outcome_code?preview=contact_inc_HH_demogr_merged.R

setwd("C:/Users/bergmc/Documents/cb/volunteer/DataKind/MedaSF/data/CleanedData/DataDive_CleanData")
client_inc_HH_demogr = read.csv(file="merged_outcome_datasets/client_inc_HH_demogr.csv", header=TRUE, sep=",");
client_inc_HH_demogr$Contact.ID <- formatC(as.numeric(client_inc_HH_demogr$Contact.ID), format='f',width=7, digits=0, flag=0)
# notice .csv files lost leading 00's in front of Contact.ID
client_inc_HH_demogr$Contact.ID <- formatC(as.numeric(client_inc_HH_demogr$Contact.ID), format='f',width=7, digits=0, flag=0)
nrow(client_inc_HH_demogr)

## [1] 16495

Now, the final merge of your Program, Assessessment successes with Demographic data.

# final merge
vis <- merge(x=vis, y=client_inc_HH_demogr, by="Contact.ID", all.x=TRUE)
write.csv(vis, file="seq_demogr2.csv")
nrow(vis)

## [1] 325

The final step is to read that last csv file into an exploratory visualization tool such as Tableau Public. This one can be found at https://public.tableau.com/profile/cbergman#!/vizhome/bundled_programs1/Sheet1.

From the visualization below, it does not appear that bundled customers have any more DISC successes than the average population, maybe even fewer. It also does not look like number of minutes direct service made any difference. We also know not many people bundled (51 out of 1244). We might be able to make a few hypothesis about DISC success, especially Income success related to Home program, with small n=13. Based on just the visualization of DISC success, it appears the answer is “NO” to the question “Are participants more successful if they enroll in more than 1 program?”

Next, we look at the visualization of Program Successes. From the visualization below, the answer looks even worse for Program successes. There are far fewer Program successes than in the general population. Maybe because enrolling in too many programs at once is distracting? Based on just the visualization of Program success, it appears the answer is “NO” again to the question “Are participants more successful if they enroll in more than 1 program?”

MEDA Sequence Visualization Workflow

Christy Bergman

Sunday, April 12, 2015