Full Visual


Introduction

The topic for this project grew out of assistance I provided to a political campaign in the city of Waltham after I moved to the area in August 2017. That September, I performed a simple summarize-and-filter operation to identify the streets with the most voters likely to support the campaign, giving canvassing teams information about where it would be most beneficial to focus their efforts. The machine learning and data visualization skills acquired while studying Data Analytics at Northeastern suggested new ways to make better use of this data. This document provides an overview of the intent, methods, and outcomes of that exploration, and outlines what could eventually become an R package for canvassing campaign coordinators.

Data Reading & Wrangling

Load the voter data, apply some basic transforms, and extract the addresses as a vector. This chunk also loads all of the data used in the project.
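As a minimal sketch of this step (the voter file name and exact calls are assumptions; the actual chunk is not shown), the voter file is read and a vector of geocodable address strings is built:

# Hypothetical file name; the real chunk loads the project's saved objects.
library(dplyr)
library(magrittr)
Voters <- read.csv("WalthamVoters.csv", check.names = FALSE, stringsAsFactors = FALSE)
load("geo_replies.Rdata")  # geocoder replies saved by the chunk below
Addresses <- paste0(Voters[["St Addr"]], " ", Voters[["Residential Address - Street Name"]], 
    ", Waltham, MA ", Voters[["Zip"]], " US")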

Geocoding Addresses

The script below geocodes the formatted addresses in the data set to make them amenable to graphical representation, and to provide additional factors by which the data can be sorted. The operation was computationally expensive and quite time consuming: Google limits unsubscribed users of the API to 2,500 requests per 24-hour period. It was therefore necessary to get the code right on minimal examples, then cross fingers and hope that the script handled exceptions appropriately and did not burn requests on malformed queries. Because of this cost, I've set this chunk not to evaluate, and instead loaded the output as an Rdata file in the 'Load Data' chunk.

# ----------------------- Thu Mar 08 22:49:26 2018 ------------------------#
# 2500 at a time based on free restrictions.
geo_replies <- vector("list", 4820)  # one reply per address
ctr <- vector()
# The first pass iterated over seq_along(Addresses); later passes reuse this
# loop with `retry` (defined below) to re-request addresses that failed.
for (i in retry) {
    ctr[i] <- i
    print(ctr[i])  # progress indicator
    geo_replies[[i]] <- ggmap::geocode(Addresses[i], output = "all", messaging = TRUE, 
        override_limit = TRUE, source = "dsk")
    
    print(geo_replies[[i]]$status)
    # If the service returns anything but OK (e.g. OVER_QUERY_LIMIT), wait a
    # minute and request the same address again.
    while (geo_replies[[i]]$status != "OK") {
        Sys.sleep(60)
        geo_replies[[i]] <- ggmap::geocode(Addresses[i], output = "all", messaging = TRUE, 
            override_limit = TRUE, source = "dsk")
    }
}
answer <- data.frame(formatted_address = rep(NA, 4820), neighborhood = rep(NA, 4820), 
    locality = rep(NA, 4820), administrative_area_level_2 = rep(NA, 4820), administrative_area_level_1 = rep(NA, 
        4820), street_number = rep(NA, 4820), route = rep(NA, 4820), country = rep(NA, 
        4820), postal_code = rep(NA, 4820), postal_code_suffix = rep(NA, 4820), lat = rep(NA, 
        4820), lon = rep(NA, 4820), accuracy = rep(NA, 4820))
ct <- vector()
for (l in retry) {
    ct[l] <- l
    # First type name for each address component (street_number, route, ...);
    # these match the column names of `answer` and index the assignment below.
    types <- geo_replies[[l]] %>% purrr::pluck(list("results", 1, "address_components")) %>% 
        lapply("[[", "types") %>% lapply("[", 1) %>% unlist
    # A single component indicates a failed or partial geocode; skip it for
    # the retry pass below.
    if (length(types) == 1) {
        next
    }
    for (c in seq_along(types)) {
        answer[l, types[c]] <- geo_replies[[l]] %>% purrr::pluck(list("results", 
            1, "address_components", c, "short_name"))
    }
    answer[l, "lat"] <- geo_replies[[l]][["results"]][[1]][["geometry"]][["location"]][["lat"]]
    answer[l, "lon"] <- geo_replies[[l]][["results"]][[1]][["geometry"]][["location"]][["lng"]]
    answer[l, "formatted_address"] <- geo_replies[[l]][["results"]][[1]][["formatted_address"]]
    answer[l, "accuracy"] <- geo_replies[[l]][["results"]][[1]][["types"]][1]
}
# ----------------------- Fri Mar 09 11:08:25 2018 ------------------------#
# Determine items with missing data

# Count the address components returned for each reply in the second batch of
# requests; a count of 1 flags a failed geocode.
type.v <- lapply(geo_replies[2522:4820], FUN = function(x) {
    x %>% purrr::pluck(list("results", 1, "address_components")) %>% lapply("[[", 
        "types") %>% lapply("[", 1) %>% unlist %>% length
})
which(type.v == 1)
retry <- 2521 + which(type.v == 1)  # map back to positions in geo_replies
unlist(type.v)  # component counts per reply
Addresses[retry]  # inspect the addresses that failed

retry <- which(answer$locality != "Waltham")  # rows geocoded outside the city
oql <- which(lapply(geo_replies, "[[", "status") != "OK")  # replies that hit the query limit
lengths <- vector()
for (l in seq_along(geo_replies)) {
    lengths[l] <- geo_replies[[l]] %>% purrr::pluck(list("results", 3, "address_components")) %>% 
        length
}
which.max(lengths)  # replies with a third result matched multiple addresses
write.csv(answer, "answer.csv")
save(geo_replies, file = "geo_replies.Rdata")

Joining with Original Data

This chunk formats the data returned by the geocoding operation above so it can be joined with the original dataset, and attempts to remedy any missing values. Records still missing coordinates after these operations were removed.

answer$postal_code %<>% gsub("^", "0", .)  # restore the leading zero that numeric parsing strips from MA zip codes
# ----------------------- Sun Mar 11 18:04:20 2018 ------------------------#
# Remove duplicates from answer
sum(duplicated(Voters$`Voter ID Number`))
names(answer)
names(Voters)
# ----------------------- Fri Mar 09 12:05:11 2018 ------------------------#
# Format Addresses the same
# Title-case helper: capitalize the first letter of each word
simpleCap <- function(x) {
    s <- strsplit(x, " ")[[1]]
    paste(toupper(substring(s, 1, 1)), tolower(substring(s, 2)), sep = "", collapse = " ")
}
Voters$`Residential Address - Street Name` %<>% sapply(., simpleCap)

# Build a formatted_address key in Voters matching the geocoder's format
iVoters <- nrow(Voters)
formatted_address <- vector()
for (i in 1:iVoters) {
    formatted_address[i] <- stringr::str_c(Voters[i, "St Addr"], " ", simpleCap(Voters[i, 
        "Residential Address - Street Name"]), ",", " Waltham, MA ", Voters[i, "Zip"], 
        " US", collapse = "")
}
Voters$formatted_address <- formatted_address
# Rebuild the same key from the geocoder output for the join
ianswer <- nrow(answer)
formatted_address <- vector()
answer <- answer %>% mutate_at(vars(c("street_number", "route", "postal_code")), 
    funs(as.character))
for (i in 1:ianswer) {
    formatted_address[i] <- str_c(answer[i, "street_number"], " ", simpleCap(answer[i, 
        "route"]), ",", " Waltham, MA ", answer[i, "postal_code"], " US", collapse = "")
}
answer$formatted_address <- formatted_address
anyDuplicated(answer$formatted_address)
anyDuplicated(answer$formatted_address)


identical(class(answer$formatted_address), class(Voters$formatted_address))

Voters.full <- left_join(Voters, (answer %>% select(formatted_address, lat, lon) %>% 
    unique), by = "formatted_address")
# ----------------------- Fri Mar 09 12:37:03 2018 ------------------------# Fix
# the ones that were missed or duplicated
Voters.full.miss <- Voters.full[which(is.na(Voters.full$lat)), ]
compareClasses <- sapply(Voters.full.miss, class) %>% data.frame(Colnames = names(.), 
    Voters = ., stringsAsFactors = F) %>% rownames_to_column %>% left_join((as.data.frame(sapply(answer, 
    class)) %>% rownames_to_column), by = c(Colnames = "rowname"))

by <- c(`St Addr` = "street_number", `Residential Address - Street Name` = "route", 
    Zip = "postal_code")
answer$street_number %<>% as.numeric
Voters.full.miss <- left_join(Voters.full.miss[, c(1:13)], (answer %>% select(street_number, 
    route, postal_code, lat, lon) %>% unique), by = by)
sum(is.na(Voters.full.miss$lat))
# No use ----------------------- Sun Mar 11 18:30:52 2018 ------------------------#

# Remove Any still missing
Voters.full %<>% filter(!is.na(lat)) %>% filter(!is.na(lon))

Campaign modeling

k-Means for canvassing campaign drop points

For a campaign, the number of volunteers available for canvassing can drive the choice of model for drop-point selection: the number of volunteers equals the number of clusters, and can thus be used to select the model's k value. This chunk explores the dataset manually using an iterative k-means operation with a tuning grid of parameter values. Each model is assessed on all of the criteria in the clusterCrit package; each criterion casts an equally weighted vote, and the models with the most votes are included in the index. All of the k-means models for the Democrats in Waltham are graphed with a Voronoi tessellation from the deldir package to provide visual segmentation of the clusters.

## bestIndex
##  1  5  9 44 49 57 59 
## 25  3  2  1  1  1  1
##              alg  k
## 1  Hartigan-Wong  1
## 5  Hartigan-Wong  2
## 9  Hartigan-Wong  3
## 44      MacQueen 11
## 49 Hartigan-Wong 13
## 57 Hartigan-Wong 15
## 59         Forgy 15
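The chunk that produced the table above is folded into the dropPoints function below; as a minimal sketch of the voting logic, assuming df is the scaled lat/lon matrix for one factor level:

pars <- expand.grid(alg = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"), k = 2:15, 
    stringsAsFactors = FALSE)
models <- lapply(seq_len(nrow(pars)), function(i) kmeans(df, centers = pars$k[i], 
    algorithm = pars$alg[i], nstart = 25, iter.max = 25))
perf <- lapply(models, function(m) clusterCrit::intCriteria(df, m$cluster, crit = "all"))
# Each criterion votes for the model it ranks best; tally the votes
votes <- sapply(names(perf[[1]]), function(cr) clusterCrit::bestCriterion(sapply(perf, 
    "[[", cr), cr))
bestIndex <- sort(table(votes), decreasing = TRUE)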

dropPoints function

Building on the initial trial run above, this operation was functionalized so that a canvassing campaign coordinator need only supply a dataset and a few basic k-means parameters to receive workable clusters of the geographic data.

Arguments

data

Input data as a dataframe. Latitude and longitude columns must be named at least la and lo, or any longer variation (case insensitive), to be detected. Required

f

The name of the factor variable on which to split the data. Must be a quoted character vector. Required

k

The k (# of centroids/clusters) for k-means, an integer vector of the number of clusters to try. Optional: Defaults to c(2:10)

nms

The names of the factor levels to compute clusters for. Optional: Defaults to all except those with fewer than 10 values.

best.clust

A logical indicating that the best clusters are requested in the output and that the cluster-evaluation criterion voting method should be used. If FALSE, all clusters (for all methods and all k values) are included in the output. Optional: Defaults to T. Note: criterion voting is more computationally complex and can make the function take longer to evaluate. If pressed for time, set to F; otherwise, leave enabled for optimal (and fewer) cluster outputs.

Details

A list with a sublist for each factor level, each containing 3 sublists, explained further in Value.

Value

[[‘BestConfigs’]]

Contains the parameters for the best configurations of kmeans.

[[‘DropCoords’]]

a list of the drop points corresponding to each of the best cluster models.

[[‘Clusters’]]

  1. If best.clust=TRUE, a list of the highest-performing cluster models.
  2. If best.clust=FALSE, a list containing [[‘clusters’]] - a list of all cluster models - and [[‘performance’]] - the respective performance of each cluster across numerous criteria. (See ?clusterCrit for details on performance criteria)

# ----------------------- Fri Mar 09 17:56:06 2018 ------------------------#
# Kmeans for drop points sorted by factor
dropPoints <- function(data, f, k = c(2:10), nms = NULL, best.clust = T) {
    # ----------------------- Fri Mar 09 18:12:36 2018 ------------------------#
    # Inputs: data - voter data as data frame, f: factor on which to split the data
    # for each clustering operation, k: k (centroids) a vector of the number of
    # possible drop point. nms: The names of the factor levels to compute drop points
    # for (party affiliation abbvs typically). best.clust: A logical indicating the
    # best clusters are requested in the output, if FALSE, all the clusters are
    # included in the output.  Outputs: a list, with sublists for each factor level,
    # each with 3 sublists for each unique factor level.  [[BestConfigs]] contains
    # the parameters for the best configurations of kmeans.  [['DropCoords']] - a
    # list of the drop points corresponding to each of the cluster models
    # [['Clusters']] - a list of the highest performing cluster models.If
    # best.clust=FALSE, a list containing [['clusters']] - all cluster models and
    # [['performance']] - the respective performance of each cluster across numerous
    # criteria. (See ?clusterCrit for details on performance criteria) 3/21/2018
    # 9:45:31 PM TODO(Needs if control statement for best.clust=T v best.clust=F)
    if (is.null(nms)) {
        nms <- unique(data[[f]])
    }
    drops <- vector("list", length(nms))
    names(drops) <- nms
    alg <- c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")
    pars <- expand.grid(alg = alg, k = k, stringsAsFactors = F)
    var.num <- names(data)[sapply(data, is.numeric)]
    lalo <- c(stringr::str_extract(var.num, regex("(?:lat?i?t?u?d?e?)", ignore_case = T)) %>% 
        .[!is.na(.)], stringr::str_extract(var.num, regex("(?:lon?g?i?t?u?d?e?)", 
        ignore_case = T)) %>% .[!is.na(.)])
    # Begin party loop
    for (n in seq_along(nms)) {
        clusters <- vector("list", nrow(pars))
        performance <- vector("list", nrow(pars))
        df.f <- data %>% subset(.[[f]] == nms[n]) %>% select(c(lalo))
        df <- scale(df.f)
        if (nrow(df.f) < max(k)) {
            k <- c(1:(nrow(df.f) - 1))
            pars <- expand.grid(alg = alg, k = k, stringsAsFactors = F)
            clusters <- vector("list", nrow(pars))
            performance <- vector("list", nrow(pars))
        }
        # Begin cluster loop
        for (a in seq_along(clusters)) {
            clusters[[a]] <- kmeans(df, centers = pars[a, "k"], nstart = 25, algorithm = pars[a, 
                "alg"], iter.max = 25)
            performance[[a]] <- clusterCrit::intCriteria(traj = df, part = clusters[[a]]$cluster, 
                crit = "all")
        }  #End clustering loop
        critNames <- lapply(performance[[1]], names) %>% names %>% as.list
        names(critNames) <- critNames %>% unlist
        for (c in seq_along(critNames)) {
            critNames[[c]] <- performance %>% lapply(FUN = "[[", critNames[[c]]) %>% 
                unlist %>% clusterCrit::bestCriterion(crit = critNames[[c]])
        }
        (bestIndex.crit <- table(critNames %>% unlist))
        best <- names(bestIndex.crit) %>% as.numeric
        bestCl <- clusters %>% subset(seq_along(clusters) %in% best)
        matCl <- lapply(bestCl, "[[", "cluster")
        dropPts <- vector("list", length(matCl))
        for (m in seq_along(matCl)) {
            dropPts[[m]] <- df.f %>% mutate(Cl = matCl[[m]]) %>% group_by(Cl) %>% 
                summarize_all(funs(mean))
        }
        if (best.clust == T) {
            drops[[n]] <- list(BestConfigs = pars[best, ], DropCoords = dropPts, 
                Clusters = bestCl)
        } else {
            drops[[n]] <- list(BestConfigs = pars[best, ], DropCoords = dropPts, 
                Clusters = list(clusters = clusters, performance = performance))
        }
        
    }  #End party loop
    
    return(drops)
}
f <- "Party Affiliation"
data <- Voters.full
drops <- dropPoints(data = data, f = f)
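For example, the winning configurations and the drop coordinates of the first model can be inspected for a given level (the level name "D" is assumed from the party-affiliation coding):

drops[["D"]][["BestConfigs"]]
head(drops[["D"]][["DropCoords"]][[1]])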

startPoints Function: find optimal starting points based on pop. density.

startPoints

This function finds starting points at the center of population density for each cluster. By starting the route at the point with the highest population density, it ensures that the time available for canvassing captures the greatest number of voters. Volunteers often have only a limited amount of time to spare; this function helps make that time more effective.

Arguments

data

The original data with a group column added based on the selected k-means output.

grp

The variable name chosen for the column with cluster group value. A character vector.

Value

A dataframe with three columns: lat/lon of the starting point and the group id.

Just below the function is a short script to vectorize this function over all data in the output from dropPoints.

# ----------------------- Sat Mar 17 16:28:04 2018 ------------------------#
# Inputs: data - original data, grp - variable name for the cluster group value
# Outputs: A df with three columns, lat/lon & Group id Vectorization of the
# function suited for the output from dropPoints is below Depends: stringr
startPoints <- function(data, grp) {
    var.num <- names(data)[sapply(data, is.numeric)]
    lalogrp <- c(stringr::str_extract(var.num, regex("(?:lat?i?t?u?d?e?)", ignore_case = T)) %>% 
        .[!is.na(.)], stringr::str_extract(var.num, regex("(?:lon?g?i?t?u?d?e?)", 
        ignore_case = T)) %>% .[!is.na(.)], grp)
    d <- data[, lalogrp]
    
    grps <- unique(d[, grp])
    sPs <- sapply(grps, function(g, d, grp) {
        # ----------------------- Sat Mar 17 15:55:34 2018 ------------------------# For
        # each point, determine the number of points within 1 SD, the point with the most
        # other points within radius wins.
        d.grp <- d[d[, grp] == g, ]
        d.grp.sd <- sapply(d.grp[, lalogrp[1:2]], sd)
        d.grp.nn <- apply(d.grp[, 1:2], 1, FUN = function(r, d.grp.sd, d.grp, lalogrp) {
            nn <- intersect(which(d.grp[, lalogrp[1]] > r[1] - d.grp.sd[1] & d.grp[, 
                lalogrp[1]] < r[1] + d.grp.sd[1]), which(d.grp[, lalogrp[2]] > r[2] - 
                d.grp.sd[2] & d.grp[, lalogrp[2]] < r[2] + d.grp.sd[2])) %>% length  # Find which (the index of) values that are within one sd of each point in lat and in lon respectively, and return the intersection (IE individual lat/lon points). Return the length of this vector as the number of nodes it's nearest too.
        }, d.grp.sd = d.grp.sd, d.grp = d.grp, lalogrp = lalogrp)  # Apply over rows outputs the number of points each point is nearest too 
        d.grp[which.max(d.grp.nn), ]  # Returns the index of the point with the most nearest neighbors. 
    }, d = d, grp = grp, USE.NAMES = T, simplify = T)  # Returns the starting points for each group 
    sPs <- as.data.frame(matrix(data = sPs %>% unlist, ncol = 3, byrow = T, dimnames = list(rows = NULL, 
        cols = lalogrp)))
    return(sPs)
}
sPs <- startPoints(D.grpd, "Group")  #For Testing
# ----------------------- 3/19/2018 3:51:58 PM ------------------------#
# Vectorization of startPoints over data.clusters, the list of cluster-grouped
# data frames assembled from the selected dropPoints output
sPs <- sapply(data.clusters, function(item) {
    sPs <- lapply(item, function(cl) {
        sPs <- startPoints(cl, "Group")
    })
})


meters <- c(lat = 111080.836035119, lon = 111577.128269352)  # Meters per unit latitude,longitude - not used.

visDrops function: visualizing groups with centroids or starting points.

This function is intended for visualizing output from the dropPoints function so that specific cluster models can be selected visually. The function prints each graph to the chunk output. Commented within the function is a ggsave command that was used to save the individual graphs to pdf for the presentation.

Arguments

data

The original data with lat/lon; it does not need a cluster group value column, as that is extracted from the dropPoints output argument drops.

f

The column name with the factor by which the data was sorted. 3/21/2018 10:06:37 PM TODO(Include functionality to allow this to be omitted if drops is provided)

drops

The output list from the dropPoints algorithm

sPs

The list of starting-point dataframes from vectorizing startPoints over the data with the cluster group id column. 3/21/2018 10:09:02 PM TODO(Standardize the outputs to either the smallest unit of output or to a common output across each function for a linear progression of inputs & outputs)

# ----------------------- Fri Mar 09 17:53:32 2018 ------------------------#
# Visualize the Drop points 3/19/2018 2:10:01 PM EDT TODO(Add support for
# hierarchical clustering and bounding boxes around clusters.)
visDrops <- function(data = data, f = f, drops = drops, sPs = NULL) {
    
    c <- RColorBrewer::brewer.pal(length(drops), "Set1")
    for (d in seq_along(drops)) {
        df.f <- data %>% subset(.[[f]] == names(drops)[d]) %>% select(c("lat", "lon"))
        # Fetch the basemap once and reuse it on subsequent calls;
        # exists() requires the object name as a string
        if (exists("Waltham", envir = parent.env(environment(fun = visDrops))) == F) {
            Waltham <- get_googlemap(center = c(mean(df.f$lon), mean(df.f$lat)), 
                zoom = 13, format = "png8", maptype = "roadmap")
        }
        
        co <- c[d]
        for (i in seq_along(drops[[d]][["DropCoords"]])) {
            # ----------------------- Mon Mar 12 13:13:54 2018 ------------------------#
            # Add grid arrangement to output (layout scaffolding below is not
            # yet used)
            ld <- length(drops[[d]][["DropCoords"]])
            grob.list <- vector("list", ld)
            gcol <- c(2, 3, 4)
            gcol <- gcol[max(which(ld%%gcol == 0 | ld%%gcol == 1))]
            t <- paste("Factor=", names(drops)[d], ", k=", drops[[d]][["BestConfigs"]][["k"]][i], 
                sep = "")
            
            if (is.null(sPs)) {
                centers <- drops[[d]][["DropCoords"]][[i]]
            } else {
                centers <- sPs[[d]][[i]]
                # sapply(centers,cat)
            }
            
            if (length(centers$lon) > 1) {
                corners <- c(range(df.f$lon), range(df.f$lat))
                voronoi <- deldir::deldir(centers$lon, centers$lat, rw = corners)
                # print() is required for the plot to render inside a loop
                print(ggmap(Waltham) + geom_point(data = df.f, mapping = aes(x = lon, 
                  y = lat), color = co, size = 0.6, na.rm = F) + geom_point(data = centers, 
                  mapping = aes(x = lon, y = lat), color = "black") + geom_segment(aes(x = x1, 
                  y = y1, xend = x2, yend = y2), size = 0.4, data = voronoi$dirsgs, 
                  linetype = 1, color = "black") + labs(title = t, subtitle = "", 
                  caption = "", x = "", y = "") + theme(plot.title = element_text(hjust = 0.5), 
                  plot.subtitle = element_text(hjust = 0.5), axis.text = element_blank()))
                # ggplot2::ggsave(filename=paste(t,'.pdf',sep=''),plot=last_plot(),
                # path='Plots',device = 'pdf',width = 5, height = 5, units = 'in')
            } else {
                # print() is required for the plot to render inside a loop
                print(ggmap(Waltham) + geom_point(data = df.f, mapping = aes(x = lon, 
                  y = lat), color = co, size = 0.6, na.rm = F) + geom_point(data = centers, 
                  mapping = aes(x = lon, y = lat), color = "black") + labs(title = t, 
                  subtitle = "", caption = "", x = "", y = "") + theme(plot.title = element_text(hjust = 0.5), 
                  plot.subtitle = element_text(hjust = 0.5), axis.text = element_blank(), 
                  axis.ticks = element_blank()))
                # ggplot2::ggsave(filename=paste(t,'.pdf',sep=''),plot=last_plot(),
                # path='Plots',device = 'pdf',width = 5, height = 5, units = 'in')
            }
            
        }
    }
}
visDrops(data = data, f = f, drops = drops, sPs = sPs)

A kmeans object from the dropPoints output is selected manually based on the number of volunteers available; this cluster data is then used to group the original data.

routes function: Route Optimization

The function expects data with a column entitled "Group" (or specified with the grp argument) that contains the cluster number each observation belongs to. The data is filtered by group. The function then uses iterative k-nearest neighbors from the spdep package to build an index of "as the crow flies" nearest neighbors for each point observation in the cluster. The starting-points object sPs is used to select the point with the highest number of nearest neighbors within one standard deviation as the start of the route data frame (route).

The nearest-neighbor indexes for this point (d.nn columns: nnX) are used to subset the cluster to just the k nearest neighbors as potential destinations (nn). K defaults to 5 and increments upward if all k nearest neighbors are already in the route; the overall nearest-neighbor index object (d.nn) is then subset to include only points not already in the route before recalculating the indexes. The nearest neighbors (nn) minimize the number of requests made to the gmapsdistance API, since a standard-plan API key has rate limits. The return from gmapsdistance is used to select the point with the least walking time as the next destination added to the route, and the next iteration of the loop starts from that point.

The outcome (provided no errors are encountered) is a list of data frames, one per cluster, each containing the unique points in the cluster sorted into a shortest-path walking route indicated by the Index column. The first point is the optimal starting point for maximizing the number of canvassed houses in the least amount of time. The data frames can be joined with the original data to include the full data for each observation sorted by factor. The data is then filtered by group to form a new dataframe for each group, sorted by Index for the optimal route, and provided to the canvasser.

## [1] "AIzaSyB7spZTdy-yEt8qUe089twfvKRIvx738Q4"

Visualizing routes

The visual output from the routes function; a specific route was selected to visualize for the poster. 3/21/2018 10:10:54 PM TODO(Functionalize this for visualizing routes as a vectorized operation over the list output from routes, iteratively printing graphs.)
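A minimal sketch of what such a function might print for one route, assuming route is one sorted data frame from the routes output and Waltham is the basemap fetched above:

ggmap(Waltham) + geom_path(data = route, mapping = aes(x = lon, y = lat), color = "black", 
    size = 0.4) + geom_point(data = route, mapping = aes(x = lon, y = lat), color = "red", 
    size = 0.8) + labs(title = "Walking route", x = "", y = "")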

hc.dropPoints function: Hierarchical Clustering for Variance Minimization

Hierarchical clustering. The results are fairly unimpressive; I find that k-means partitions the data into more geographically contained clusters.

Arguments

data

Input data as dataframe. Latitude and Longitude columns must be labeled la and lo at the very least or any longer variation (case insensitive) to be detected. Required

f

The name of the factor variable on which to split the data. Must be a character vector enclosed in quotes Required

c

The c (# of hierarchical clusters/cuts), an integer vector of the number of clusters to try. Optional: Defaults to c(2:10)

verbose

A logical indicating whether debugging output should be printed to the console.

Value

A named list with one element per factor level. Each element is either a short message (for levels with fewer than 10 observations) or a named list of dataframes (Rowname, lat, lon, Group), one for each of the five configurations with the highest median silhouette width; element names encode the rank and the number of cuts.

# TODO(Add Hierarchical clustering options to function to allow for clustering
# based on the number of nodes in each cluster.)  3/19/2018 2:08:10 PM EDT

data <- Voters.full
hc.dropPoints <- function(data, f, c = NULL, verbose = F) {
    nms <- unique(data[[f]])
    drop.areas <- vector("list", length(nms))
    names(drop.areas) <- nms
    var.num <- names(data)[sapply(data, is.numeric)]
    lalo <- c(stringr::str_extract(var.num, regex("(?:lat?i?t?u?d?e?)", ignore_case = T)) %>% 
        .[!is.na(.)], stringr::str_extract(var.num, regex("(?:lon?g?i?t?u?d?e?)", 
        ignore_case = T)) %>% .[!is.na(.)])
    # Begin party loop
    for (m in seq_along(nms)) {
        df.f <- (df.s <- data %>% subset(subset = (.[[f]] == nms[m]))) %>% select(lalo) %>% 
            stats::dist(method = "euclidean")
        # Dissimilarity matrix
        if (nrow(df.s) < 10) {
            drop.areas[[nms[m]]] <- "Under 10 observations, no clustering necessary"
            next
        }
        if (is.null(c)) {
            c <- 2:10
            runs <- expand.grid(method = c("ward.D", "ward.D2", "average"), c = c)
        } else {
            runs <- expand.grid(method = c("ward.D", "ward.D2", "average"), c = c)
        }
        hcs <- vector("list", nrow(runs))
        S <- data.frame(S = seq(nrow(runs)), Index = seq(nrow(runs)), stringsAsFactors = F)
        for (r in 1:nrow(runs)) {
            if (verbose == T) {
                print.AsIs(runs[r, ])
                cat(nrow(df.s))
            }
            hc <- hclust(df.f, method = runs[r, 1])
            hcs[[r]] <- dendextend::cutree(hc, runs[r, 2])
            # Median silhouette width for this method/cut combination
            S[r, 1] <- summary(cluster::silhouette(hcs[[r]], df.f))$si.summary[3]
        }
        hcs.best <- hcs[S[order(S$S, decreasing = T), ][["Index"]][1:5]]
        names(hcs.best) <- 1:5
        hcs.df <- lapply(hcs.best, FUN = function(x, df.s) {
            df <- data.frame(Rowname = rownames(df.s), lat = df.s[[lalo[1]]], lon = df.s[[lalo[2]]], 
                Group = x %>% as.factor)
            return(df)
        }, df.s = df.s)
        names(hcs.df) <- sapply(hcs.best, FUN = function(x) {
            x %>% unique %>% length
        }, simplify = F, USE.NAMES = T) %>% paste0(names(.), "-", "Cuts:", .)
        
        drop.areas[[nms[m]]] <- hcs.df
        
    }  # End loops
    
    return(drop.areas)
}

hc.drops <- hc.dropPoints(data = data, f = f, verbose = F)
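The element names of the result encode the silhouette rank and the number of cuts ("rank-Cuts:clusters"), which is how specific models are addressed below:

names(hc.drops[["D"]])  # e.g. "4-Cuts:8" is the 4th-ranked model with 8 cuts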

clustHull function: create convex hulls for visually containing clusters from other function outputs.

This function was created at Pablo's suggestion to confine each area in a convex hull. It is currently coded to work with the output of hc.dropPoints. 3/21/2018 10:25:08 PM TODO(Create an if detection and control mechanism to also process output from dropPoints). 3/22/2018 9:57:19 AM TODO(Revise to use the ddply method of creating hulls and return those as separate dfs for ease of graphing)

Arguments

clusterlist

Currently the list output from hc.dropPoints for which convex hulls will be generated.

verbose

A logical indicating whether debugging output should be printed to the console.

Value

This function returns a list of dataframes with the same structure as the hc.dropPoints output, each with lon/lat data segmented by group and a T/F column indicating whether the point belongs to the convex hull. Each dataframe includes two attributes:

  1. attr(“DC”): The rowname from the original data set of the lat/lon point at which the center of pop. density lies, based solely on points on the hull.
  2. attr(“Group”): The group id to which the dataframe belongs.
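For example, once clustHull has been run (below), the attributes of one hull dataframe can be read back (element names assumed from the outputs used later in this document):

h <- hc.hulls[["D"]][["Chulls"]][["4-Cuts:8"]][[1]]
attr(h, "DC")  # rowname of the density-center point
attr(h, "Group")  # group id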

# ----------------------- Mon Mar 19 21:32:50 2018 ------------------------# For
# debugging


clustHull <- function(clusterlist, verbose = F) {
    # ----------------------- Mon Mar 19 20:56:41 2018 ------------------------#
    # Create Output Object Get names
    nms <- names(clusterlist)
    # Begin parsing clusterlist
    out <- lapply(clusterlist, function(cli) {
        if (!is.list(cli)) {
            out <- cli
            return(out)
        }
        cli.nms <- names(cli)
        if (verbose == T) {
            print(cli.nms)
        }
        cli.chs <- lapply(cli, function(cdf) {
            if (verbose == T) {
                print(class(cdf))
            }
            # Get Lat and Lon var names among the numeric columns
            var.nums <- names(cdf)[sapply(cdf, is.numeric)]
            lalo <- c(stringr::str_extract(var.nums, regex("(?:lat?i?t?u?d?e?)", 
                ignore_case = T)) %>% .[!is.na(.)], stringr::str_extract(var.nums, 
                regex("(?:lon?g?i?t?u?d?e?)", ignore_case = T)) %>% .[!is.na(.)])
            # Get unique clusters/groups
            grps <- unique(cdf[["Group"]])
            # Make a convex hull for each
            cdf.chs <- sapply(grps, function(grp, cdf, lalo) {
                if (verbose == T) {
                  print(grp)
                }
                crds <- unique(cdf[cdf[["Group"]] %in% grp, lalo])
                cdf.ch <- chull(as.matrix(crds))
                crds[, 3] <- F
                crds[cdf.ch, 3] <- T
                dist.crds <- dist(crds[cdf.ch, c(1:2)]) %>% as.matrix %>% colSums
                attr(crds, "DC") <- names(which.min(dist.crds)) %>% as.numeric
                attr(crds, "Group") <- grp
                names(crds) <- c(lalo, "hull")
                return(crds)
                # Call DC - density.centroid Index with which(rownames(crds)==attr(crds,'DC'))
            }, cdf = cdf, lalo = lalo, simplify = F)
        })
        # Name it with original names for parity
        names(cli.chs) <- cli.nms
        sPs <- lapply(cli.chs, function(sli) {
            sP <- sapply(sli, function(sdf) {
                # The first two columns of each hull df are the lat/lon columns
                # named above; index by position since lalo is out of scope here
                sp <- c(unlist(sdf[which(rownames(sdf) == attr(sdf, "DC")), 1:2, 
                  drop = T]), attr(sdf, "Group"))
            }, simplify = T) %>% t %>% as.data.frame()
            names(sP) <- c(names(sli[[1]])[1:2], "Group")
            return(sP)
        })
        cli.chs.sPs <- list(startingPoints = sPs, Chulls = cli.chs)
        return(cli.chs.sPs)
    })
    # Retain Names
    
    names(out) <- nms
    return(out)
}

# ----------------------- Mon Mar 19 22:12:12 2018 ------------------------#
# Convert hull to polygon
# https://stackoverflow.com/questions/25606512/create-convex-hull-polygon-from-points-and-save-as-shapefile

# Could be a more robust way of finding the center of density, the minimum
# distance from all other points

hc.hulls <- clustHull(hc.drops, verbose = F)
# ----------------------- Mon Mar 19 22:11:33 2018 ------------------------#
# TODO(https://www.r-bloggers.com/spatial-clustering-with-equal-sizes/)

# Adapted from the equal-size spatial clustering post linked above; the
# as_radians helper and the opening lines of calc_dist were truncated in the
# paste and are reconstructed here.
as_radians = function(theta = 0) {
    theta * pi/180
}
calc_dist = function(fr, to) {
    lat1 = as_radians(fr$lat)
    lon1 = as_radians(fr$lon)
    lat2 = as_radians(to$lat)
    lon2 = as_radians(to$lon)
    a = 3963.191
    b = 3949.903
    numerator = (a^2 * cos(lat2))^2 + (b^2 * sin(lat2))^2
    denominator = (a * cos(lat2))^2 + (b * sin(lat2))^2
    radiusofearth = sqrt(numerator/denominator)  # Accounts for the ellipticity of the earth.
    d = radiusofearth * acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(lon2 - lon1))
    d.return = list(distance_miles = d)
    return(d.return)
}
raw.og = read.csv("http://statistical-research.com/wp-content/uploads/2013/11/sample_geo.txt", 
    header = T, sep = "\t")

orig.data = raw.og[, 1:3]

# The pasted defaults were garbled; placeholder values are restored here, and
# the call below supplies all parameters explicitly.
dirichletClusters_constrained = function(orig.data, k = 5, max.iter = 1000, tolerance = 1e-04, 
    plot.iter = TRUE) {
    fr = to = NULL
    
    r.k.start = sample(seq(1:k))
    n = nrow(orig.data)
    k.size = ceiling(n/k)
    initial.clusters = rep(r.k.start, k.size)
    
    if (n%%length(initial.clusters) != 0) {
        exclude.k = length(initial.clusters) - n%%length(initial.clusters)
    } else {
        exclude.k = 0
    }
    orig.data$cluster = initial.clusters[1:(length(initial.clusters) - exclude.k)]
    orig.data$cluster_original = orig.data$cluster
    
    ## Calc centers and merge
    mu = cbind(by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, 
        orig.data$cluster, mean), seq(1:k))
    tmp1 = matrix(match(orig.data$cluster, mu[, 3]))
    orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1, ])[, c(1:2, 4:6)]
    
    ## Calc initial distance from centers
    fr$lat = orig.data.centers[, 3]; fr$lon = orig.data.centers[, 4]
    to$lat = orig.data.centers[, 1]; to$lon = orig.data.centers[, 2]
    orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
    orig.data$distance.from.center_original = orig.data$distance.from.center
    
    ## Set some initial configuration values
    is.converged = FALSE
    iteration = 0
    error.old = Inf
    error.curr = Inf
    
    while (!is.converged && iteration < max.iter) {  # Iterate until threshold or maximum iterations
        
        if (plot.iter == TRUE) {
            plot(orig.data$Longitude, orig.data$Latitude, col = orig.data$cluster, 
                pch = 16, cex = 0.6, xlab = "Longitude", ylab = "Latitude")
        }
        iteration = iteration + 1
        start.time = as.numeric(Sys.time())
        cat("Iteration ", iteration, sep = "")
        for (i in 1:n) {
            # Iterate over each observation and measure the distance of each
            # observation from its mean center. Produces an exchange: it takes
            # the observation closest to its mean and in return gives the
            # observation closest to the giver, k, mean.
            fr = to = distances = NULL
            for (j in 1:k) {
                # Determine the distance from each k group
                fr$lat = orig.data$Latitude[i]; fr$lon = orig.data$Longitude[i]
                to$lat = mu[j, 1]; to$lon = mu[j, 2]
                distances[j] = as.numeric(calc_dist(fr, to))
            }
            
            # Which k cluster is the observation closest to?
            which.min.distance = which(distances == min(distances), arr.ind = TRUE)
            previous.cluster = orig.data$cluster[i]
            orig.data$cluster[i] = which.min.distance  # Replace cluster with closest cluster
            
            # Trade an observation that is closest to the giving cluster
            if (previous.cluster != which.min.distance) {
                new.cluster.group = orig.data[orig.data$cluster == which.min.distance, ]
                
                fr$lat = mu[previous.cluster, 1]; fr$lon = mu[previous.cluster, 2]
                to$lat = new.cluster.group$Latitude; to$lon = new.cluster.group$Longitude
                new.cluster.group$tmp.dist = calc_dist(fr, to)$distance_miles
                
                take.out.new.cluster.group = which(new.cluster.group$tmp.dist == 
                    min(new.cluster.group$tmp.dist), arr.ind = TRUE)
                LocationID = new.cluster.group$LocationID[take.out.new.cluster.group]
                orig.data$cluster[orig.data$LocationID == LocationID] = previous.cluster
            }
            
        }
        
        # Calculate new cluster means
        mu = cbind(by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, 
            orig.data$cluster, mean), seq(1:k))
        tmp1 = matrix(match(orig.data$cluster, mu[, 3]))
        orig.data.centers = cbind(as.matrix(orig.data), mu[tmp1, ])[, c(1:2, 4:6)]
        mu = cbind(by(orig.data$Latitude, orig.data$cluster, mean), by(orig.data$Longitude, 
            orig.data$cluster, mean), seq(1:k))
        
        ## Calc distance from the new centers
        fr$lat = orig.data.centers[, 3]; fr$lon = orig.data.centers[, 4]
        to$lat = orig.data.centers[, 1]; to$lon = orig.data.centers[, 2]
        orig.data$distance.from.center = calc_dist(fr, to)$distance_miles
        
        # Test for convergence: is the previous total distance from center
        # within the threshold of the current one?
        error.curr = sum(orig.data$distance.from.center)
        
        error.diff = abs(error.old - error.curr)
        error.old = error.curr
        if (!is.nan(error.diff) && error.diff < tolerance) {
            is.converged = TRUE
        }
        
        # Time the iteration to estimate how long the full run will take
        stop.time = as.numeric(Sys.time())
        hour.diff = (((stop.time - start.time) * (max.iter - iteration))/60)/60
        cat("\n Error ", error.diff, " Hours remain from iterations ", hour.diff, "\n")
        
        # Write out iterations. Can later be used as a starting point if
        # iterations need to pause.
        write.table(orig.data, paste("C:\\optimize_iteration_", iteration, "_instore_data.csv", 
            sep = ""), sep = ",", row.names = F)
    }
    
    centers = data.frame(mu)
    ret.val = list("centers" = centers, "cluster" = factor(orig.data$cluster), "LocationID" = orig.data$LocationID, 
        "Latitude" = orig.data$Latitude, "Longitude" = orig.data$Longitude, "k" = k, 
        "iterations" = iteration, "error.diff" = error.diff)
    
    return(ret.val)
}

# Constrained clustering
cl_constrain = dirichletClusters_constrained(orig.data, k = 4, max.iter = 5, tolerance = 1e-04, 
    plot.iter = TRUE)
table(cl_constrain$cluster)
plot(cl_constrain$Longitude, cl_constrain$Latitude, col = cl_constrain$cluster, 
    pch = 16, cex = 0.6, xlab = "Longitude", ylab = "Latitude")

library(maps)
map("state", add = T)
points(cl_constrain$centers[, c(2, 1)], pch = 4, cex = 2, col = "orange", lwd = 4)

Visualizing Convex Hulls

Visualizing the convex hulls. The triangle represents the starting point.

# ----------------------- Wed Mar 21 22:48:31 2018 ------------------------# Add
# Hulls Combine each into one large df
hulls <- do.call("rbind", hc.hulls[["D"]][["Chulls"]][["4-Cuts:8"]])
# Get the starting points on the hull
hull.sps <- sapply(hc.hulls[["D"]][["Chulls"]][["4-Cuts:8"]], attr, "DC", simplify = T)

# Get the groups
rns <- lapply(hc.hulls[["D"]][["Chulls"]][["4-Cuts:8"]], rownames)
test <- purrr::map2(.x = rns, .y = 1:8, function(.x, .y) {
    rep(.y, length(.x))
})
hulls$Group <- test %>% unlist %>% as.factor
hull.sps <- hulls[rownames(hulls) %in% as.character(hull.sps), ]
find_hull <- function(df) {
    df[chull(df$lon, df$lat), ]
}
hulls.gg <- plyr::ddply(hulls, "Group", find_hull)
# hull.gg.ordered <- sapply(unique(test),function(u,hulls.gg){ h.gg <-
# hulls.gg[hulls.gg$Group %in% u,c('lat','lon')] k <- nrow(h.gg)-1 nn.nms <-
# c(paste0('nn',1:k)) hull.gg.ordered <- spdep::knearneigh(as.matrix(h.gg),k=k)
# %>% spdep::knn2nb(sym=F) %>% unlist %>%
# matrix(ncol=k,byrow=T,dimnames=list(rows=names(.),cols=nn.nms)) %>% unclass %>%
# cbind(h.gg,.)  },hulls.gg=hulls.gg)
ggmap(Waltham, mapping = aes(x = lon, y = lat, color = Group)) + geom_point(data = hc.drops[["D"]][["4-Cuts:8"]], 
    mapping = aes(color = Group), size = 0.75) + scale_color_brewer(8, palette = "Set1") + 
    geom_polygon(data = hulls.gg, mapping = aes(x = lon, y = lat, color = Group), 
        fill = NA) + geom_point(data = hull.sps, mapping = aes(color = Group), shape = 17, 
    color = "red", size = 2) + labs(title = "Democrats", subtitle = "Cuts = 8", caption = "", 
    x = "", y = "") + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), 
    legend.position = "none")

Census Demographics Visualization

Load Data

Loading and joining census data. This proved to be an unfruitful endeavor, so I approached it differently in the next chunk.

Visualize census data

These graphs were used in the presentation for the economics and access section.

# ggsave(last_plot(), filename = 'IncDiff.pdf', path='Plots',device = 'pdf',width
# = 5, height = 5, units = 'in') c <- RColorBrewer::brewer.pal(9,'Greens')
# ggmap(Waltham)+ geom_polygon(data=CensusBG.W.Inc,aes(x=long, y=lat,
# group=group, fill=Income13), size=.2,color='grey', alpha=.7)+
# scale_fill_gradient2(low=c[1],mid=c[5],high=c[9])+ geom_point(data=pp,mapping =
# aes(x=lon,y=lat))+ labs(title = 'Waltham Married Median Income', subtitle =
# '2013 Median by Census Block \n Polling Locations in Black', caption = '', x =
# '',y = '') + theme(plot.title = element_text(hjust = .5),plot.subtitle =
# element_text(hjust = .5),axis.text = element_blank(),axis.ticks =
# element_blank()) ggsave(last_plot(), filename = 'Inc13.pdf',
# path='Plots',device = 'pdf',width = 5, height = 5, units = 'in')
# ----------------------- Mon Mar 12 12:20:57 2018 ------------------------# 2016
# Income
c <- RColorBrewer::brewer.pal(9, "Greens")
ggmap(Waltham) + geom_polygon(data = CensusBG.W.Inc, aes(x = long, y = lat, group = group, 
    fill = Income16), size = 0.2, color = "grey", alpha = 0.75) + scale_fill_gradientn(colors = RColorBrewer::brewer.pal(9, 
    "Greens")) + geom_point(data = pp, mapping = aes(x = lon, y = lat), color = RColorBrewer::brewer.pal(9, 
    "Set1")[2]) + labs(title = "Proximity to Polling Location by Income", subtitle = "2016 Median by Census Block \n Polling Locations in Blue", 
    caption = "", x = "", y = "") + theme(plot.title = element_text(hjust = 0.5), 
    plot.subtitle = element_text(hjust = 0.5), axis.text = element_blank(), axis.ticks = element_blank(), 
    plot.margin = margin(rep(0, 4), "pt"))