Libraries Used

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(plyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(stringr)
library(tidyverse)
library(RecordLinkage)
library(tm)
library(RWeka)

Reading in data after SQL pull
repairs_data <- read.csv('https://raw.githubusercontent.com/zachalexander/harlem_nychalists/master/repairs-summary_10-21-2019.csv')

Filtering raw data on relevant cases

Since we’ll want to filter the data based on notes that only mention some aspect of a repair, we’ll look for stems such as “leak”, “paint”, “plaster”, etc. I’ve also worked with our case manager to document as many follow up notes with the phrase “follow-up” incorporated in it, so I’ve also looked for this

# using regular expressions to filter only on case notes with these characters
repair_subset <- repairs_data %>% 
  filter(str_detect(Case.Note, '\\.*foll*.*') |
         str_detect(Case.Note, '\\.*leak*.*') |
         str_detect(Case.Note, '\\.*paint*.*')|
         str_detect(Case.Note, '\\.*plaster*.*')|
         str_detect(Case.Note, '\\.*cabinet*.*')|
         str_detect(Case.Note, '\\.*radiat*.*')|
         str_detect(Case.Note, '\\.*window*.*')|
         str_detect(Case.Note, '\\.*mold*.*')|
         str_detect(Case.Note, '\\.*door*.*')|
         str_detect(Case.Note, '\\.*refrigerat*.*')|
         str_detect(Case.Note, '\\.*exterminat*.*')
        ) %>% 
  mutate(Compliance.Note.Date = as.Date(as.Date(Compliance.Note.Date, format="%d-%b-%y")), format="%b-%d-%Y") %>% 
  select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>%
  arrange(Full.Name, Compliance.Note.Date)

kable(head(repair_subset), align = rep('c', 2)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)
Full.Name Development.Name Compliance.Note.Date Case.Note
Tenant10 Lincoln 2019-06-17 follow up cabinets and ?
Tenant100 Taft 2019-02-11 still needs: front door (NYCHA disputes), still needs sink glazing, fridge, stove, mold
Tenant100 Taft 2019-02-11 follow up - door (24h), sink, bathroom door, fridge, stove, bedroom window, mold,
Tenant100 Taft 2019-03-11 follow up cabinet and paint
Tenant100 Taft 2019-05-15 refrigerator gasket
Tenant100 Taft 2019-06-11 damaged base kitchen cabinet

## [1] "This cut the dataset from 2613 cases to 1240 cases."

Removing extraneous text with more regular expressions

I created a column that counts the number of cases for each tenant, so that it will be easier to group notes later on in the analysis.

# looping through to count how many notes per tenant
repair_subset <- repair_subset[3:nrow(repair_subset),]
repair_subset$ncount <- NA
repair_subset$ncount[1] <- 1

for (i in 2:length(repair_subset$Full.Name)) {
  if(repair_subset$Full.Name[i] == repair_subset$Full.Name[i-1]){
    repair_subset$ncount[i] <- repair_subset$ncount[i-1] + 1
  } else {
    repair_subset$ncount[i] <- 1
  }
}

# removing follow-up and access language from notes
repair_subset <- repair_subset %>% 
  select(Full.Name, Development.Name, ncount, Compliance.Note.Date, Case.Note) %>% 
  mutate(Case.Note = tolower(Case.Note)) %>% 
  mutate(Case.Note = str_replace_all(Case.Note, 'follow\\s?\\-?\\s?up?:?;?\\s?-?\\s?,?\\s?', '')) %>% 
  mutate(Case.Note = str_replace_all(Case.Note, 'access\\s?\\-?\\s?\\/?\\s', ''))

Tidying into cleaner dataframe

repair_fnl <- repair_subset %>% 
  dplyr::select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>% 
  dplyr::arrange(Full.Name, Compliance.Note.Date) %>%
  dplyr::rename("Tenant Name" = Full.Name,
         "NYCHA Development" = Development.Name,
         "Note Date" = Compliance.Note.Date,
         "Case Note" = Case.Note)

kable(head(repair_fnl, n = 20L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)
Tenant Name NYCHA Development Note Date Case Note
Tenant100 Taft 2019-02-11 door (24h), sink, bathroom door, fridge, stove, bedroom window, mold,
Tenant100 Taft 2019-03-11 cabinet and paint
Tenant100 Taft 2019-05-15 refrigerator gasket
Tenant100 Taft 2019-06-11 damaged base kitchen cabinet
Tenant101 Wagner 2019-05-16 bathtub and kitchen window and counter
Tenant101 Wagner 2019-06-14 window
Tenant102 Jefferson 2019-02-11 mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door
Tenant103 Wagner 2019-08-29 to leak in bathroom
Tenant103 Wagner 2019-09-16 leak in bathroom, plaster and paint. due 9/23 but coming to court this friday
Tenant104 Robinson 2019-03-22 leak, exterminate, entrance and bathroom door, radiators, floor tiles
Tenant105 Park Ave E 123 2019-02-05 stove
Tenant106 Wagner 2019-05-31 bathroom enclosure, paint wall, paint kitchen
Tenant107 Wagner 2019-02-19 tub
Tenant107 Wagner 2019-03-19 paint plaster - lm
Tenant109 Wagner Houses 2019-08-24 bathroom pipes leak
Tenant109 Wagner Houses 2019-09-13 windows in apartment need to be fix(hard to open)
Tenant109 Wagner Houses 2019-10-13 bedroom ceiling peeling off
Tenant110 2019-08-26 room windows
Tenant110 2019-08-26 kitchen cabinet
Tenant110 2019-10-11 k sink, mold in bathroom

Data frame is ready, now it’s time to prepare for text analysis

Now that I’ve subsetted the data, I’m ready to create a corpus of the case notes column and prepare these strings for text analysis.

repair_corpus <- VCorpus(VectorSource(repair_fnl$`Case Note`))

repair_corpus <- tm_map(repair_corpus, removeNumbers)
repair_corpus <- tm_map(repair_corpus, removePunctuation)
repair_corpus <- tm_map(repair_corpus, stripWhitespace)
repair_corpus <- tm_map(repair_corpus, removeWords, stopwords())
repair_corpus <- tm_map(repair_corpus, content_transformer(tolower))

As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# tokenize into uni-grams, bi-grams and tri-grams
unigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = UnigramTokenizer))
bigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = BigramTokenizer))
trigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = TrigramTokenizer))

# put into data frame
freq_unigram_repairTDM <- sort(rowSums(as.matrix(unigram_repairTDM)),decreasing = TRUE)
freq_uni_df = data.frame(word=names(freq_unigram_repairTDM), freq=freq_unigram_repairTDM)

freq_bigram_repairTDM <- sort(rowSums(as.matrix(bigram_repairTDM)),decreasing = TRUE)
freq_bi_df = data.frame(word=names(freq_bigram_repairTDM), freq=freq_bigram_repairTDM)

freq_trigram_repairTDM <- sort(rowSums(as.matrix(trigram_repairTDM)),decreasing = TRUE)
freq_tri_df = data.frame(word=names(freq_trigram_repairTDM), freq=freq_trigram_repairTDM)

I looked at term frequency using Term Document Matrices similar to the example in Chapter 10, you can find a snapshot of one of my matrices below.

inspect(unigram_repairTDM[400:480, 60:90])
## <<TermDocumentMatrix (terms: 81, documents: 31)>>
## Non-/sparse entries: 11/2500
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)
## Sample             :
##             Docs
## Terms        60 61 64 68 77 84 85 86 87 89
##   letters     0  0  0  0  0  0  0  0  0  0
##   level       0  0  0  0  0  0  0  0  1  0
##   lever       0  0  0  0  0  0  0  0  0  0
##   light       0  0  0  0  1  0  0  0  0  0
##   living      0  1  0  1  0  0  0  0  0  0
##   management  0  0  1  0  0  0  0  0  0  0
##   mold        0  0  0  0  0  1  0  0  0  0
##   now         0  0  0  0  0  0  0  2  0  0
##   nycha       0  0  0  0  0  0  1  1  0  1
##   office      0  0  1  0  0  0  0  0  0  0

From above, you can see that the terms are mapped out in matrix form.


Plotting the results

ggplot(head(freq_uni_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Unigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note unigrams")

ggplot(head(freq_bi_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note bigrams")

ggplot(head(freq_tri_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Trigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note trigrams")

As you can see, it looks like plaster, paint, and floor tile repairs are some of the most frequent types of repairs that aren’t being followed up on by NYCHA. Also, by looking at n-gram sequences of three, “plaster paint bathroom” gets us even more specificity.


Text analysis to find tenants who had the same repair complaint more than once


Step 1: Create a dataframe with one tenant per row
repair_fnl_grouped <- repair_fnl %>% 
  dplyr::group_by(`Tenant Name`) %>% 
  dplyr::summarise(pooled_cases = paste0(`Case Note`, collapse = " "))

kable(head(repair_fnl_grouped, n = 20L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)
Tenant Name pooled_cases
Tenant100 door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet
Tenant101 bathtub and kitchen window and counter window
Tenant102 mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door
Tenant103 to leak in bathroom leak in bathroom, plaster and paint. due 9/23 but coming to court this friday
Tenant104 leak, exterminate, entrance and bathroom door, radiators, floor tiles
Tenant105 stove
Tenant106 bathroom enclosure, paint wall, paint kitchen
Tenant107 tub paint plaster - lm
Tenant109 bathroom pipes leak windows in apartment need to be fix(hard to open) bedroom ceiling peeling off
Tenant110 room windows kitchen cabinet k sink, mold in bathroom
Tenant111 radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may
Tenant112 leaky pipe in kitchen kitchen cabinets broken exterminate for mice and roaches mold in bathroom bedroom window crack in bathroom wall plastering in living room, bedroom and bathroom
Tenant113 mold plaster paint
Tenant115 bathtub enclosure
Tenant117 broken window in kitchen, living room and master bedroom, kitchen sink stoppage
Tenant118 wrong phone number on stip. she walked in, corrected phone, i sent email
Tenant119 extermination 4/1: according to ms. varner they fixed all the issues that needed to get fixed.
Tenant12 radiator, sink, outlets, plaster cabinet, pipe , paint floor tiles
Tenant120 follow - mold paint and plaster hallway kitchen cabinets paint and plaster bedroom
Tenant121 sent email front door within 24 hours nycha reports work done inside apartment. waiting on new door tenant claims front door still not secure. nycha inspected door again and says it is secure. mold - 30 days

Step 2: Convert to corpus and normalize, stem, and remove stopwords from corpus
repair_corpus_pooled <- VCorpus(VectorSource(repair_fnl_grouped$pooled_cases))

repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeNumbers)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removePunctuation)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, stripWhitespace)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeWords, stopwords())
repair_corpus_pooled <- tm_map(repair_corpus_pooled, content_transformer(tolower))

As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.


Step 3: Create Document Term Matrix, run bi-grams
repair_corpus_pooledDTM <- DocumentTermMatrix(repair_corpus_pooled, control = list(tokenize = BigramTokenizer))
repairs_dups <- data.frame(matrix(findMostFreqTerms(repair_corpus_pooledDTM)))

names(repairs_dups) <- c("repairs")

Tidy data frame to identify documents with repeat bi-grams

#separate the long string of frequencies into multiple columns
repairs_dups_total <- separate(data = repairs_dups, col = repairs, 
                               into = c("repair1", "repair2", "repair3", "repair4", "repair5",
                                        "repair6"), sep = ", ")


# remove extraneous characters using regular expressions
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]][[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[0]?', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, 'numeric', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]]', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[0]?', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, 'numeric', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[c]][[(]]', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[0]?', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, 'numeric', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[c]][[(]]', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[0]?', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, 'numeric', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[c]][[(]]', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[0]?', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, 'numeric', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[c]][[(]]', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[0]?', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, 'numeric', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[c]][[(]]', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[$&+,:;?`@#|<>.^*()%!-]', '')


# merge the frequencies in with the initial grouped dataframe by index
df_fnl <- merge(repair_fnl_grouped, repairs_dups_total, by=0)

# replace all blanks with NAs
df_fnl <- apply(df_fnl, 2, function(x) gsub("^$|^ $", NA, x))

# convert to dataframe
df_fnl <- data.frame(df_fnl)

# create one column of all frequencies
df_fnl$repeats <- paste0(df_fnl$repair1,
                         df_fnl$repair2,
                         df_fnl$repair3,
                         df_fnl$repair4,
                         df_fnl$repair5,
                         df_fnl$repair6)

# replace all character NAs with blanks for final column, as well as any misc. numbers
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[NA]]{1,6}', '')
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[0-9]]{6}', '')

# replace blanks with NAs for final counts
df_fnl$repeats <- as.character(df_fnl$repeats)
df_fnl$repeats[df_fnl$repeats==""] <- NA
df_fnl$repeats <- as.factor(df_fnl$repeats)

# identify those with NAs as FALSE, and those with values as TRUE
df_fnl <- df_fnl %>% 
  select(Tenant.Name, pooled_cases, repeats) %>% 
  mutate(repeat_bool = ifelse(is.na(repeats), FALSE, TRUE))

With the data tidy’d, we can now see which tenants have repeat bi-grams in their case notes.

kable(head(df_fnl, n = 15L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)
Tenant.Name pooled_cases repeats repeat_bool
Tenant100 door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet NA FALSE
Tenant110 room windows kitchen cabinet k sink, mold in bathroom NA FALSE
Tenant200 exterminate roaches NA FALSE
Tenant201 leak, plaster, mold, windows NA FALSE
Tenant202 oven windows in kitchen/living room bedroom/bathroom plaster/paint NA FALSE
Tenant203 plaster bathroom and kitchen paint bathroom and kitchen, defective vent in bathroom, defective sink cabinet, defective kitchen window balance leak bathroom pipe, shower head kitchen sink cabinet, bathtub enclosure, plaster paint bathroom kitchen = 2sink cabinet = 2 TRUE
Tenant204 bathroom pipe, tub enclosure NA FALSE
Tenant205 damaged pipe above toilet, tub enclosure NA FALSE
Tenant207 mold in bathroom, NA FALSE
Tenant208 shower, leak, toilet - phone out of service outlet, fan damaged plaster and bubbling paint plaster paint mold NA FALSE
Tenant209 broken lock on apartment door mold in bathroom broken metal door frame plaster and paint kitchen plaster and paint ceiling and all walls in bathroom ceramic tiles at bathroom floor plaster paint = 2 TRUE
Tenant21 bathtub, closet rack, mold plaster/paint plaster paint, mold removal (bathroom), paint bedroom NA FALSE
Tenant111 radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may radiator work = 2 TRUE
Tenant210 for bathroom mold paint kitchen bathroom and bedroom,1st bedroom door NA FALSE
Tenant211 front door and kitchen NA FALSE

From this, we can calculate the percentage of tenants out of the total that have repeat bi-grams in their case notes document.

percentRepeats_tbl <- data.frame(table(df_fnl$repeat_bool))

names(percentRepeats_tbl) <- c("Repeat Repair Complaint?", "Count")

kable(percentRepeats_tbl, align = rep('c', 2)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)
Repeat Repair Complaint? Count
FALSE 451
TRUE 81
tenantsTotal <- length(df_fnl$repeat_bool)
repeatComplaints <- table(df_fnl$repeat_bool)[[2]]

percentRepeats <- repeatComplaints / tenantsTotal

percentRepeats
## [1] 0.1522556

It looks like 81 tenants (15% of all tenants in the dataset) have repairs that stil have not been followed up on even after multiple housing court appearances.