knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(plyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(stringr)
library(tidyverse)
library(RecordLinkage)
library(tm)
library(RWeka)
repairs_data <- read.csv('https://raw.githubusercontent.com/zachalexander/harlem_nychalists/master/repairs-summary_10-21-2019.csv')
Since we’ll want to filter the data based on notes that only mention some aspect of a repair, we’ll look for stems such as “leak”, “paint”, “plaster”, etc. I’ve also worked with our case manager to document as many follow up notes with the phrase “follow-up” incorporated in it, so I’ve also looked for this
# using regular expressions to filter only on case notes with these characters
repair_subset <- repairs_data %>%
filter(str_detect(Case.Note, '\\.*foll*.*') |
str_detect(Case.Note, '\\.*leak*.*') |
str_detect(Case.Note, '\\.*paint*.*')|
str_detect(Case.Note, '\\.*plaster*.*')|
str_detect(Case.Note, '\\.*cabinet*.*')|
str_detect(Case.Note, '\\.*radiat*.*')|
str_detect(Case.Note, '\\.*window*.*')|
str_detect(Case.Note, '\\.*mold*.*')|
str_detect(Case.Note, '\\.*door*.*')|
str_detect(Case.Note, '\\.*refrigerat*.*')|
str_detect(Case.Note, '\\.*exterminat*.*')
) %>%
mutate(Compliance.Note.Date = as.Date(as.Date(Compliance.Note.Date, format="%d-%b-%y")), format="%b-%d-%Y") %>%
select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>%
arrange(Full.Name, Compliance.Note.Date)
kable(head(repair_subset), align = rep('c', 2)) %>%
kable_styling(bootstrap_options = c("striped"), full_width = F)
| Full.Name | Development.Name | Compliance.Note.Date | Case.Note |
|---|---|---|---|
| Tenant10 | Lincoln | 2019-06-17 | follow up cabinets and ? |
| Tenant100 | Taft | 2019-02-11 | still needs: front door (NYCHA disputes), still needs sink glazing, fridge, stove, mold |
| Tenant100 | Taft | 2019-02-11 | follow up - door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, |
| Tenant100 | Taft | 2019-03-11 | follow up cabinet and paint |
| Tenant100 | Taft | 2019-05-15 | refrigerator gasket |
| Tenant100 | Taft | 2019-06-11 | damaged base kitchen cabinet |
## [1] "This cut the dataset from 2613 cases to 1240 cases."
I created a column that counts the number of cases for each tenant, so that it will be easier to group notes later on in the analysis.
# looping through to count how many notes per tenant
repair_subset <- repair_subset[3:nrow(repair_subset),]
repair_subset$ncount <- NA
repair_subset$ncount[1] <- 1
for (i in 2:length(repair_subset$Full.Name)) {
if(repair_subset$Full.Name[i] == repair_subset$Full.Name[i-1]){
repair_subset$ncount[i] <- repair_subset$ncount[i-1] + 1
} else {
repair_subset$ncount[i] <- 1
}
}
# removing follow-up and access language from notes
repair_subset <- repair_subset %>%
select(Full.Name, Development.Name, ncount, Compliance.Note.Date, Case.Note) %>%
mutate(Case.Note = tolower(Case.Note)) %>%
mutate(Case.Note = str_replace_all(Case.Note, 'follow\\s?\\-?\\s?up?:?;?\\s?-?\\s?,?\\s?', '')) %>%
mutate(Case.Note = str_replace_all(Case.Note, 'access\\s?\\-?\\s?\\/?\\s', ''))
repair_fnl <- repair_subset %>%
dplyr::select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>%
dplyr::arrange(Full.Name, Compliance.Note.Date) %>%
dplyr::rename("Tenant Name" = Full.Name,
"NYCHA Development" = Development.Name,
"Note Date" = Compliance.Note.Date,
"Case Note" = Case.Note)
kable(head(repair_fnl, n = 20L), align = rep('c', 4)) %>%
kable_styling(bootstrap_options = c("striped"), full_width = F)
| Tenant Name | NYCHA Development | Note Date | Case Note |
|---|---|---|---|
| Tenant100 | Taft | 2019-02-11 | door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, |
| Tenant100 | Taft | 2019-03-11 | cabinet and paint |
| Tenant100 | Taft | 2019-05-15 | refrigerator gasket |
| Tenant100 | Taft | 2019-06-11 | damaged base kitchen cabinet |
| Tenant101 | Wagner | 2019-05-16 | bathtub and kitchen window and counter |
| Tenant101 | Wagner | 2019-06-14 | window |
| Tenant102 | Jefferson | 2019-02-11 | mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door |
| Tenant103 | Wagner | 2019-08-29 | to leak in bathroom |
| Tenant103 | Wagner | 2019-09-16 | leak in bathroom, plaster and paint. due 9/23 but coming to court this friday |
| Tenant104 | Robinson | 2019-03-22 | leak, exterminate, entrance and bathroom door, radiators, floor tiles |
| Tenant105 | Park Ave E 123 | 2019-02-05 | stove |
| Tenant106 | Wagner | 2019-05-31 | bathroom enclosure, paint wall, paint kitchen |
| Tenant107 | Wagner | 2019-02-19 | tub |
| Tenant107 | Wagner | 2019-03-19 | paint plaster - lm |
| Tenant109 | Wagner Houses | 2019-08-24 | bathroom pipes leak |
| Tenant109 | Wagner Houses | 2019-09-13 | windows in apartment need to be fix(hard to open) |
| Tenant109 | Wagner Houses | 2019-10-13 | bedroom ceiling peeling off |
| Tenant110 | 2019-08-26 | room windows | |
| Tenant110 | 2019-08-26 | kitchen cabinet | |
| Tenant110 | 2019-10-11 | k sink, mold in bathroom |
Now that I’ve subsetted the data, I’m ready to create a corpus of the case notes column and prepare these strings for text analysis.
repair_corpus <- VCorpus(VectorSource(repair_fnl$`Case Note`))
repair_corpus <- tm_map(repair_corpus, removeNumbers)
repair_corpus <- tm_map(repair_corpus, removePunctuation)
repair_corpus <- tm_map(repair_corpus, stripWhitespace)
repair_corpus <- tm_map(repair_corpus, removeWords, stopwords())
repair_corpus <- tm_map(repair_corpus, content_transformer(tolower))
As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# tokenize into uni-grams, bi-grams and tri-grams
unigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = UnigramTokenizer))
bigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = BigramTokenizer))
trigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = TrigramTokenizer))
# put into data frame
freq_unigram_repairTDM <- sort(rowSums(as.matrix(unigram_repairTDM)),decreasing = TRUE)
freq_uni_df = data.frame(word=names(freq_unigram_repairTDM), freq=freq_unigram_repairTDM)
freq_bigram_repairTDM <- sort(rowSums(as.matrix(bigram_repairTDM)),decreasing = TRUE)
freq_bi_df = data.frame(word=names(freq_bigram_repairTDM), freq=freq_bigram_repairTDM)
freq_trigram_repairTDM <- sort(rowSums(as.matrix(trigram_repairTDM)),decreasing = TRUE)
freq_tri_df = data.frame(word=names(freq_trigram_repairTDM), freq=freq_trigram_repairTDM)
I looked at term frequency using Term Document Matrices similar to the example in Chapter 10, you can find a snapshot of one of my matrices below.
inspect(unigram_repairTDM[400:480, 60:90])
## <<TermDocumentMatrix (terms: 81, documents: 31)>>
## Non-/sparse entries: 11/2500
## Sparsity : 100%
## Maximal term length: 14
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 60 61 64 68 77 84 85 86 87 89
## letters 0 0 0 0 0 0 0 0 0 0
## level 0 0 0 0 0 0 0 0 1 0
## lever 0 0 0 0 0 0 0 0 0 0
## light 0 0 0 0 1 0 0 0 0 0
## living 0 1 0 1 0 0 0 0 0 0
## management 0 0 1 0 0 0 0 0 0 0
## mold 0 0 0 0 0 1 0 0 0 0
## now 0 0 0 0 0 0 0 2 0 0
## nycha 0 0 0 0 0 0 1 1 0 1
## office 0 0 1 0 0 0 0 0 0 0
From above, you can see that the terms are mapped out in matrix form.
ggplot(head(freq_uni_df, 15), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Unigrams") + ylab("Frequency") +
geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
ggtitle("Most frequent repair-note unigrams")
ggplot(head(freq_bi_df, 15), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Bigrams") + ylab("Frequency") +
geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
ggtitle("Most frequent repair-note bigrams")
ggplot(head(freq_tri_df, 15), aes(reorder(word,freq), freq)) +
geom_bar(stat = "identity") + coord_flip() +
xlab("Trigrams") + ylab("Frequency") +
geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
ggtitle("Most frequent repair-note trigrams")
As you can see, it looks like plaster, paint, and floor tile repairs are some of the most frequent types of repairs that aren’t being followed up on by NYCHA. Also, by looking at n-gram sequences of three, “plaster paint bathroom” gets us even more specificity.
repair_fnl_grouped <- repair_fnl %>%
dplyr::group_by(`Tenant Name`) %>%
dplyr::summarise(pooled_cases = paste0(`Case Note`, collapse = " "))
kable(head(repair_fnl_grouped, n = 20L), align = rep('c', 4)) %>%
kable_styling(bootstrap_options = c("striped"), full_width = F)
| Tenant Name | pooled_cases |
|---|---|
| Tenant100 | door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet |
| Tenant101 | bathtub and kitchen window and counter window |
| Tenant102 | mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door |
| Tenant103 | to leak in bathroom leak in bathroom, plaster and paint. due 9/23 but coming to court this friday |
| Tenant104 | leak, exterminate, entrance and bathroom door, radiators, floor tiles |
| Tenant105 | stove |
| Tenant106 | bathroom enclosure, paint wall, paint kitchen |
| Tenant107 | tub paint plaster - lm |
| Tenant109 | bathroom pipes leak windows in apartment need to be fix(hard to open) bedroom ceiling peeling off |
| Tenant110 | room windows kitchen cabinet k sink, mold in bathroom |
| Tenant111 | radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may |
| Tenant112 | leaky pipe in kitchen kitchen cabinets broken exterminate for mice and roaches mold in bathroom bedroom window crack in bathroom wall plastering in living room, bedroom and bathroom |
| Tenant113 | mold plaster paint |
| Tenant115 | bathtub enclosure |
| Tenant117 | broken window in kitchen, living room and master bedroom, kitchen sink stoppage |
| Tenant118 | wrong phone number on stip. she walked in, corrected phone, i sent email |
| Tenant119 | extermination 4/1: according to ms. varner they fixed all the issues that needed to get fixed. |
| Tenant12 | radiator, sink, outlets, plaster cabinet, pipe , paint floor tiles |
| Tenant120 | follow - mold paint and plaster hallway kitchen cabinets paint and plaster bedroom |
| Tenant121 | sent email front door within 24 hours nycha reports work done inside apartment. waiting on new door tenant claims front door still not secure. nycha inspected door again and says it is secure. mold - 30 days |
repair_corpus_pooled <- VCorpus(VectorSource(repair_fnl_grouped$pooled_cases))
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeNumbers)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removePunctuation)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, stripWhitespace)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeWords, stopwords())
repair_corpus_pooled <- tm_map(repair_corpus_pooled, content_transformer(tolower))
As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.
repair_corpus_pooledDTM <- DocumentTermMatrix(repair_corpus_pooled, control = list(tokenize = BigramTokenizer))
repairs_dups <- data.frame(matrix(findMostFreqTerms(repair_corpus_pooledDTM)))
names(repairs_dups) <- c("repairs")
#separate the long string of frequencies into multiple columns
repairs_dups_total <- separate(data = repairs_dups, col = repairs,
into = c("repair1", "repair2", "repair3", "repair4", "repair5",
"repair6"), sep = ", ")
# remove extraneous characters using regular expressions
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]][[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[0]?', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, 'numeric', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]]', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[$&+,:;?`@#|<>.^*()%!-]', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[0]?', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, 'numeric', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[c]][[(]]', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[$&+,:;?`@#|<>.^*()%!-]', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[0]?', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, 'numeric', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[c]][[(]]', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[$&+,:;?`@#|<>.^*()%!-]', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[0]?', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, 'numeric', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[c]][[(]]', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[$&+,:;?`@#|<>.^*()%!-]', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[0]?', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, 'numeric', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[c]][[(]]', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[$&+,:;?`@#|<>.^*()%!-]', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[0]?', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, 'numeric', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[c]][[(]]', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[$&+,:;?`@#|<>.^*()%!-]', '')
# merge the frequencies in with the initial grouped dataframe by index
df_fnl <- merge(repair_fnl_grouped, repairs_dups_total, by=0)
# replace all blanks with NAs
df_fnl <- apply(df_fnl, 2, function(x) gsub("^$|^ $", NA, x))
# convert to dataframe
df_fnl <- data.frame(df_fnl)
# create one column of all frequencies
df_fnl$repeats <- paste0(df_fnl$repair1,
df_fnl$repair2,
df_fnl$repair3,
df_fnl$repair4,
df_fnl$repair5,
df_fnl$repair6)
# replace all character NAs with blanks for final column, as well as any misc. numbers
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[NA]]{1,6}', '')
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[0-9]]{6}', '')
# replace blanks with NAs for final counts
df_fnl$repeats <- as.character(df_fnl$repeats)
df_fnl$repeats[df_fnl$repeats==""] <- NA
df_fnl$repeats <- as.factor(df_fnl$repeats)
# identify those with NAs as FALSE, and those with values as TRUE
df_fnl <- df_fnl %>%
select(Tenant.Name, pooled_cases, repeats) %>%
mutate(repeat_bool = ifelse(is.na(repeats), FALSE, TRUE))
With the data tidy’d, we can now see which tenants have repeat bi-grams in their case notes.
kable(head(df_fnl, n = 15L), align = rep('c', 4)) %>%
kable_styling(bootstrap_options = c("striped"), full_width = F)
| Tenant.Name | pooled_cases | repeats | repeat_bool |
|---|---|---|---|
| Tenant100 | door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet | NA | FALSE |
| Tenant110 | room windows kitchen cabinet k sink, mold in bathroom | NA | FALSE |
| Tenant200 | exterminate roaches | NA | FALSE |
| Tenant201 | leak, plaster, mold, windows | NA | FALSE |
| Tenant202 | oven windows in kitchen/living room bedroom/bathroom plaster/paint | NA | FALSE |
| Tenant203 | plaster bathroom and kitchen paint bathroom and kitchen, defective vent in bathroom, defective sink cabinet, defective kitchen window balance leak bathroom pipe, shower head kitchen sink cabinet, bathtub enclosure, plaster paint | bathroom kitchen = 2sink cabinet = 2 | TRUE |
| Tenant204 | bathroom pipe, tub enclosure | NA | FALSE |
| Tenant205 | damaged pipe above toilet, tub enclosure | NA | FALSE |
| Tenant207 | mold in bathroom, | NA | FALSE |
| Tenant208 | shower, leak, toilet - phone out of service outlet, fan damaged plaster and bubbling paint plaster paint mold | NA | FALSE |
| Tenant209 | broken lock on apartment door mold in bathroom broken metal door frame plaster and paint kitchen plaster and paint ceiling and all walls in bathroom ceramic tiles at bathroom floor | plaster paint = 2 | TRUE |
| Tenant21 | bathtub, closet rack, mold plaster/paint plaster paint, mold removal (bathroom), paint bedroom | NA | FALSE |
| Tenant111 | radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may | radiator work = 2 | TRUE |
| Tenant210 | for bathroom mold paint kitchen bathroom and bedroom,1st bedroom door | NA | FALSE |
| Tenant211 | front door and kitchen | NA | FALSE |
From this, we can calculate the percentage of tenants out of the total that have repeat bi-grams in their case notes document.
percentRepeats_tbl <- data.frame(table(df_fnl$repeat_bool))
names(percentRepeats_tbl) <- c("Repeat Repair Complaint?", "Count")
kable(percentRepeats_tbl, align = rep('c', 2)) %>%
kable_styling(bootstrap_options = c("striped"), full_width = F)
| Repeat Repair Complaint? | Count |
|---|---|
| FALSE | 451 |
| TRUE | 81 |
tenantsTotal <- length(df_fnl$repeat_bool)
repeatComplaints <- table(df_fnl$repeat_bool)[[2]]
percentRepeats <- repeatComplaints / tenantsTotal
percentRepeats
## [1] 0.1522556
It looks like 81 tenants (15% of all tenants in the dataset) have repairs that stil have not been followed up on even after multiple housing court appearances.