DATA 607 - Data Science In Context

Libraries Used

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(plyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(stringr)
library(tidyverse)
library(RecordLinkage)
library(tm)
library(RWeka)

Reading in data after SQL pull

repairs_data <- read.csv('https://raw.githubusercontent.com/zachalexander/harlem_nychalists/master/repairs-summary_10-21-2019.csv')

Filtering raw data on relevant cases

Since we’ll want to filter the data based on notes that only mention some aspect of a repair, we’ll look for stems such as “leak”, “paint”, “plaster”, etc. I’ve also worked with our case manager to document as many follow up notes with the phrase “follow-up” incorporated in it, so I’ve also looked for this

# using regular expressions to filter only on case notes with these characters
repair_subset <- repairs_data %>% 
  filter(str_detect(Case.Note, '\\.*foll*.*') |
         str_detect(Case.Note, '\\.*leak*.*') |
         str_detect(Case.Note, '\\.*paint*.*')|
         str_detect(Case.Note, '\\.*plaster*.*')|
         str_detect(Case.Note, '\\.*cabinet*.*')|
         str_detect(Case.Note, '\\.*radiat*.*')|
         str_detect(Case.Note, '\\.*window*.*')|
         str_detect(Case.Note, '\\.*mold*.*')|
         str_detect(Case.Note, '\\.*door*.*')|
         str_detect(Case.Note, '\\.*refrigerat*.*')|
         str_detect(Case.Note, '\\.*exterminat*.*')
        ) %>% 
  mutate(Compliance.Note.Date = as.Date(as.Date(Compliance.Note.Date, format="%d-%b-%y")), format="%b-%d-%Y") %>% 
  select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>%
  arrange(Full.Name, Compliance.Note.Date)

kable(head(repair_subset), align = rep('c', 2)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)

Full.Name	Development.Name	Compliance.Note.Date	Case.Note
Tenant10	Lincoln	2019-06-17	follow up cabinets and ?
Tenant100	Taft	2019-02-11	still needs: front door (NYCHA disputes), still needs sink glazing, fridge, stove, mold
Tenant100	Taft	2019-02-11	follow up - door (24h), sink, bathroom door, fridge, stove, bedroom window, mold,
Tenant100	Taft	2019-03-11	follow up cabinet and paint
Tenant100	Taft	2019-05-15	refrigerator gasket
Tenant100	Taft	2019-06-11	damaged base kitchen cabinet

## [1] "This cut the dataset from 2613 cases to 1240 cases."

Removing extraneous text with more regular expressions

I created a column that counts the number of cases for each tenant, so that it will be easier to group notes later on in the analysis.

# looping through to count how many notes per tenant
repair_subset <- repair_subset[3:nrow(repair_subset),]
repair_subset$ncount <- NA
repair_subset$ncount[1] <- 1

for (i in 2:length(repair_subset$Full.Name)) {
  if(repair_subset$Full.Name[i] == repair_subset$Full.Name[i-1]){
    repair_subset$ncount[i] <- repair_subset$ncount[i-1] + 1
  } else {
    repair_subset$ncount[i] <- 1
  }
}

# removing follow-up and access language from notes
repair_subset <- repair_subset %>% 
  select(Full.Name, Development.Name, ncount, Compliance.Note.Date, Case.Note) %>% 
  mutate(Case.Note = tolower(Case.Note)) %>% 
  mutate(Case.Note = str_replace_all(Case.Note, 'follow\\s?\\-?\\s?up?:?;?\\s?-?\\s?,?\\s?', '')) %>% 
  mutate(Case.Note = str_replace_all(Case.Note, 'access\\s?\\-?\\s?\\/?\\s', ''))

Tidying into cleaner dataframe

repair_fnl <- repair_subset %>% 
  dplyr::select(Full.Name, Development.Name, Compliance.Note.Date, Case.Note) %>% 
  dplyr::arrange(Full.Name, Compliance.Note.Date) %>%
  dplyr::rename("Tenant Name" = Full.Name,
         "NYCHA Development" = Development.Name,
         "Note Date" = Compliance.Note.Date,
         "Case Note" = Case.Note)

kable(head(repair_fnl, n = 20L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)

Tenant Name	NYCHA Development	Note Date	Case Note
Tenant100	Taft	2019-02-11	door (24h), sink, bathroom door, fridge, stove, bedroom window, mold,
Tenant100	Taft	2019-03-11	cabinet and paint
Tenant100	Taft	2019-05-15	refrigerator gasket
Tenant100	Taft	2019-06-11	damaged base kitchen cabinet
Tenant101	Wagner	2019-05-16	bathtub and kitchen window and counter
Tenant101	Wagner	2019-06-14	window
Tenant102	Jefferson	2019-02-11	mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door
Tenant103	Wagner	2019-08-29	to leak in bathroom
Tenant103	Wagner	2019-09-16	leak in bathroom, plaster and paint. due 9/23 but coming to court this friday
Tenant104	Robinson	2019-03-22	leak, exterminate, entrance and bathroom door, radiators, floor tiles
Tenant105	Park Ave E 123	2019-02-05	stove
Tenant106	Wagner	2019-05-31	bathroom enclosure, paint wall, paint kitchen
Tenant107	Wagner	2019-02-19	tub
Tenant107	Wagner	2019-03-19	paint plaster - lm
Tenant109	Wagner Houses	2019-08-24	bathroom pipes leak
Tenant109	Wagner Houses	2019-09-13	windows in apartment need to be fix(hard to open)
Tenant109	Wagner Houses	2019-10-13	bedroom ceiling peeling off
Tenant110		2019-08-26	room windows
Tenant110		2019-08-26	kitchen cabinet
Tenant110		2019-10-11	k sink, mold in bathroom

Data frame is ready, now it’s time to prepare for text analysis

Now that I’ve subsetted the data, I’m ready to create a corpus of the case notes column and prepare these strings for text analysis.

repair_corpus <- VCorpus(VectorSource(repair_fnl$`Case Note`))

repair_corpus <- tm_map(repair_corpus, removeNumbers)
repair_corpus <- tm_map(repair_corpus, removePunctuation)
repair_corpus <- tm_map(repair_corpus, stripWhitespace)
repair_corpus <- tm_map(repair_corpus, removeWords, stopwords())
repair_corpus <- tm_map(repair_corpus, content_transformer(tolower))

As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# tokenize into uni-grams, bi-grams and tri-grams
unigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = UnigramTokenizer))
bigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = BigramTokenizer))
trigram_repairTDM <- TermDocumentMatrix(repair_corpus, control = list(tokenize = TrigramTokenizer))

# put into data frame
freq_unigram_repairTDM <- sort(rowSums(as.matrix(unigram_repairTDM)),decreasing = TRUE)
freq_uni_df = data.frame(word=names(freq_unigram_repairTDM), freq=freq_unigram_repairTDM)

freq_bigram_repairTDM <- sort(rowSums(as.matrix(bigram_repairTDM)),decreasing = TRUE)
freq_bi_df = data.frame(word=names(freq_bigram_repairTDM), freq=freq_bigram_repairTDM)

freq_trigram_repairTDM <- sort(rowSums(as.matrix(trigram_repairTDM)),decreasing = TRUE)
freq_tri_df = data.frame(word=names(freq_trigram_repairTDM), freq=freq_trigram_repairTDM)

I looked at term frequency using Term Document Matrices similar to the example in Chapter 10, you can find a snapshot of one of my matrices below.

inspect(unigram_repairTDM[400:480, 60:90])

## <<TermDocumentMatrix (terms: 81, documents: 31)>>
## Non-/sparse entries: 11/2500
## Sparsity           : 100%
## Maximal term length: 14
## Weighting          : term frequency (tf)
## Sample             :
##             Docs
## Terms        60 61 64 68 77 84 85 86 87 89
##   letters     0  0  0  0  0  0  0  0  0  0
##   level       0  0  0  0  0  0  0  0  1  0
##   lever       0  0  0  0  0  0  0  0  0  0
##   light       0  0  0  0  1  0  0  0  0  0
##   living      0  1  0  1  0  0  0  0  0  0
##   management  0  0  1  0  0  0  0  0  0  0
##   mold        0  0  0  0  0  1  0  0  0  0
##   now         0  0  0  0  0  0  0  2  0  0
##   nycha       0  0  0  0  0  0  1  1  0  1
##   office      0  0  1  0  0  0  0  0  0  0

From above, you can see that the terms are mapped out in matrix form.

Plotting the results

ggplot(head(freq_uni_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Unigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note unigrams")

ggplot(head(freq_bi_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note bigrams")

ggplot(head(freq_tri_df, 15), aes(reorder(word,freq), freq)) +
  geom_bar(stat = "identity") + coord_flip() +
  xlab("Trigrams") + ylab("Frequency") +
  geom_text(aes(label=freq), vjust=0.5, hjust=1.25, position = position_dodge(width = 0.9), color="white", fontface="bold") +
  ggtitle("Most frequent repair-note trigrams")

As you can see, it looks like plaster, paint, and floor tile repairs are some of the most frequent types of repairs that aren’t being followed up on by NYCHA. Also, by looking at n-gram sequences of three, “plaster paint bathroom” gets us even more specificity.

Text analysis to find tenants who had the same repair complaint more than once

Step 1: Create a dataframe with one tenant per row

repair_fnl_grouped <- repair_fnl %>% 
  dplyr::group_by(`Tenant Name`) %>% 
  dplyr::summarise(pooled_cases = paste0(`Case Note`, collapse = " "))

kable(head(repair_fnl_grouped, n = 20L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)

Tenant Name	pooled_cases
Tenant100	door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet
Tenant101	bathtub and kitchen window and counter window
Tenant102	mold in bathroom, leak behind bathroom wall, kitchen sink cabinets water damage, upper cabinet door
Tenant103	to leak in bathroom leak in bathroom, plaster and paint. due 9/23 but coming to court this friday
Tenant104	leak, exterminate, entrance and bathroom door, radiators, floor tiles
Tenant105	stove
Tenant106	bathroom enclosure, paint wall, paint kitchen
Tenant107	tub paint plaster - lm
Tenant109	bathroom pipes leak windows in apartment need to be fix(hard to open) bedroom ceiling peeling off
Tenant110	room windows kitchen cabinet k sink, mold in bathroom
Tenant111	radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may
Tenant112	leaky pipe in kitchen kitchen cabinets broken exterminate for mice and roaches mold in bathroom bedroom window crack in bathroom wall plastering in living room, bedroom and bathroom
Tenant113	mold plaster paint
Tenant115	bathtub enclosure
Tenant117	broken window in kitchen, living room and master bedroom, kitchen sink stoppage
Tenant118	wrong phone number on stip. she walked in, corrected phone, i sent email
Tenant119	extermination 4/1: according to ms. varner they fixed all the issues that needed to get fixed.
Tenant12	radiator, sink, outlets, plaster cabinet, pipe , paint floor tiles
Tenant120	follow - mold paint and plaster hallway kitchen cabinets paint and plaster bedroom
Tenant121	sent email front door within 24 hours nycha reports work done inside apartment. waiting on new door tenant claims front door still not secure. nycha inspected door again and says it is secure. mold - 30 days

Step 2: Convert to corpus and normalize, stem, and remove stopwords from corpus

repair_corpus_pooled <- VCorpus(VectorSource(repair_fnl_grouped$pooled_cases))

repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeNumbers)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removePunctuation)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, stripWhitespace)
repair_corpus_pooled <- tm_map(repair_corpus_pooled, removeWords, stopwords())
repair_corpus_pooled <- tm_map(repair_corpus_pooled, content_transformer(tolower))

As mentioned in Chapter 10 of Data Science for Business, I normalized, stemmed, removed stopwords, stripped white space, and removed punctuation from the corpus, in order to make my n-gram analysis more effective.

Step 3: Create Document Term Matrix, run bi-grams

repair_corpus_pooledDTM <- DocumentTermMatrix(repair_corpus_pooled, control = list(tokenize = BigramTokenizer))
repairs_dups <- data.frame(matrix(findMostFreqTerms(repair_corpus_pooledDTM)))

names(repairs_dups) <- c("repairs")

Tidy data frame to identify documents with repeat bi-grams

#separate the long string of frequencies into multiple columns
repairs_dups_total <- separate(data = repairs_dups, col = repairs, 
                               into = c("repair1", "repair2", "repair3", "repair4", "repair5",
                                        "repair6"), sep = ", ")


# remove extraneous characters using regular expressions
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]][[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[0]?', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, 'numeric', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[[c]][[(]]', '')
repairs_dups_total$repair1 <- str_replace_all(repairs_dups_total$repair1, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[0]?', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, 'numeric', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[[c]][[(]]', '')
repairs_dups_total$repair2 <- str_replace_all(repairs_dups_total$repair2, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[0]?', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, 'numeric', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[[c]][[(]]', '')
repairs_dups_total$repair3 <- str_replace_all(repairs_dups_total$repair3, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[0]?', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, 'numeric', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[[c]][[(]]', '')
repairs_dups_total$repair4 <- str_replace_all(repairs_dups_total$repair4, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[0]?', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, 'numeric', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[[c]][[(]]', '')
repairs_dups_total$repair5 <- str_replace_all(repairs_dups_total$repair5, '[$&+,:;?`@#|<>.^*()%!-]', '')

repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[`]]\\w+\\s\\w+.\\s=\\s1', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[0]?', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, 'numeric', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[[c]][[(]]', '')
repairs_dups_total$repair6 <- str_replace_all(repairs_dups_total$repair6, '[$&+,:;?`@#|<>.^*()%!-]', '')


# merge the frequencies in with the initial grouped dataframe by index
df_fnl <- merge(repair_fnl_grouped, repairs_dups_total, by=0)

# replace all blanks with NAs
df_fnl <- apply(df_fnl, 2, function(x) gsub("^$|^ $", NA, x))

# convert to dataframe
df_fnl <- data.frame(df_fnl)

# create one column of all frequencies
df_fnl$repeats <- paste0(df_fnl$repair1,
                         df_fnl$repair2,
                         df_fnl$repair3,
                         df_fnl$repair4,
                         df_fnl$repair5,
                         df_fnl$repair6)

# replace all character NAs with blanks for final column, as well as any misc. numbers
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[NA]]{1,6}', '')
df_fnl$repeats <- str_replace_all(df_fnl$repeats, '[[0-9]]{6}', '')

# replace blanks with NAs for final counts
df_fnl$repeats <- as.character(df_fnl$repeats)
df_fnl$repeats[df_fnl$repeats==""] <- NA
df_fnl$repeats <- as.factor(df_fnl$repeats)

# identify those with NAs as FALSE, and those with values as TRUE
df_fnl <- df_fnl %>% 
  select(Tenant.Name, pooled_cases, repeats) %>% 
  mutate(repeat_bool = ifelse(is.na(repeats), FALSE, TRUE))

With the data tidy’d, we can now see which tenants have repeat bi-grams in their case notes.

kable(head(df_fnl, n = 15L), align = rep('c', 4)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)

Tenant.Name	pooled_cases	repeats	repeat_bool
Tenant100	door (24h), sink, bathroom door, fridge, stove, bedroom window, mold, cabinet and paint refrigerator gasket damaged base kitchen cabinet	NA	FALSE
Tenant110	room windows kitchen cabinet k sink, mold in bathroom	NA	FALSE
Tenant200	exterminate roaches	NA	FALSE
Tenant201	leak, plaster, mold, windows	NA	FALSE
Tenant202	oven windows in kitchen/living room bedroom/bathroom plaster/paint	NA	FALSE
Tenant203	plaster bathroom and kitchen paint bathroom and kitchen, defective vent in bathroom, defective sink cabinet, defective kitchen window balance leak bathroom pipe, shower head kitchen sink cabinet, bathtub enclosure, plaster paint	bathroom kitchen = 2sink cabinet = 2	TRUE
Tenant204	bathroom pipe, tub enclosure	NA	FALSE
Tenant205	damaged pipe above toilet, tub enclosure	NA	FALSE
Tenant207	mold in bathroom,	NA	FALSE
Tenant208	shower, leak, toilet - phone out of service outlet, fan damaged plaster and bubbling paint plaster paint mold	NA	FALSE
Tenant209	broken lock on apartment door mold in bathroom broken metal door frame plaster and paint kitchen plaster and paint ceiling and all walls in bathroom ceramic tiles at bathroom floor	plaster paint = 2	TRUE
Tenant21	bathtub, closet rack, mold plaster/paint plaster paint, mold removal (bathroom), paint bedroom	NA	FALSE
Tenant111	radiator work not completed. emailed nycha. asked her to hold off on osc sent email re may painting date radiator work done. bathroom ceiling scheduled for may	radiator work = 2	TRUE
Tenant210	for bathroom mold paint kitchen bathroom and bedroom,1st bedroom door	NA	FALSE
Tenant211	front door and kitchen	NA	FALSE

From this, we can calculate the percentage of tenants out of the total that have repeat bi-grams in their case notes document.

percentRepeats_tbl <- data.frame(table(df_fnl$repeat_bool))

names(percentRepeats_tbl) <- c("Repeat Repair Complaint?", "Count")

kable(percentRepeats_tbl, align = rep('c', 2)) %>% 
  kable_styling(bootstrap_options = c("striped"), full_width = F)

Repeat Repair Complaint?	Count
FALSE	451
TRUE	81

tenantsTotal <- length(df_fnl$repeat_bool)
repeatComplaints <- table(df_fnl$repeat_bool)[[2]]

percentRepeats <- repeatComplaints / tenantsTotal

percentRepeats

## [1] 0.1522556

It looks like 81 tenants (15% of all tenants in the dataset) have repairs that stil have not been followed up on even after multiple housing court appearances.

DATA 607 - Data Science In Context

Zach Alexander

November 4, 2019

Libraries Used

Reading in data after SQL pull

Filtering raw data on relevant cases

Removing extraneous text with more regular expressions

Tidying into cleaner dataframe

Data frame is ready, now it’s time to prepare for text analysis

Plotting the results

Text analysis to find tenants who had the same repair complaint more than once

Step 1: Create a dataframe with one tenant per row

Step 2: Convert to corpus and normalize, stem, and remove stopwords from corpus

Step 3: Create Document Term Matrix, run bi-grams

Tidy data frame to identify documents with repeat bi-grams