Text Analysis in R

Ekaterina Kalabush

The required packages

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
setwd("C:/DA/hw2")
library(tm)        # text mining: corpora, DTM/TDM
library(ggplot2)   # plots
library(stopwords) # stop word lists (tidytext's list is used below)
library(stringr)   # string manipulation
library(dplyr)     # data manipulation

Count the total number X of letters in the last name

x <- 'Kalabush'
nchar(x)
## [1] 8
#Number of observations to use: 50 per letter of the last name
y <- nchar(x)*50
y
## [1] 400

Import Data

rent <- as_tibble(data.table::fread("C:/DA/hw2/rent.csv", encoding= "UTF-8"))
my_rent<-rent[1231:1631, ] #rows 1231 through 1631: 401 observations, since both endpoints are included
head(my_rent)
## # A tibble: 6 x 10
##      V1 average_rate_per~ bedrooms_count city   date_of_listing description     
##   <int> <chr>             <chr>          <chr>  <chr>           <chr>           
## 1  1231 $52               1              Dallas April 2016      "Central locati~
## 2  1232 $118              1              Austin October 2016    "Location!\\n- ~
## 3  1233 $142              Studio         Port ~ June 2016       "Come visit Cam~
## 4  1234 $100              3              Van    February 2013   "Beautiful rust~
## 5  1235 $1800             1              Bastr~ June 2014       "Beautiful home~
## 6  1236 $391              2              Corpu~ December 2015   "Village by the~
## # ... with 4 more variables: latitude <dbl>, longitude <dbl>, title <chr>,
## #   url <chr>

Data

#Example of 'description'
my_rent$description[1]
## [1] "Central location w/gated parking. Walk, short Uber, or public transport 2 Downtown/Uptown, Deep Ellum, Fair Park, Lower Greenville, Convention Ctr, Art District, American A/L Ctr, Baylor Med &amp; Dental. Easy commute 2 Design/Medical District. Convenience store, restaurants, concerts/sports very accessible.*Must* be verified &amp; include your check in/out time,if have car,flight info&amp;any special requests. I can easier confirm you w/ this information &amp; coordinate other guests schedules &amp; mine.Thanks!"
dim(my_rent) 
## [1] 401  10

Preparing the dataset

#Identify duplicated descriptions (stringr is already loaded above)
dup_id<-my_rent %>%
  count(description) %>%
  filter(n > 1)

dup_id
## # A tibble: 5 x 2
##   description                                                                  n
##   <chr>                                                                    <int>
## 1 Just mins away from restaurants and 15 mins away from Bush airport! Swi~     2
## 2 My place is good for couples, solo adventurers, and business travelers.      2
## 3 Plan your ultimate Texas escape to this 3-bedroom, 2.5-bathroom vacatio~     2
## 4 Stately English Tudor home circa 1930 period unique architecture acclai~     2
## 5 Welcome to our Air BnB page! We are very excited to Welcome you into ou~     2
rent_dist<-my_rent %>%
  distinct(description, .keep_all = TRUE)
#From now on, use only the distinct rows
my_rent<-rent_dist
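
A quick sanity check (a sketch, not part of the assignment code) confirms that no duplicated descriptions remain:

#No duplicated descriptions should be left after distinct()
sum(duplicated(my_rent$description)) # expect 0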

#Are there URLs (http) in the descriptions?
my_rent_detect<-my_rent %>% 
  filter(str_detect(description,"http"))
my_rent_detect$description[1] #Sample element with http
## [1] "Ocean Grove Condos is a upscale, gated community with unobstructed oceanfront view! This 7th floor condo has 3 bedrooms with 1 king size bed, 2 queen size beds, 2 bathrooms, and fully stocked kitchen. Two assigned underground parking spaces.\\nIn the event this condo is booked on your selected dates checkout my other unit https://www.airbnb.com/rooms/4255632."
#The data also contain literal \\n (newline) sequences and HTML entities such as &amp;.
#Remove URLs, non-ASCII characters (e.g. emoji), and runs of whitespace.
#Note: each intermediate result is reassigned to x so that every substitution takes effect.
removeURL <- function(x) {
  x <- gsub("http[^[:space:]]*", "", x)      # drop URLs
  x <- iconv(x, "latin1", "ASCII", sub="")   # strip non-ASCII characters
  x <- gsub("[ \t]{2,}", " ", x)             # collapse repeated spaces/tabs
  x
}
my_rent$description<-removeURL(my_rent$description)

#Remove escaped newlines
my_rent$description <- str_replace_all(my_rent$description, "\\\\n" , " ")
#Remove the "&amp" entity (the stray ";" left behind is stripped later by removePunctuation)
my_rent$description <- str_replace_all(my_rent$description, "&amp", " ")
#Normalize curly apostrophes to straight ones
my_rent$description <- str_replace_all(my_rent$description, "’", "'")
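
As a quick check (a sketch), confirm that no escaped newlines or "&amp" fragments survived the cleanup:

#Sanity check: count descriptions still containing "\\n" or "&amp"
sum(str_detect(my_rent$description, "\\\\n|&amp")) # expect 0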

Creating “corpus”

#build a corpus
myCorpus <- Corpus(VectorSource(my_rent$description))
myCorpus[[1]]$content #Sample element in corpus
## [1] "Central location w/gated parking. Walk, short Uber, or public transport 2 Downtown/Uptown, Deep Ellum, Fair Park, Lower Greenville, Convention Ctr, Art District, American A/L Ctr, Baylor Med  ; Dental. Easy commute 2 Design/Medical District. Convenience store, restaurants, concerts/sports very accessible.*Must* be verified  ; include your check in/out time,if have car,flight info ;any special requests. I can easier confirm you w/ this information  ; coordinate other guests schedules  ; mine.Thanks!"
#Clean the corpus; function adapted from https://www.kaggle.com/erikbruin/text-mining-the-clinton-and-trump-election-tweets/report
CleanCorpus <- function(x){
  x <- tm_map(x, content_transformer(tolower)) #lowercase everything
  x <- tm_map(x, removeNumbers) #remove numbers before removing words
  x <- tm_map(x, removeWords, tidytext::stop_words$word) #drop common stop words
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, stripWhitespace)
  return(x)
}

myCorpus<-CleanCorpus(myCorpus)
myCorpus[[1]]$content
## [1] "central location gated parking walk short uber public transport downtownuptown deep ellum fair park lower greenville convention ctr art district american ctr baylor med dental easy commute designmedical district convenience store restaurants concertssports accessible verified include check time carflight info special requests easier confirm information coordinate guests schedules mine"

Stemming words

# Keep a copy to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
# Stem words
myCorpus <- tm_map(myCorpus, stemDocument)

# Define an alternative to tm's stemCompletion that works on whole documents
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  # Unexpectedly, stemCompletion completes an empty string to
  # a word in the dictionary; remove empty strings to avoid this.
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary=dictionary)
  x <- paste(x, sep="", collapse=" ")
  PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
# Rebuild the corpus from the text content of each completed document
myCorpus <- Corpus(VectorSource(unlist(lapply(myCorpus, '[[', 1))))

myCorpus[[1]]$content #Sample element in corpus 
## [1] "centrally located gated park walk short uber public transportation downtownuptown deep ellum fair park lower greenville convention ctr art district american ctr baylor medical dental easily commute designmedical district conveniently store restaurants concertssports access verified included check time carflight info special request easier confirm information coordinate guests schedules mine"

Document-Term Matrix

# Build a term-document matrix and return term frequencies, sorted decreasing
CreateTermsMatrix <- function(x) {
  x <- TermDocumentMatrix(x)
  x <- as.matrix(x)
  y <- rowSums(x) # total count of each term across all documents
  y <- sort(y, decreasing=TRUE)
  return(y)
}

TermFreq <- CreateTermsMatrix(myCorpus)

word_df <- data.frame(word=names(TermFreq), count=TermFreq)

dtm <- DocumentTermMatrix(myCorpus)
#Drop sparse terms: keep only terms that occur in more than 10 documents
dtm <- removeSparseTerms(dtm, 1-(10/length(myCorpus)))
dtm
## <<DocumentTermMatrix (documents: 396, terms: 205)>>
## Non-/sparse entries: 5888/75292
## Sparsity           : 93%
## Maximal term length: 14
## Weighting          : term frequency (tf)
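
The sparse threshold looks cryptic, so here is the arithmetic spelled out (a sketch, assuming tm keeps a term only when its document count exceeds n * (1 - sparse)):

n_docs <- length(myCorpus) # 396 documents
sparse <- 1 - (10/n_docs)  # ~0.975
n_docs * (1 - sparse)      # = 10, so kept terms occur in more than 10 documents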
#Inspect a part of the matrix
inspect(dtm[2:5, 7:15])
## <<DocumentTermMatrix (documents: 4, terms: 9)>>
## Non-/sparse entries: 6/30
## Sparsity           : 83%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs gated guests included located medical park restaurants short store
##    2     0      0        0       1       0    1           0     0     0
##    3     0      0        1       1       0    0           1     0     0
##    4     0      0        0       1       0    0           0     0     0
##    5     0      0        0       0       0    0           0     0     0

Identifying frequent words

#The term-document matrix is the transpose of the DTM; allow terms of any length
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))

#Find words that occur at least 25 times
(freq.terms <- findFreqTerms(tdm, lowfreq = 25))
##  [1] "access"       "art"          "centrally"    "district"     "easily"      
##  [6] "guests"       "included"     "located"      "park"         "restaurants" 
## [11] "walk"         "apartment"    "bedroom"      "cable"        "downtown"    
## [16] "drive"        "kitchen"      "minutes"      "pool"         "queen"       
## [21] "setting"      "size"         "tv"           "adventurers"  "beach"       
## [26] "couples"      "distance"     "families"     "kids"         "love"        
## [31] "offers"       "relax"        "solo"         "stay"         "travelers"   
## [36] "beautiful"    "fishing"      "miles"        "worth"        "bars"        
## [41] "bathroom"     "home"         "enjoy"        "live"         "rental"      
## [46] "texas"        "blocks"       "comfortable"  "friends"      "house"       
## [51] "perfect"      "sleeps"       "views"        "water"        "amenities"   
## [56] "condo"        "floor"        "master"       "spacious"     "close"       
## [61] "dallas"       "modern"       "shopping"     "street"       "wifi"        
## [66] "private"      "airport"      "business"     "cities"       "fort"        
## [71] "space"        "stadium"      "neighborhood" "quiet"        "houston"     
## [76] "center"       "austin"       "lake"         "coziness"     "feel"        
## [81] "galveston"    "deck"         "clean"        "pets"
#Prepare for visualization
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 25)
df <- data.frame(term = names(term.freq), freq = term.freq)

Visualizing frequent words

#Data frame with the 20 most frequent words
most_freq<-word_df %>% 
  arrange(desc(count)) %>% 
  top_n(20, count)

most_freq
##                    word count
## bedroom         bedroom   232
## minutes         minutes   176
## home               home   175
## located         located   153
## bathroom       bathroom   133
## downtown       downtown   118
## private         private   109
## park               park   107
## close             close   107
## house             house    94
## walk               walk    88
## beach             beach    87
## miles             miles    86
## kitchen         kitchen    85
## families       families    82
## enjoy             enjoy    76
## restaurants restaurants    74
## access           access    69
## love               love    69
## couples         couples    66
## travelers     travelers    66
#Plot the 20 most frequent words; highlight the 2nd one ("minutes"), whose associations are examined in the next section
ggplot(most_freq, aes(x = reorder(word, count), y = count, fill=factor(ifelse(word==most_freq$word[2], "Highlighted","Normal"))))+
  geom_bar(stat = "identity")+
  scale_fill_manual(name = "term", values=c("red","grey50"))+
  coord_flip()+
  theme(legend.position = "none")+
  xlab("Words") + ylab("Count")

Finding associations

#Find terms associated (correlation >= 0.2) with the 2nd most frequent word, "minutes"
freq_as <- findAssocs(tdm, "minutes", 0.2)
#Convert to a data frame for plotting
freq_as<-as.data.frame(freq_as$minutes) 
freq_as <- data.frame(term = row.names(freq_as), freq_as)
rownames(freq_as) <- NULL
#Top 5 associations in a data frame
freq_as_top5<-freq_as %>% 
  rename(freq=freq_as.minutes) %>% 
  top_n(5, freq)
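
findAssocs reports the Pearson correlation between the frequency vectors of two terms across documents. A minimal sketch verifies one pair by hand ("drive" is taken from the results further below):

#findAssocs values are correlations of term-frequency vectors, rounded to 2 digits
m <- as.matrix(tdm)
round(cor(m["minutes", ], m["drive", ]), 2) # should match the 0.27 reported below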

#Inspect the documents containing the garbled token "airporttdrivetdistance"
#(tab-separated words collapsed into a single token during cleaning)
proof <- data.frame(word = unlist(lapply(myCorpus, as.character)), stringsAsFactors = FALSE)
df_new<-proof %>% 
  filter(str_detect(word,'airporttdrivetdistance'))
#The five strongest "associations" are garbled tokens like the one above, so drop them
#and take the top 5 real associations (ties at 0.27 yield 7 rows)
freq_as_top5<-freq_as[-c(1,2,3,4,5),] %>% 
  rename(freq=freq_as.minutes) %>% 
  top_n(5, freq)
freq_as_top5  
##         term freq
## 1        tcu 0.30
## 2 stockyards 0.29
## 3    jacuzzi 0.29
## 4    rangers 0.28
## 5      drive 0.27
## 6         fw 0.27
## 7   sundance 0.27
#Build a plot of the associations
ggplot(freq_as_top5, aes(x = reorder(term, freq), y = freq))+
  geom_bar(stat = "identity")+
  coord_flip()+
  xlab("Words") + ylab("Correlation with 'minutes'")

Word cloud

library(wordcloud)
#Static word cloud of the 100 most frequent words
wordcloud(word_df$word, word_df$count, max.words = 100, scale=c(2.5,.5), random.color = TRUE, colors=brewer.pal(9,"Set1"))

#Interactive word cloud (word_df is sorted by count, so rows 1:100 are the top 100)
wordcloud2::wordcloud2(word_df[1:100,], color = "random-light", backgroundColor = "grey", shuffle=FALSE, size=0.4)
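
Since wordcloud2 returns an htmlwidget, the interactive cloud can also be saved as a standalone page. A sketch, assuming the htmlwidgets package is installed (the file name is hypothetical):

#Save the interactive cloud to an HTML file (file name is hypothetical)
wc <- wordcloud2::wordcloud2(word_df[1:100,], size=0.4)
htmlwidgets::saveWidget(wc, "wordcloud.html", selfcontained = TRUE)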