Ekaterina Kalabush
library(tidyverse)   # dplyr, stringr, ggplot2, tibble
library(tm)          # Corpus, tm_map, TermDocumentMatrix, ...
library(SnowballC)   # stemming backend for stemDocument()/stemCompletion()
rent <- as_tibble(data.table::fread("C:/DA/hw2/rent.csv", encoding = "UTF-8"))
my_rent<-rent[1231:1631, ]
head(my_rent)
## # A tibble: 6 x 10
## V1 average_rate_per~ bedrooms_count city date_of_listing description
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 1231 $52 1 Dallas April 2016 "Central locati~
## 2 1232 $118 1 Austin October 2016 "Location!\\n- ~
## 3 1233 $142 Studio Port ~ June 2016 "Come visit Cam~
## 4 1234 $100 3 Van February 2013 "Beautiful rust~
## 5 1235 $1800 1 Bastr~ June 2014 "Beautiful home~
## 6 1236 $391 2 Corpu~ December 2015 "Village by the~
## # ... with 4 more variables: latitude <dbl>, longitude <dbl>, title <chr>,
## # url <chr>
## [1] "Central location w/gated parking. Walk, short Uber, or public transport 2 Downtown/Uptown, Deep Ellum, Fair Park, Lower Greenville, Convention Ctr, Art District, American A/L Ctr, Baylor Med & Dental. Easy commute 2 Design/Medical District. Convenience store, restaurants, concerts/sports very accessible.*Must* be verified & include your check in/out time,if have car,flight info&any special requests. I can easier confirm you w/ this information & coordinate other guests schedules & mine.Thanks!"
## [1] 401 10
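# (The description preview and dimensions above come from chunks not shown in
# this report; a sketch of what likely produced them:)
# my_rent$description[1]   # full description of the first listing
# dim(my_rent)             # [1] 401  10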
## # A tibble: 5 x 2
## description n
## <chr> <int>
## 1 Just mins away from restaurants and 15 mins away from Bush airport! Swi~ 2
## 2 My place is good for couples, solo adventurers, and business travelers. 2
## 3 Plan your ultimate Texas escape to this 3-bedroom, 2.5-bathroom vacatio~ 2
## 4 Stately English Tudor home circa 1930 period unique architecture acclai~ 2
## 5 Welcome to our Air BnB page! We are very excited to Welcome you into ou~ 2
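# The duplicate-description table above also comes from an unshown chunk; a
# sketch that would reproduce it:
my_rent %>%
  count(description) %>%
  filter(n > 1)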
rent_dist<-my_rent %>%
  distinct(description, .keep_all = TRUE)
#From here on, use only the distinct listings
my_rent<-rent_dist
#are there URLs (http) in the descriptions?
my_rent_detect<-my_rent %>%
  filter(str_detect(description,"http"))
my_rent_detect$description[1] #Sample element with http
## [1] "Ocean Grove Condos is a upscale, gated community with unobstructed oceanfront view! This 7th floor condo has 3 bedrooms with 1 king size bed, 2 queen size beds, 2 bathrooms, and fully stocked kitchen. Two assigned underground parking spaces.\\nIn the event this condo is booked on your selected dates checkout my other unit https://www.airbnb.com/rooms/4255632."
#The descriptions contain literal \\n (new line) markers and & characters.
#Remove URLs, emoji/other non-ASCII characters, and repeated spaces/tabs
removeURL <- function(x) {
  x <- gsub("http[^[:space:]]*", "", x)      # strip URLs
  x <- iconv(x, "latin1", "ASCII", sub="")   # drop emoji and other non-ASCII
  x <- gsub("[ \t]{2,}", " ", x)             # collapse runs of spaces/tabs
  x
}
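# Quick sanity check of removeURL() on a toy string (not from the dataset):
# the URL is stripped and the whitespace run collapses.
removeURL("see https://example.com \t\t now")
## [1] "see now"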
my_rent$description<-removeURL(my_rent$description)
#remove new lines
my_rent$description <- str_replace_all(my_rent$description, "\\\\n" , " ")
# remove ampersands
my_rent$description <- str_replace_all(my_rent$description, "&", " ")
# normalize curly apostrophes
my_rent$description <- str_replace_all(my_rent$description, "’", "'")
#build a corpus
myCorpus <- Corpus(VectorSource(my_rent$description))
myCorpus[[1]]$content #Sample element in corpus
## [1] "Central location w/gated parking. Walk, short Uber, or public transport 2 Downtown/Uptown, Deep Ellum, Fair Park, Lower Greenville, Convention Ctr, Art District, American A/L Ctr, Baylor Med ; Dental. Easy commute 2 Design/Medical District. Convenience store, restaurants, concerts/sports very accessible.*Must* be verified ; include your check in/out time,if have car,flight info ;any special requests. I can easier confirm you w/ this information ; coordinate other guests schedules ; mine.Thanks!"
#Clean the corpus; function adapted from Kaggle: https://www.kaggle.com/erikbruin/text-mining-the-clinton-and-trump-election-tweets/report
CleanCorpus <- function(x){
  x <- tm_map(x, content_transformer(tolower))
  x <- tm_map(x, removeNumbers) #remove numbers before removing words
  x <- tm_map(x, removeWords, tidytext::stop_words$word)
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, stripWhitespace)
  return(x)
}
myCorpus<-CleanCorpus(myCorpus)
myCorpus[[1]]$content
## [1] "central location gated parking walk short uber public transport downtownuptown deep ellum fair park lower greenville convention ctr art district american ctr baylor med dental easy commute designmedical district convenience store restaurants concertssports accessible verified include check time carflight info special requests easier confirm information coordinate guests schedules mine"
# Keep a copy to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
# Stem words
myCorpus <- tm_map(myCorpus, stemDocument)
# Define an alternative to stemCompletion() from the tm package
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  # Unexpectedly, stemCompletion() completes an empty string to a word in the
  # dictionary, so remove empty strings first to avoid that.
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary=dictionary)
  x <- paste(x, sep="", collapse=" ")
  PlainTextDocument(stripWhitespace(x))
}
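# Toy illustration of the empty-string quirk (invented two-word dictionary, not
# the rental data): "" matches every dictionary entry as a stem, so it too
# would get "completed".
# stemCompletion(c("locat", ""), dictionary = c("located", "location"))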
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
#Rebuild the corpus from the completed texts (first element of each document)
myCorpus <- Corpus(VectorSource(unlist(lapply(myCorpus, '[[', 1))))
myCorpus[[1]]$content #Sample element in corpus
## [1] "centrally located gated park walk short uber public transportation downtownuptown deep ellum fair park lower greenville convention ctr art district american ctr baylor medical dental easily commute designmedical district conveniently store restaurants concertssports access verified included check time carflight info special request easier confirm information coordinate guests schedules mine"
CreateTermsMatrix <- function(x) {
  x <- TermDocumentMatrix(x)
  x <- as.matrix(x)
  y <- rowSums(x)               # total count of each term across all documents
  y <- sort(y, decreasing=TRUE)
  return(y)
}
TermFreq <- CreateTermsMatrix(myCorpus)
word_df <- data.frame(word=names(TermFreq), count=TermFreq)
dtm <- DocumentTermMatrix(myCorpus)
#drop terms that appear in fewer than roughly 10 of the documents
dtm <- removeSparseTerms(dtm, 1-(10/length(myCorpus)))
dtm
## <<DocumentTermMatrix (documents: 396, terms: 205)>>
## Non-/sparse entries: 5888/75292
## Sparsity : 93%
## Maximal term length: 14
## Weighting : term frequency (tf)
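# (The summary and sample below come from a chunk not shown here, presumably an
# inspect() call on a small slice of the dtm, e.g. inspect(dtm[2:5, ...]).)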
## <<DocumentTermMatrix (documents: 4, terms: 9)>>
## Non-/sparse entries: 6/30
## Sparsity : 83%
## Maximal term length: 11
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs gated guests included located medical park restaurants short store
## 2 0 0 0 1 0 1 0 0 0
## 3 0 0 1 1 0 0 1 0 0
## 4 0 0 0 1 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
#tdm is the transpose of dtm; wordLengths = c(1, Inf) keeps even 1-letter terms
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
#Find frequent words with at least 25 occurrences
(freq.terms <- findFreqTerms(tdm, lowfreq = 25))
## [1] "access" "art" "centrally" "district" "easily"
## [6] "guests" "included" "located" "park" "restaurants"
## [11] "walk" "apartment" "bedroom" "cable" "downtown"
## [16] "drive" "kitchen" "minutes" "pool" "queen"
## [21] "setting" "size" "tv" "adventurers" "beach"
## [26] "couples" "distance" "families" "kids" "love"
## [31] "offers" "relax" "solo" "stay" "travelers"
## [36] "beautiful" "fishing" "miles" "worth" "bars"
## [41] "bathroom" "home" "enjoy" "live" "rental"
## [46] "texas" "blocks" "comfortable" "friends" "house"
## [51] "perfect" "sleeps" "views" "water" "amenities"
## [56] "condo" "floor" "master" "spacious" "close"
## [61] "dallas" "modern" "shopping" "street" "wifi"
## [66] "private" "airport" "business" "cities" "fort"
## [71] "space" "stadium" "neighborhood" "quiet" "houston"
## [76] "center" "austin" "lake" "coziness" "feel"
## [81] "galveston" "deck" "clean" "pets"
#data frame with the most frequent words
most_freq<-word_df %>%
  arrange(desc(count)) %>%
  top_n(20, count)
most_freq
## word count
## bedroom bedroom 232
## minutes minutes 176
## home home 175
## located located 153
## bathroom bathroom 133
## downtown downtown 118
## private private 109
## park park 107
## close close 107
## house house 94
## walk walk 88
## beach beach 87
## miles miles 86
## kitchen kitchen 85
## families families 82
## enjoy enjoy 76
## restaurants restaurants 74
## access access 69
## love love 69
## couples couples 66
## travelers travelers 66
#Plot the 20 most frequent words, highlighting the 2nd most frequent ("minutes")
ggplot(most_freq, aes(x = reorder(word, count), y = count, fill=factor(ifelse(word==most_freq$word[2], "Highlighted","Normal"))))+
  geom_bar(stat = "identity")+
  scale_fill_manual(name = "term", values=c("red","grey50"))+
  coord_flip()+
  theme(legend.position = "none")+
  xlab("Words") + ylab("Count")
#find associations with the 2nd most frequent word ("minutes")
freq_as <- findAssocs(tdm, "minutes", 0.2)
#create a data frame for plotting
freq_as <- as.data.frame(freq_as$minutes)
freq_as <- data.frame(term = row.names(freq_as), freq_as)
rownames(freq_as) <- NULL
#top 5 associations in a data frame
freq_as_top5<-freq_as %>%
  rename(freq = freq_as.minutes) %>%
  top_n(5, freq)
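# At this point the strongest "associations" are garbled tokens (words fused
# around stripped tab characters); a quick look, e.g. head(freq_as_top5),
# shows entries like 'airporttdrivetdistance'. The next chunk verifies this.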
#verify that such a garbled token really occurs in the corpus
proof <- data.frame(matrix(unlist(myCorpus), nrow=1001, byrow=TRUE), stringsAsFactors=FALSE)
names(proof)[1]<-'word'
df_new<-proof %>%
  filter(str_detect(word,'airporttdrivetdistance'))
#drop the first 5 rows of freq_as (the garbled tokens), then take the top 5 again
freq_as_top5<-freq_as[-c(1,2,3,4,5),] %>%
  rename(freq = freq_as.minutes) %>%
  top_n(5, freq)  # top_n() keeps ties, so 7 rows survive
freq_as_top5
## term freq
## 1 tcu 0.30
## 2 stockyards 0.29
## 3 jacuzzi 0.29
## 4 rangers 0.28
## 5 drive 0.27
## 6 fw 0.27
## 7 sundance 0.27
#build a plot of the associations
ggplot(freq_as_top5, aes(x = reorder(term, freq), y = freq))+
  geom_bar(stat = "identity", fill = "grey50")+
  coord_flip()+
  xlab("Words") + ylab("Association with 'minutes'")