library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)
# Load the listings file. Text columns are kept as character vectors (not
# factors) because later steps clean them with gsub() and tokenise them.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
Airbnb <- read.csv("listings.csv", header = TRUE, stringsAsFactors = FALSE)
# Print the column names to see which fields are available.
names(Airbnb)
## [1] "id" "listing_url"
## [3] "scrape_id" "last_scraped"
## [5] "name" "summary"
## [7] "space" "description"
## [9] "experiences_offered" "neighborhood_overview"
## [11] "notes" "transit"
## [13] "access" "interaction"
## [15] "house_rules" "thumbnail_url"
## [17] "medium_url" "picture_url"
## [19] "xl_picture_url" "host_id"
## [21] "host_url" "host_name"
## [23] "host_since" "host_location"
## [25] "host_about" "host_response_time"
## [27] "host_response_rate" "host_acceptance_rate"
## [29] "host_is_superhost" "host_thumbnail_url"
## [31] "host_picture_url" "host_neighbourhood"
## [33] "host_listings_count" "host_total_listings_count"
## [35] "host_verifications" "host_has_profile_pic"
## [37] "host_identity_verified" "street"
## [39] "neighbourhood" "neighbourhood_cleansed"
## [41] "neighbourhood_group_cleansed" "city"
## [43] "state" "zipcode"
## [45] "market" "smart_location"
## [47] "country_code" "country"
## [49] "latitude" "longitude"
## [51] "is_location_exact" "property_type"
## [53] "room_type" "accommodates"
## [55] "bathrooms" "bedrooms"
## [57] "beds" "bed_type"
## [59] "amenities" "square_feet"
## [61] "price" "weekly_price"
## [63] "monthly_price" "security_deposit"
## [65] "cleaning_fee" "guests_included"
## [67] "extra_people" "minimum_nights"
## [69] "maximum_nights" "calendar_updated"
## [71] "has_availability" "availability_30"
## [73] "availability_60" "availability_90"
## [75] "availability_365" "calendar_last_scraped"
## [77] "number_of_reviews" "first_review"
## [79] "last_review" "review_scores_rating"
## [81] "review_scores_accuracy" "review_scores_cleanliness"
## [83] "review_scores_checkin" "review_scores_communication"
## [85] "review_scores_location" "review_scores_value"
## [87] "requires_license" "license"
## [89] "jurisdiction_names" "instant_bookable"
## [91] "cancellation_policy" "require_guest_profile_picture"
## [93] "require_guest_phone_verification" "calculated_host_listings_count"
## [95] "reviews_per_month"
# YOU SHOULD TAKE A LOOK AT THE DATA
# View(Airbnb)

# Remove unwanted columns. `a:b` inside select() drops the whole contiguous
# range of columns from `a` through `b` in the data frame's column order.
Airbnb <- dplyr::select(
  Airbnb,
  -c(
    listing_url, scrape_id, name, experiences_offered,
    thumbnail_url:host_url,
    host_response_time:host_identity_verified,
    neighbourhood
  )
)

# Clean up price: strip "$" and "," so the text can be parsed as a number.
# Values that still fail to parse become NA (as.numeric() warns when so).
Airbnb$price <- as.numeric(gsub("\\$|,", "", Airbnb$price))
# Quick look at the raw distributions of bedrooms, bathrooms and beds.
# qplot() is deprecated (ggplot2 3.4.0), so the histograms are built with
# ggplot() directly. The x-axis label becomes the bare column name instead of
# "Airbnb$<column>".
# NOTE(review): assumes these three columns were read as numeric -- confirm.
ggplot(Airbnb, aes(x = bedrooms)) +
  geom_histogram()
ggplot(Airbnb, aes(x = bathrooms)) +
  geom_histogram()
ggplot(Airbnb, aes(x = beds)) +
  geom_histogram()
There are very few apartments with more than 2 bedrooms or bathrooms, so we can recode these variables to aggregate the sparse categories. TIP for future work (not needed for the assignment): an easy way to recode is to use an RStudio add-in: [see this link](https://cran.r-project.org/web/packages/addinslist/README.html) and follow the instructions to install the add-in “questionR,” which makes variable recoding much easier.
# Recoding Airbnb$bedrooms into Airbnb$bedrooms_recoded
# Every listing with three or more bedrooms is collapsed into a single "3+"
# level; smaller counts keep their own label. A threshold is used instead of
# enumerating each observed value so that counts not anticipated in advance
# (e.g. 11) are collapsed as well.
Airbnb$bedrooms_recoded <- as.character(Airbnb$bedrooms)
# which() drops NA comparisons, so listings with a missing count stay NA.
Airbnb$bedrooms_recoded[which(as.numeric(Airbnb$bedrooms) >= 3)] <- "3+"
Airbnb$bedrooms_recoded <- factor(Airbnb$bedrooms_recoded)
# Recoding Airbnb$bathrooms into Airbnb$bathrooms_recoded
# Every listing with three or more bathrooms (half-baths included, e.g. 3.5)
# is collapsed into a single "3+" level; smaller values keep their own label.
# A threshold is used instead of enumerating each observed value so that
# values not anticipated in advance (e.g. 7 or 7.5) are collapsed as well.
Airbnb$bathrooms_recoded <- as.character(Airbnb$bathrooms)
# which() drops NA comparisons, so listings with a missing count stay NA.
Airbnb$bathrooms_recoded[which(as.numeric(Airbnb$bathrooms) >= 3)] <- "3+"
Airbnb$bathrooms_recoded <- factor(Airbnb$bathrooms_recoded)
# Recoding Airbnb$beds into Airbnb$beds_recoded
# Every listing with four or more beds is collapsed into a single "4+" level,
# and a reported bed count of 0 is treated as missing. A threshold is used
# instead of enumerating each observed value so that counts not anticipated
# in advance (e.g. 17) are collapsed as well.
Airbnb$beds_recoded <- as.character(Airbnb$beds)
# which() drops NA comparisons, so listings with a missing count stay NA.
Airbnb$beds_recoded[which(as.numeric(Airbnb$beds) >= 4)] <- "4+"
Airbnb$beds_recoded[which(as.numeric(Airbnb$beds) == 0)] <- NA
Airbnb$beds_recoded <- factor(Airbnb$beds_recoded)
```
# Quick bar charts of the recoded variables. They are factors, so each level
# gets one bar. qplot() is deprecated (ggplot2 3.4.0), so the charts are built
# with ggplot() directly; the x-axis label becomes the bare column name.
ggplot(Airbnb, aes(x = bedrooms_recoded)) +
  geom_bar()
ggplot(Airbnb, aes(x = bathrooms_recoded)) +
  geom_bar()
ggplot(Airbnb, aes(x = beds_recoded)) +
  geom_bar()

# Histogram of nightly price in bins of width 20, restricted to 0-1000.
# xlim() sets scale limits, so listings priced outside that range are dropped
# from the plot (ggplot2 reports them in a warning).
ggplot(Airbnb, aes(x = price)) +
  geom_histogram(
    binwidth = 20,
    fill = "#228B22",
    colour = "black",
    alpha = 0.8
  ) +
  xlim(0, 1000) +
  labs(title = "Histogram for Price", x = "Price")
# Overlaid density curves of log(price), one translucent fill colour per
# value of neighbourhood_group_cleansed.
ggplot(
  Airbnb,
  aes(x = log(price), fill = neighbourhood_group_cleansed)
) +
  geom_density(alpha = 0.5) +
  labs(title = "Airbnb Log(Price) by Borough")
NOTE: Do this for a text column other than “description”, which is what the original example code analyzes. For example, you can pick the column “house_rules”, “space”, or “summary” (the code below uses “house_rules”).
## Sentiment Analysis
# Load text mining libraries
pacman::p_load(tidytext, syuzhet, wordcloud, DT)

## tidytext
# One row per listing, holding the house-rules text as a character column
# named `text`.
text_df <- data.frame(
  text = as.character(Airbnb$house_rules),
  stringsAsFactors = FALSE
)

# Split the text into one row per token, stored in column `word`.
tidy <- text_df %>%
  unnest_tokens(word, text)

# Keep only tokens present in the "bing" sentiment lexicon and count how often
# each word/sentiment pair occurs, most frequent first. The join key is given
# explicitly so the join does not depend on (or print a message about)
# whichever column names the two tables happen to share.
bing_word_counts <- tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

# Interactive table of the counts.
datatable(bing_word_counts)
# Bar chart of the ten most frequent words for each sentiment, one facet per
# sentiment. Ties at the cut-off are kept, so a facet can show more than ten.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank by the count column explicitly rather than letting top_n() fall back
  # to "the last column", which breaks silently if a column is ever appended.
  top_n(10, n) %>%
  ungroup() %>%
  # Order the word factor by count so bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  # Flip so the words read horizontally along the vertical axis.
  coord_flip()
library(reshape2)

# Comparison word cloud of words in the "bing" lexicon, split by sentiment.
# acast() reshapes the counts into a word x sentiment matrix (0 where a word
# never occurs with that sentiment), which is the input comparison.cloud()
# takes. The join key is stated explicitly instead of being inferred from
# shared column names.
tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 500
  )