Part 1

Load Packages + Data

library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)

Load data

# Read the listings export. Text columns are kept as character vectors
# (stringsAsFactors = FALSE) rather than being converted to factors.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
Airbnb <- read.csv("listings.csv", header = TRUE, stringsAsFactors = FALSE)
# List the column names of the loaded data frame.
names(Airbnb)
##  [1] "id"                               "listing_url"                     
##  [3] "scrape_id"                        "last_scraped"                    
##  [5] "name"                             "summary"                         
##  [7] "space"                            "description"                     
##  [9] "experiences_offered"              "neighborhood_overview"           
## [11] "notes"                            "transit"                         
## [13] "access"                           "interaction"                     
## [15] "house_rules"                      "thumbnail_url"                   
## [17] "medium_url"                       "picture_url"                     
## [19] "xl_picture_url"                   "host_id"                         
## [21] "host_url"                         "host_name"                       
## [23] "host_since"                       "host_location"                   
## [25] "host_about"                       "host_response_time"              
## [27] "host_response_rate"               "host_acceptance_rate"            
## [29] "host_is_superhost"                "host_thumbnail_url"              
## [31] "host_picture_url"                 "host_neighbourhood"              
## [33] "host_listings_count"              "host_total_listings_count"       
## [35] "host_verifications"               "host_has_profile_pic"            
## [37] "host_identity_verified"           "street"                          
## [39] "neighbourhood"                    "neighbourhood_cleansed"          
## [41] "neighbourhood_group_cleansed"     "city"                            
## [43] "state"                            "zipcode"                         
## [45] "market"                           "smart_location"                  
## [47] "country_code"                     "country"                         
## [49] "latitude"                         "longitude"                       
## [51] "is_location_exact"                "property_type"                   
## [53] "room_type"                        "accommodates"                    
## [55] "bathrooms"                        "bedrooms"                        
## [57] "beds"                             "bed_type"                        
## [59] "amenities"                        "square_feet"                     
## [61] "price"                            "weekly_price"                    
## [63] "monthly_price"                    "security_deposit"                
## [65] "cleaning_fee"                     "guests_included"                 
## [67] "extra_people"                     "minimum_nights"                  
## [69] "maximum_nights"                   "calendar_updated"                
## [71] "has_availability"                 "availability_30"                 
## [73] "availability_60"                  "availability_90"                 
## [75] "availability_365"                 "calendar_last_scraped"           
## [77] "number_of_reviews"                "first_review"                    
## [79] "last_review"                      "review_scores_rating"            
## [81] "review_scores_accuracy"           "review_scores_cleanliness"       
## [83] "review_scores_checkin"            "review_scores_communication"     
## [85] "review_scores_location"           "review_scores_value"             
## [87] "requires_license"                 "license"                         
## [89] "jurisdiction_names"               "instant_bookable"                
## [91] "cancellation_policy"              "require_guest_profile_picture"   
## [93] "require_guest_phone_verification" "calculated_host_listings_count"  
## [95] "reviews_per_month"
# YOU SHOULD TAKE A LOOK AT THE DATA
#View(Airbnb)

Clean Up

# Drop the columns that are not needed further on: single columns are
# listed by name, and `a:b` removes every column from `a` through `b`.
Airbnb <- Airbnb %>%
  dplyr::select(
    -c(
      listing_url,
      scrape_id,
      name,
      experiences_offered,
      thumbnail_url:host_url,
      host_response_time:host_identity_verified,
      neighbourhood
    )
  )

# Clean up price: strip every "$" and "," from the text and convert what is
# left to numeric. Values that still are not valid numbers become NA.
Airbnb$price <- as.numeric(gsub("\\$|,", "", Airbnb$price))

Variable Recoding and NA removal

# Histograms of the raw bedroom, bathroom and bed counts.
# qplot() is deprecated as of ggplot2 3.4.0, so each plot is built with
# ggplot() + geom_histogram(). bins = 30 is the default qplot() used; stating
# it explicitly also silences the "pick better value with binwidth" message.
ggplot(Airbnb, aes(x = bedrooms)) +
  geom_histogram(bins = 30)

ggplot(Airbnb, aes(x = bathrooms)) +
  geom_histogram(bins = 30)

ggplot(Airbnb, aes(x = beds)) +
  geom_histogram(bins = 30)

There are very few apartments with more than 2 bedrooms/bathrooms, so we can recode these variables to aggregate the sparse categories. TIP for future work (not needed for this assignment): an easy way to recode is to use an RStudio add-in. See [this link](https://cran.r-project.org/web/packages/addinslist/README.html) and follow the instructions to install the “questionr” add-in, which makes variable recoding much easier.

# Recoding Airbnb$bedrooms into Airbnb$bedrooms_recoded
# Counts of 3 or more are collapsed into one "3+" level; smaller counts keep
# their own level and missing values stay NA.
# A threshold is used instead of listing each value separately, so a count
# that was never enumerated (e.g. 11) still lands in "3+".
# NOTE(review): assumes bedrooms holds numeric counts -- confirm.
Airbnb$bedrooms_recoded <- as.character(Airbnb$bedrooms)
# which() drops NA comparisons, so missing counts are left untouched.
Airbnb$bedrooms_recoded[which(as.numeric(Airbnb$bedrooms) >= 3)] <- "3+"
Airbnb$bedrooms_recoded <- factor(Airbnb$bedrooms_recoded)

# Recoding Airbnb$bathrooms into Airbnb$bathrooms_recoded
# Counts of 3 or more (including half baths such as 3.5) are collapsed into
# one "3+" level; smaller counts keep their own level and missing values
# stay NA.
# A threshold is used instead of listing each value separately, so a count
# that was never enumerated (e.g. 7 or 7.5) still lands in "3+".
# NOTE(review): assumes bathrooms holds numeric counts -- confirm.
Airbnb$bathrooms_recoded <- as.character(Airbnb$bathrooms)
# which() drops NA comparisons, so missing counts are left untouched.
Airbnb$bathrooms_recoded[which(as.numeric(Airbnb$bathrooms) >= 3)] <- "3+"
Airbnb$bathrooms_recoded <- factor(Airbnb$bathrooms_recoded)

# Recoding Airbnb$beds into Airbnb$beds_recoded
# Counts of 4 or more are collapsed into one "4+" level, a count of 0 is
# treated as missing, and the remaining counts keep their own level.
# A threshold is used instead of listing each value separately, so a count
# that was never enumerated (e.g. 17) still lands in "4+".
# NOTE(review): assumes beds holds numeric counts -- confirm.
Airbnb$beds_recoded <- as.character(Airbnb$beds)
# which() drops NA comparisons, so missing counts are left untouched.
Airbnb$beds_recoded[which(as.numeric(Airbnb$beds) >= 4)] <- "4+"
Airbnb$beds_recoded[which(as.numeric(Airbnb$beds) == 0)] <- NA
Airbnb$beds_recoded <- factor(Airbnb$beds_recoded)

#Quick charts of the recoded bedrooms, bathrooms and beds

# Bar charts counting the listings in each recoded level.
# qplot() is deprecated as of ggplot2 3.4.0; for a factor it drew a bar
# chart, which is spelled out here with ggplot() + geom_bar().
ggplot(Airbnb, aes(x = bedrooms_recoded)) +
  geom_bar()

ggplot(Airbnb, aes(x = bathrooms_recoded)) +
  geom_bar()

ggplot(Airbnb, aes(x = beds_recoded)) +
  geom_bar()

Histogram of Price

# Histogram of listing prices in bins 20 wide.
# qplot() is deprecated as of ggplot2 3.4.0, so the plot is built with
# ggplot(). The fixed fill / outline / transparency values that were wrapped
# in I() are passed as plain geom parameters instead.
ggplot(Airbnb, aes(x = price)) +
  geom_histogram(
    binwidth = 20,
    fill = "#228B22",
    colour = "black",
    alpha = 0.8
  ) +
  # Restrict the x axis to 0-1000; prices outside that range are not drawn.
  xlim(0, 1000) +
  labs(title = "Histogram for Price", x = "Price")

Price distribution by NY Boroughs

# Overlaid, semi-transparent density curves of log(price), one per value of
# neighbourhood_group_cleansed.
Airbnb %>%
  ggplot(aes(x = log(price), fill = neighbourhood_group_cleansed)) +
  geom_density(alpha = 0.5) +
  labs(title = "Airbnb Log(Price) by Borough")

Working with Text Columns

NOTE: Do this for a text column other than “description”, which is what this code is doing. For example, you can pick the column “house_rules”, “space”, or “summary”.

## Sentiment Analysis

#Load text mining libraries
pacman::p_load(tidytext, syuzhet, wordcloud, DT)

##tidytext

# Put the house rules into a one-column data frame named "text", stored as
# character (never factor), ready for tokenising.
text_df <- data.frame(
  text = as.character(Airbnb$house_rules),
  stringsAsFactors = FALSE
)

# Split the text into one token per row, stored in a column called "word".
tidy <- unnest_tokens(text_df, word, text)

# Keep only the tokens that appear in the Bing sentiment lexicon and count
# how often each (word, sentiment) pair occurs, most frequent first.
# The join key is named explicitly so the join does not depend on whichever
# columns the two tables happen to share (and the "Joining with ..." message
# is no longer printed).
bing_word_counts <- tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Display the counts as a DT table.
datatable(bing_word_counts)

Plot high frequency words

# Bar chart of the most frequent words within each sentiment, one facet per
# sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank by the count column explicitly. Without `wt`, top_n() silently uses
  # the last column of the data, which breaks if a column is ever added
  # after `n`. Ties at the cut-off are kept, so a group may exceed 10 rows.
  top_n(10, n) %>%
  ungroup() %>%
  # Order the words by frequency so the bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  # Flip the axes so the words are listed vertically and stay readable.
  coord_flip()

library(reshape2)

# Comparison word cloud of the words found in the Bing sentiment lexicon.
# The join key is named explicitly so the join does not depend on whichever
# columns the two tables happen to share.
tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word-by-sentiment matrix of counts; a word that never
  # occurs with a given sentiment gets 0.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  # One colour per sentiment column; at most 500 words are drawn.
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 500)