Notebook Instructions


Notebook Version

platform       x86_64-apple-darwin15.6.0   
arch           x86_64                      
os             darwin15.6.0                
system         x86_64, darwin15.6.0        
status                                     
major          3                           
minor          5.0                         
year           2018                        
month          04                          
day            23                          
svn rev        74626                       
language       R                           
version.string R version 3.5.0 (2018-04-23)
nickname       Joy in Playing   

About

  • Airbnb claims to be part of the “sharing economy” and disrupting the hotel industry.

  • However, data shows that the majority of Airbnb listings in most cities are entire homes, many of which are rented all year round - disrupting housing and communities.

  • http://insideairbnb.com/index.html

Project Layout

.
└── Airbnb
    ├── airbnb-notebook.Rmd
    │
    ├── data
    │   ├── 2015
    │   │   ├── airbnb.csv
    │   │   ├── airbnb_calendar.csv
    │   │   ├── airbnb_clean.csv
    │   │   ├── airbnb_loc.csv
    │   │   ├── airbnb_nlp.csv
    │   │   ├── airbnb_reviews.csv
    │   │   ├── airbnb_summary.csv
    │   │   └── airbnb_urls.csv
    │   │
    │   ├── 2017
    │   │   ├── airbnb.csv
    │   │   ├── airbnb_calendar.csv
    │   │   ├── airbnb_reviews.csv
    │   │   └── airbnb_summary.csv
    │   │
    │   ├── location
    │   │   ├── 2015-neighbourhoods.csv
    │   │   ├── 2015-neighbourhoods.geojson
    │   │   ├── 2017-neighbourhoods.csv
    │   │   ├── 2017-neighbourhoods.geojson
    │   │   └── names-zipcode.csv
    │   │
    │   └── raw
    │       ├── 2015-airbnb-listings.csv.gz
    │       ├── 2015-calendar.csv.gz
    │       ├── 2015-neighbourhoods.zip
    │       ├── 2015-reviews.csv.gz
    │       ├── 2017-airbnb-listings.csv.gz
    │       ├── 2017-calendar.csv.gz
    │       ├── 2017-neighbourhoods.zip
    │       └── 2017-reviews.csv.gz
    │
    ├── resources
    │   ├── Airbnb-Proposal.docx
    │   └── Airbnb-Proposal.pdf
    │
    └── scripts

Load Packages in R/RStudio

We are going to use the tidyverse, a collection of R packages designed for data science.


Data Import and Inspection



# Load the 2015 listings file into `airbnb`.
airbnb <- read_csv(file = "data/2015/airbnb_raw.csv")

Inspect the head and tail of the dataset, then list its column names

# Preview the first and last rows, then list every column name.
airbnb %>% head()
airbnb %>% tail()
names(airbnb)
 [1] "id"                               "listing_url"                     
 [3] "scrape_id"                        "last_scraped"                    
 [5] "name"                             "summary"                         
 [7] "space"                            "description"                     
 [9] "experiences_offered"              "neighborhood_overview"           
[11] "notes"                            "transit"                         
[13] "thumbnail_url"                    "medium_url"                      
[15] "picture_url"                      "xl_picture_url"                  
[17] "host_id"                          "host_url"                        
[19] "host_name"                        "host_since"                      
[21] "host_location"                    "host_about"                      
[23] "host_response_time"               "host_response_rate"              
[25] "host_acceptance_rate"             "host_is_superhost"               
[27] "host_thumbnail_url"               "host_picture_url"                
[29] "host_neighbourhood"               "host_listings_count"             
[31] "host_total_listings_count"        "host_verifications"              
[33] "host_has_profile_pic"             "host_identity_verified"          
[35] "street"                           "neighbourhood"                   
[37] "neighbourhood_cleansed"           "neighbourhood_group_cleansed"    
[39] "city"                             "state"                           
[41] "zipcode"                          "market"                          
[43] "smart_location"                   "country_code"                    
[45] "country"                          "latitude"                        
[47] "longitude"                        "is_location_exact"               
[49] "property_type"                    "room_type"                       
[51] "accommodates"                     "bathrooms"                       
[53] "bedrooms"                         "beds"                            
[55] "bed_type"                         "amenities"                       
[57] "square_feet"                      "price"                           
[59] "weekly_price"                     "monthly_price"                   
[61] "security_deposit"                 "cleaning_fee"                    
[63] "guests_included"                  "extra_people"                    
[65] "minimum_nights"                   "maximum_nights"                  
[67] "calendar_updated"                 "has_availability"                
[69] "availability_30"                  "availability_60"                 
[71] "availability_90"                  "availability_365"                
[73] "calendar_last_scraped"            "number_of_reviews"               
[75] "first_review"                     "last_review"                     
[77] "review_scores_rating"             "review_scores_accuracy"          
[79] "review_scores_cleanliness"        "review_scores_checkin"           
[81] "review_scores_communication"      "review_scores_location"          
[83] "review_scores_value"              "requires_license"                
[85] "license"                          "jurisdiction_names"              
[87] "instant_bookable"                 "cancellation_policy"             
[89] "require_guest_profile_picture"    "require_guest_phone_verification"
[91] "calculated_host_listings_count"   "reviews_per_month"               

Data Selection


# Rename the `id` column to `airbnb_id`, keep a standalone copy of the
# values, and print the first 30 column names.
airbnb <- airbnb %>% rename(airbnb_id = id)
airbnb_id <- airbnb[["airbnb_id"]]
names(airbnb)[1:30]
 [1] "airbnb_id"             "listing_url"           "scrape_id"            
 [4] "last_scraped"          "name"                  "summary"              
 [7] "space"                 "description"           "experiences_offered"  
[10] "neighborhood_overview" "notes"                 "transit"              
[13] "thumbnail_url"         "medium_url"            "picture_url"          
[16] "xl_picture_url"        "host_id"               "host_url"             
[19] "host_name"             "host_since"            "host_location"        
[22] "host_about"            "host_response_time"    "host_response_rate"   
[25] "host_acceptance_rate"  "host_is_superhost"     "host_thumbnail_url"   
[28] "host_picture_url"      "host_neighbourhood"    "host_listings_count"  
# Column names 31 through 65.
names(airbnb)[31:65]
 [1] "host_total_listings_count"    "host_verifications"           "host_has_profile_pic"        
 [4] "host_identity_verified"       "street"                       "neighbourhood"               
 [7] "neighbourhood_cleansed"       "neighbourhood_group_cleansed" "city"                        
[10] "state"                        "zipcode"                      "market"                      
[13] "smart_location"               "country_code"                 "country"                     
[16] "latitude"                     "longitude"                    "is_location_exact"           
[19] "property_type"                "room_type"                    "accommodates"                
[22] "bathrooms"                    "bedrooms"                     "beds"                        
[25] "bed_type"                     "amenities"                    "square_feet"                 
[28] "price"                        "weekly_price"                 "monthly_price"               
[31] "security_deposit"             "cleaning_fee"                 "guests_included"             
[34] "extra_people"                 "minimum_nights"              
# Column names 66 through 92.
names(airbnb)[66:92]
 [1] "maximum_nights"                   "calendar_updated"                
 [3] "has_availability"                 "availability_30"                 
 [5] "availability_60"                  "availability_90"                 
 [7] "availability_365"                 "calendar_last_scraped"           
 [9] "number_of_reviews"                "first_review"                    
[11] "last_review"                      "review_scores_rating"            
[13] "review_scores_accuracy"           "review_scores_cleanliness"       
[15] "review_scores_checkin"            "review_scores_communication"     
[17] "review_scores_location"           "review_scores_value"             
[19] "requires_license"                 "license"                         
[21] "jurisdiction_names"               "instant_bookable"                
[23] "cancellation_policy"              "require_guest_profile_picture"   
[25] "require_guest_phone_verification" "calculated_host_listings_count"  
[27] "reviews_per_month"               

Select only the columns containing text

# Names of the free-text columns that are split out into their own table.
text_columns <- c(
  "host_name", "name", "amenities", "experiences_offered",
  "host_verifications", "transit", "notes",
  "neighborhood_overview", "host_about",
  "description", "space", "summary"
)

# Build the text table with the listing key first, followed by the text
# columns in the order given above.
# - one_of() marks `text_columns` as a character vector of column names, so
#   it can never be confused with a column that happens to share that name
#   (the same helper is used when these columns are dropped further down).
# - `airbnb_id` is selected straight from `airbnb` rather than re-attached
#   from a separate vector, so key and text always come from the same rows.
airbnb_nlp <- select(airbnb, airbnb_id, one_of(text_columns))
head(airbnb_nlp)

Use the select and one_of functions to remove the columns that were already split out into the previous datasets (ulr_columns, text_columns, location_columns)

# unique(as.character(airbnb$scrape_id))
# Everything to drop: the three column groups named in the text above plus
# the two scrape bookkeeping columns.
remove_columns <- c(
  ulr_columns, text_columns, location_columns,
  "scrape_id", "last_scraped"
)
airbnb_clean <- airbnb %>%
  select(-one_of(remove_columns))

Inspect the new, more compact dataset. By doing this we reduced its size by almost half.

# Preview the first rows of the reduced table.
airbnb_clean %>% head()

Save the new datasets for reference and later use

# Write the URL, text and location tables to CSV files under data/2015.
readr::write_csv(airbnb_urls, "data/2015/airbnb_urls.csv")
readr::write_csv(airbnb_nlp,  "data/2015/airbnb_nlp.csv")
readr::write_csv(airbnb_loc,  "data/2015/airbnb_loc.csv")

Data Types and Cleaning


airbnb_clean %>% head()

# Lower-case a vector of labels and swap every space for an underscore.
to_snake_label <- function(labels) {
  str_replace_all(tolower(labels), " ", "_")
}

# room_type: normalise the labels, rename "home/apt" to "place", then
# convert to a factor.
airbnb_clean$room_type <- as_factor(
  str_replace(to_snake_label(airbnb_clean$room_type), "home/apt", "place")
)

# bed_type: normalise the labels, collapse anything starting at "pull" to
# "sofa_bed" and anything starting at "real_" to "bed", then convert to a
# factor.
airbnb_clean$bed_type <- as_factor(
  str_replace(
    str_replace(to_snake_label(airbnb_clean$bed_type), "pull.*", "sofa_bed"),
    "real_.*", "bed"
  )
)

# host_response_rate: drop the percent sign and rescale to a 0-1 proportion.
# Values that are not numeric after stripping "%" are coerced to NA.
airbnb_clean$host_response_rate <-
  as.numeric(str_remove(airbnb_clean$host_response_rate, "%")) / 100
NAs introduced by coercion
# host_acceptance_rate: drop the percent sign and rescale to a 0-1
# proportion. Values that are not numeric after stripping "%" are coerced
# to NA.
airbnb_clean$host_acceptance_rate <-
  as.numeric(str_remove(airbnb_clean$host_acceptance_rate, "%")) / 100
NAs introduced by coercion
# property_type: replace every " & " and every "/" with an underscore (not
# just the first occurrence of each), then convert to a factor.
airbnb_clean$property_type <- airbnb_clean$property_type %>%
  str_replace_all(" \\& ", "_") %>%
  str_replace_all("\\/", "_") %>%
  as_factor()

# Convert currency strings such as "$1,250.00" to numbers.
#
# Every "$" and "," is stripped before conversion, so amounts with more than
# one thousands separator (e.g. "$1,234,567.00") parse correctly instead of
# being coerced to NA. Missing values stay NA.
#
# amount: character vector of currency strings.
# Returns a numeric vector of the same length.
parse_currency <- function(amount) {
  amount %>%
    str_remove_all("[$,]") %>%
    as.numeric()
}

# All money columns share the same format, so they are converted in one pass.
currency_columns <- c(
  "price", "weekly_price", "monthly_price",
  "security_deposit", "extra_people", "cleaning_fee"
)
airbnb_clean[currency_columns] <- lapply(
  airbnb_clean[currency_columns],
  parse_currency
)

# Rename after conversion so the column name states that it is a fee.
airbnb_clean <- rename(airbnb_clean, extra_people_fee = extra_people)
# Convert one "calendar updated" description into a number of days.
#
# Recognised inputs:
#   "today"                       -> 0
#   "yesterday"                   -> 1
#   "never"                       -> 9999 (sentinel handled by the caller)
#   "<n> day(s)/week(s)/month(s)" -> n * 1 / 7 / 30
#   "a week ago", "a month ago"   -> "a"/"an" counts as 1
#
# item: a single string (one element of the calendar_updated column).
# Returns a length-one double. NA_real_ is returned for a missing value or
# for text that matches none of the forms above, so a typed map such as
# map_dbl() gets a number for every element instead of failing on
# `if (NA)` or on a leftover character value.
updated_days <- function(item) {
  if (length(item) != 1L || is.na(item)) {
    return(NA_real_)
  }
  item <- as.character(item)

  if (grepl("today", item, fixed = TRUE)) {
    return(0)
  }
  if (grepl("yesterday", item, fixed = TRUE)) {
    return(1)
  }
  if (grepl("never", item, fixed = TRUE)) {
    return(9999)
  }

  # Capture the count ("3", "a", "an") and the unit, ignoring a plural "s".
  days_per_unit <- c(day = 1, week = 7, month = 30)
  pattern <- "([0-9]+|an?) (day|week|month)s?"
  parts <- regmatches(item, regexec(pattern, item))[[1]]
  if (length(parts) == 0L) {
    return(NA_real_)
  }

  count <- if (parts[2] %in% c("a", "an")) 1 else as.numeric(parts[2])
  count * days_per_unit[[parts[3]]]
}
# Turn every calendar_updated description into a day count.
calendar_updated_days <- airbnb_clean$calendar_updated %>%
  map_dbl(updated_days)

# Where the count equals 9999, use calendar_last_scraped minus host_since.
# Otherwise subtract (calendar_last_scraped - day count) from
# calendar_last_scraped, i.e. the day count expressed as a date difference.
airbnb_clean[["calendar_updated_days"]] <- if_else(
  calendar_updated_days %in% 9999,
  airbnb_clean[["calendar_last_scraped"]] - airbnb_clean[["host_since"]],
  airbnb_clean[["calendar_last_scraped"]] -
    (airbnb_clean[["calendar_last_scraped"]] - calendar_updated_days)
)

# Drop the original text column.
airbnb_clean[["calendar_updated"]] <- NULL
# host_response_time: rewrite each description to a string of digits
# ("60", "720", "1440", "7200") and store the result as a factor with
# exactly those four levels. "N/A" is rewritten to "NA", which is not one of
# the levels, so it ends up as a missing value, as does anything else left
# unmatched.
airbnb_clean[["host_response_time"]] <- factor(
  airbnb_clean[["host_response_time"]] %>%
    str_replace("N/A", "NA") %>%
    str_replace(".*(an hour)", "60") %>%
    str_replace(".*(few hours)", "720") %>%
    str_replace(".*(a day)", "1440") %>%
    str_replace(".*(few days).*", "7200"),
  levels = c("60", "720", "1440", "7200")
)

Change variables to the correct data types

# Store both id columns as character, lower-case property_type before
# turning it into a factor, and make cancellation_policy a factor with a
# fixed level order.
airbnb_clean <- mutate(
  airbnb_clean,
  airbnb_id = as.character(airbnb_id),
  host_id = as.character(host_id),
  property_type = as_factor(tolower(property_type)),
  cancellation_policy = factor(
    tolower(cancellation_policy),
    levels = c("flexible", "moderate", "strict", "super_strict_30")
  )
)
  

Change variables to logical (TRUE/FALSE)

# Parse the flag columns into logical vectors in a single mutate() call,
# then preview the result.
airbnb_clean <- mutate(
  airbnb_clean,
  host_is_superhost = parse_logical(host_is_superhost),
  host_has_profile_pic = parse_logical(host_has_profile_pic),
  host_identity_verified = parse_logical(host_identity_verified),
  is_location_exact = parse_logical(is_location_exact),
  requires_license = parse_logical(requires_license),
  instant_bookable = parse_logical(instant_bookable),
  has_availability = parse_logical(has_availability),
  require_guest_profile_picture =
    parse_logical(require_guest_profile_picture),
  require_guest_phone_verification =
    parse_logical(require_guest_phone_verification)
)
airbnb_clean %>% head()

Save the clean dataset under a different name

# Save the cleaned table as an RDS file. The path matches the one that the
# Descriptive Statistics section passes to readRDS(), so the file written
# here is the file that gets loaded there.
saveRDS(airbnb_clean, file = "data/2015/airbnb_clean.rds")

Descriptive Statistics



# Reload the saved table and summarise columns 1 to 18.
airbnb_clean <- readRDS(file = "data/2015/airbnb_clean.rds")
summary(airbnb_clean[, 1:18])
  airbnb_id           host_id            host_since         host_response_time host_response_rate
 Length:5147        Length:5147        Min.   :2008-05-06   60  :2175          Min.   :0.100     
 Class :character   Class :character   1st Qu.:2012-12-27   720 :1493          1st Qu.:0.900     
 Mode  :character   Mode  :character   Median :2014-05-01   1440: 934          Median :1.000     
                                       Mean   :2013-12-14   7200:  98          Mean   :0.925     
                                       3rd Qu.:2015-04-07   NA's: 447          3rd Qu.:1.000     
                                       Max.   :2015-10-02                      Max.   :1.000     
                                                                               NA's   :447       
 host_acceptance_rate host_is_superhost host_listings_count host_total_listings_count
 Min.   :0.0000       Mode :logical     Min.   :  1.000     Min.   :  1.000          
 1st Qu.:0.8000       FALSE:4660        1st Qu.:  1.000     1st Qu.:  1.000          
 Median :0.9600       TRUE :487         Median :  1.000     Median :  1.000          
 Mean   :0.8682                         Mean   :  4.791     Mean   :  4.791          
 3rd Qu.:1.0000                         3rd Qu.:  2.000     3rd Qu.:  2.000          
 Max.   :1.0000                         Max.   :480.000     Max.   :480.000          
 NA's   :565                                                                         
 host_has_profile_pic host_identity_verified is_location_exact     property_type 
 Mode :logical        Mode :logical          Mode :logical     apartment  :4000  
 FALSE:22             FALSE:1201             FALSE:653         house      : 593  
 TRUE :5125           TRUE :3946             TRUE :4494        condominium: 285  
                                                               loft       : 154  
                                                               townhouse  :  42  
                                                               (Other)    :  72  
                                                               NA's       :   1  
        room_type     accommodates      bathrooms        bedrooms           beds       
 entire_place:2928   Min.   : 1.000   Min.   :0.000   Min.   : 0.000   Min.   : 1.000  
 private_room:1972   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.: 1.000   1st Qu.: 1.000  
 shared_room : 247   Median : 2.000   Median :1.000   Median : 1.000   Median : 1.000  
                     Mean   : 3.275   Mean   :1.223   Mean   : 1.279   Mean   : 1.642  
                     3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 2.000   3rd Qu.: 2.000  
                     Max.   :16.000   Max.   :6.000   Max.   :10.000   Max.   :16.000  
                                      NA's   :18      NA's   :15       NA's   :13      
# Summarise columns 19 to 37.
summary(airbnb_clean[, 19:37])
     bed_type     square_feet        price         weekly_price     monthly_price  
 bed     :4845   Min.   :    0   Min.   :  10.0   Min.   :   70.0   Min.   :  310  
 airbed  : 113   1st Qu.:  575   1st Qu.:  75.0   1st Qu.:  425.0   1st Qu.: 1358  
 futon   :  96   Median : 1000   Median : 110.0   Median :  630.5   Median : 2000  
 sofa_bed:  51   Mean   : 1236   Mean   : 149.5   Mean   :  814.6   Mean   : 2652  
 couch   :  42   3rd Qu.: 1350   3rd Qu.: 175.0   3rd Qu.:  950.0   3rd Qu.: 3150  
                 Max.   :22000   Max.   :4900.0   Max.   :10000.0   Max.   :30000  
                 NA's   :5060                     NA's   :2525      NA's   :3035   
 security_deposit  cleaning_fee    guests_included extra_people_fee minimum_nights   
 Min.   :  95.0   Min.   :  5.00   Min.   : 0.00   Min.   :  0.00   Min.   :  1.000  
 1st Qu.: 100.0   1st Qu.: 20.00   1st Qu.: 1.00   1st Qu.:  0.00   1st Qu.:  1.000  
 Median : 200.0   Median : 40.00   Median : 1.00   Median :  0.00   Median :  1.000  
 Mean   : 307.2   Mean   : 47.87   Mean   : 1.59   Mean   : 12.61   Mean   :  2.118  
 3rd Qu.: 350.0   3rd Qu.: 65.00   3rd Qu.: 2.00   3rd Qu.: 20.00   3rd Qu.:  2.000  
 Max.   :4000.0   Max.   :400.00   Max.   :16.00   Max.   :300.00   Max.   :180.000  
 NA's   :3387     NA's   :2100                                                       
 maximum_nights       has_availability availability_30 availability_60 availability_90
 Min.   :         1   Mode:logical     Min.   : 0.00   Min.   : 0.00   Min.   : 0.00  
 1st Qu.:        93   TRUE:5147        1st Qu.: 0.00   1st Qu.: 5.00   1st Qu.:14.00  
 Median :      1125                    Median : 9.00   Median :31.00   Median :58.00  
 Mean   :    418271                    Mean   :10.91   Mean   :28.57   Mean   :48.29  
 3rd Qu.:      1125                    3rd Qu.:19.00   3rd Qu.:47.00   3rd Qu.:77.00  
 Max.   :2147483647                    Max.   :30.00   Max.   :60.00   Max.   :90.00  
                                                                                      
 availability_365 calendar_last_scraped number_of_reviews  first_review       
 Min.   :  0.0    Min.   :2015-10-02    Min.   :  0.0     Min.   :2009-03-06  
 1st Qu.:123.0    1st Qu.:2015-10-02    1st Qu.:  1.0     1st Qu.:2014-08-04  
 Median :311.0    Median :2015-10-03    Median :  5.0     Median :2015-05-17  
 Mean   :244.9    Mean   :2015-10-02    Mean   : 14.6     Mean   :2014-12-06  
 3rd Qu.:349.0    3rd Qu.:2015-10-03    3rd Qu.: 16.0     3rd Qu.:2015-08-01  
 Max.   :365.0    Max.   :2015-10-03    Max.   :298.0     Max.   :2015-10-03  
                                                          NA's   :1005        
# Summarise columns 38 to 54.
summary(airbnb_clean[, 38:54])
  last_review         review_scores_rating review_scores_accuracy review_scores_cleanliness
 Min.   :2010-08-09   Min.   : 20.00       Min.   : 2.000         Min.   : 2.000           
 1st Qu.:2015-08-23   1st Qu.: 91.00       1st Qu.: 9.000         1st Qu.: 9.000           
 Median :2015-09-18   Median : 96.00       Median :10.000         Median :10.000           
 Mean   :2015-08-19   Mean   : 93.99       Mean   : 9.555         Mean   : 9.315           
 3rd Qu.:2015-09-25   3rd Qu.:100.00       3rd Qu.:10.000         3rd Qu.:10.000           
 Max.   :2015-10-03   Max.   :100.00       Max.   :10.000         Max.   :10.000           
 NA's   :1005         NA's   :1056         NA's   :1073           NA's   :1075             
 review_scores_checkin review_scores_communication review_scores_location review_scores_value
 Min.   : 2.000        Min.   : 2.00               Min.   : 4.000         Min.   : 2.000     
 1st Qu.:10.000        1st Qu.:10.00               1st Qu.: 9.000         1st Qu.: 9.000     
 Median :10.000        Median :10.00               Median :10.000         Median :10.000     
 Mean   : 9.743        Mean   : 9.79               Mean   : 9.466         Mean   : 9.376     
 3rd Qu.:10.000        3rd Qu.:10.00               3rd Qu.:10.000         3rd Qu.:10.000     
 Max.   :10.000        Max.   :10.00               Max.   :10.000         Max.   :10.000     
 NA's   :1073          NA's   :1069                NA's   :1068           NA's   :1069       
 requires_license    license          instant_bookable      cancellation_policy
 Mode :logical    Min.   :      102   Mode :logical    flexible       :2021    
 FALSE:7          1st Qu.:  2093472   FALSE:4584       moderate       :1487    
 TRUE :5140       Median :  2233303   TRUE :563        strict         :1623    
                  Mean   : 17724533                    super_strict_30:  16    
                  3rd Qu.:  2314739                                            
                  Max.   :352167776                                            
                  NA's   :5111                                                 
 require_guest_profile_picture require_guest_phone_verification calculated_host_listings_count
 Mode :logical                 Mode :logical                    Min.   : 1.000                
 FALSE:4890                    FALSE:4832                       1st Qu.: 1.000                
 TRUE :257                     TRUE :315                        Median : 1.000                
                                                                Mean   : 2.805                
                                                                3rd Qu.: 2.000                
                                                                Max.   :42.000                
                                                                                              
 reviews_per_month calendar_updated_days
 Min.   : 0.020    Min.   :   0.00      
 1st Qu.: 0.900    1st Qu.:   2.00      
 Median : 1.710    Median :   7.00      
 Mean   : 2.173    Mean   :  28.87      
 3rd Qu.: 3.000    3rd Qu.:  28.00      
 Max.   :14.000    Max.   :1687.00      
 NA's   :1005                           
