# Import the data
airbnb_data <- read_csv(file = 'up_to_date_clean_data_version_one_bedroom.csv')
## Rows: 3789 Columns: 76
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (32): host_name, listing_url, name, description, neighborhood_overview, ...
## dbl (43): id, host_id, scrape_id, host_is_superhost_bin, host_listings_count...
## lgl (1): has_availability
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sort the dataset by expected revenue in descending order
sortedairbnb_data <- airbnb_data %>% arrange(desc(full_year_expected_revenue))
# add row id
sortedairbnb_data <- sortedairbnb_data %>%
mutate(unique_id = row_number())
write_xlsx(sortedairbnb_data, "airbnb_cleaned_data.xlsx")
# get top 200 listings based on revenue
top_200_revenue <- sortedairbnb_data %>%
top_n(200, full_year_expected_revenue)
# Extract the ammenity column with arrays of strings
amenitiesArrayColumn <- top_200_revenue$amenities
#for full count switch to sortedairbnb_data
# Split the amenities into separate strings
splitAmmentityStrings <- strsplit(amenitiesArrayColumn, ",\\s*")
# Create new data frame with split ammentity strings
df <- data.frame(strings = unlist(splitAmmentityStrings))
# Count the frequencies of amenities
ammentiyCounts <- df %>%
group_by(strings) %>%
summarize(count = n()) %>%
arrange(desc(count))
# Print the ordered data frame
print.data.frame(ammentiyCounts)
## strings count
## 1 "Smoke alarm" 192
## 2 "Wifi" 185
## 3 "Carbon monoxide alarm" 175
## 4 "Hair dryer" 175
## 5 "Iron" 169
## 6 "Shampoo" 161
## 7 "Hangers" 156
## 8 "Heating" 155
## 9 "Essentials" 144
## 10 "Hot water" 139
## 11 "Refrigerator" 134
## 12 "Bed linens" 131
## 13 "Kitchen" 115
## 14 "Microwave" 115
## 15 "First aid kit" 113
## 16 "Coffee maker" 110
## 17 "Fire extinguisher" 108
## 18 "TV" 103
## 19 "Dishwasher" 101
## 20 "Dishes and silverware" 100
## 21 "Air conditioning" 99
## 22 "Washer" 96
## 23 "Elevator" 95
## 24 "Dedicated workspace" 91
## 25 "Private entrance" 79
## 26 "Cooking basics" 72
## 27 "Self check-in" 72
## 28 "Extra pillows and blankets" 70
## 29 "Luggage dropoff allowed" 66
## 30 ["Oven" 60
## 31 "Building staff" 57
## 32 "Pets allowed" 57
## 33 "Bathtub" 55
## 34 "Dryer" 55
## 35 "Long term stays allowed" 52
## 36 "Paid parking on premises" 49
## 37 "TV with standard cable" 48
## 38 "Gym" 46
## 39 ["Self check-in" 44
## 40 "Paid parking off premises" 43
## 41 "Hot water kettle" 41
## 42 ["Essentials" 41
## 43 "Lockbox"] 40
## 44 "Stove" 39
## 45 "Body soap" 37
## 46 "Dining table" 37
## 47 "Coffee" 36
## 48 "Lock on bedroom door" 36
## 49 "Gym"] 35
## 50 "Private living room" 35
## 51 "Shower gel" 35
## 52 "Dryer"] 33
## 53 "Ethernet connection" 33
## 54 "Freezer" 32
## 55 "Backyard" 30
## 56 "City skyline view" 30
## 57 "Free street parking" 30
## 58 "Exercise equipment" 28
## 59 "Wine glasses" 28
## 60 "Cleaning products" 27
## 61 "Conditioner" 27
## 62 "Free parking on premises" 27
## 63 "Kitchen"] 27
## 64 "Toaster" 26
## 65 "Dryer \\u2013\\u00a0In unit" 25
## 66 "Room-darkening shades" 25
## 67 "Washer \\u2013\\u00a0In unit" 25
## 68 "Patio or balcony" 24
## 69 "Crib" 23
## 70 "Heating"] 22
## 71 "Indoor fireplace" 22
## 72 "Outdoor furniture" 20
## 73 "Safe" 20
## 74 "Central air conditioning" 19
## 75 "Host greets you" 19
## 76 "Pack \\u2019n play/Travel crib" 19
## 77 "Single level home"] 17
## 78 "Baking sheet" 16
## 79 "Breakfast" 16
## 80 "Fire extinguisher"] 16
## 81 "Paid valet parking on premises" 16
## 82 "Security cameras on property" 16
## 83 DVD player" 16
## 84 "Babysitter recommendations" 15
## 85 "Books and reading material" 15
## 86 "Free dryer \\u2013 In unit" 15
## 87 "Resort access" 15
## 88 "Blender" 14
## 89 "Sauna" 14
## 90 "BBQ grill" 13
## 91 "HDTV with standard cable 13
## 92 "Private patio or balcony" 13
## 93 ["Security cameras on property" 13
## 94 "Central heating" 12
## 95 "Outdoor dining area" 12
## 96 "Shared gym in building" 12
## 97 ["Shampoo" 12
## 98 "Bay view" 11
## 99 "Free washer \\u2013 In unit" 11
## 100 "Pool" 11
## 101 "Portable fans" 10
## 102 "Clothing storage" 9
## 103 "Laundromat nearby" 9
## 104 "Smart lock" 9
## 105 Netflix 9
## 106 "Board games" 8
## 107 "Ceiling fan" 8
## 108 "Cleaning available during stay" 8
## 109 "Fire pit" 8
## 110 "Keypad" 8
## 111 ["Carbon monoxide alarm" 8
## 112 "Coffee maker: drip coffee maker" 7
## 113 "Free dryer \\u2013 In building" 7
## 114 "Free washer \\u2013 In building" 7
## 115 "Private backyard \\u2013 Fully fenced" 7
## 116 "Sound system" 7
## 117 "Stainless steel oven" 7
## 118 Hulu 7
## 119 "Clothing storage: dresser and closet" 6
## 120 "Mini fridge" 6
## 121 "Pool table" 6
## 122 "Shared patio or balcony" 6
## 123 "Coffee maker: Keurig coffee machine" 5
## 124 "Shared hot tub" 5
## 125 ["Luggage dropoff allowed" 5
## 126 Disney+ 5
## 127 HBO Max 5
## 128 Netflix" 5
## 129 "Bidet" 4
## 130 "Bose sound system" 4
## 131 "Drying rack for clothing" 4
## 132 "Game console" 4
## 133 "Gas stove" 4
## 134 "Hot tub" 4
## 135 "Ocean view" 4
## 136 "Oven" 4
## 137 Amazon Prime Video 4
## 138 premium cable" 4
## 139 "Clothing storage: closet and dresser" 3
## 140 "Coffee maker: Nespresso" 3
## 141 "EV charger" 3
## 142 "Fireplace guards" 3
## 143 "Garden view" 3
## 144 "HDTV with Chromecast 3
## 145 "Pocket wifi" 3
## 146 "Rice maker" 3
## 147 "Shared backyard \\u2013 Fully fenced" 3
## 148 "Stainless steel gas stove" 3
## 149 "Waterfront" 3
## 150 ["55\\" HDTV with standard cable 3
## 151 ["Cooking basics" 3
## 152 ["Paid parking garage off premises" 3
## 153 premium cable 3
## 154 Roku" 3
## 155 "Bertazzoni stainless steel gas stove" 2
## 156 "Bikes" 2
## 157 "Clothing storage: closet" 2
## 158 "Clothing storage: walk-in closet and closet"] 2
## 159 "Clothing storage: wardrobe" 2
## 160 "Coffee maker: drip coffee maker 2
## 161 "Coffee maker: pour-over coffee" 2
## 162 "Courtyard view" 2
## 163 "Electric stove" 2
## 164 "Exercise equipment: elliptical 2
## 165 "Free resort access" 2
## 166 "HDTV with Amazon Prime Video 2
## 167 "Indoor fireplace: gas" 2
## 168 "Mountain view" 2
## 169 "Paid dryer \\u2013 In building" 2
## 170 "Paid parking lot off premises" 2
## 171 "Paid street parking off premises" 2
## 172 "Paid washer \\u2013 In building" 2
## 173 "Portable heater" 2
## 174 "Radiant heating" 2
## 175 "Record player" 2
## 176 "Smoking allowed" 2
## 177 "Sun loungers" 2
## 178 "TV with standard cable 2
## 179 "Wifi"] 2
## 180 Apple TV 2
## 181 Apple TV" 2
## 182 free weights 2
## 183 pour-over coffee" 2
## 184 Roku 2
## 185 stationary bike 2
## 186 yoga mat" 2
## 187 "100\\" TV with Apple TV" 1
## 188 "43\\" HDTV with Amazon Prime Video 1
## 189 "50\\" HDTV with Chromecast 1
## 190 "50\\" HDTV with Disney+ 1
## 191 "50\\" HDTV with Netflix" 1
## 192 "50\\" HDTV with standard cable" 1
## 193 "55\\" HDTV with HBO Max 1
## 194 "60\\" HDTV with Amazon Prime Video 1
## 195 "65\\" HDTV with Amazon Prime Video 1
## 196 "75\\" HDTV with Apple TV 1
## 197 "Backyard - Fully fenced" 1
## 198 "Barbecue utensils" 1
## 199 "Beach access \\u2013 Beachfront" 1
## 200 "Beach access" 1
## 201 "Beach essentials" 1
## 202 "Beach view" 1
## 203 "Bluetooth sound system" 1
## 204 "Bosch oven" 1
## 205 "Bosch stove" 1
## 206 "Bosch stainless steel electric stove" 1
## 207 "Bosch stainless steel oven" 1
## 208 "Bose Sound Bar Bluetooth sound system" 1
## 209 "Children\\u2019s books and toys for ages 0-2 years old 1
## 210 "Children\\u2019s books and toys" 1
## 211 "Children\\u2019s dinnerware" 1
## 212 "Clothing storage: dresser" 1
## 213 "Clothing storage: walk-in closet" 1
## 214 "Clothing storage: wardrobe 1
## 215 "Clothing storage"] 1
## 216 "Coffee maker: Nespresso 1
## 217 "Dcs stainless steel gas stove" 1
## 218 "Dr. Bronner's Pure Castille body soap" 1
## 219 "Dryer \\u2013 In building" 1
## 220 "Eco Friendly conditioner" 1
## 221 "EO lavender body soap" 1
## 222 "Exercise equipment: free weights 1
## 223 "Exercise equipment: yoga mat" 1
## 224 "Fast wifi \\u2013 124 Mbps" 1
## 225 "Fast wifi \\u2013 259 Mbps" 1
## 226 "Fast wifi \\u2013 275 Mbps" 1
## 227 "Fast wifi \\u2013 328 Mbps" 1
## 228 "Fast wifi \\u2013 337 Mbps" 1
## 229 "Fast wifi \\u2013 458 Mbps" 1
## 230 "Fast wifi \\u2013 564 Mbps" 1
## 231 "Fast wifi \\u2013 78 Mbps" 1
## 232 "Fast wifi \\u2013 86 Mbps" 1
## 233 "Free carport on premises \\u2013 1 space" 1
## 234 "Free driveway parking on premises \\u2013 2 spaces" 1
## 235 "Free dryer" 1
## 236 "Free parking garage on premises \\u2013 1 space" 1
## 237 "Free residential garage on premises \\u2013 1 space" 1
## 238 "Free washer" 1
## 239 "Gaggenau stainless steel gas stove" 1
## 240 "Gas oven oven" 1
## 241 "Gym in building" 1
## 242 "Hair dryer"] 1
## 243 "Hammock" 1
## 244 "Harbor view" 1
## 245 "HDTV with Disney+ 1
## 246 "HDTV with HBO Max 1
## 247 "HDTV with Hulu 1
## 248 "HDTV with Roku" 1
## 249 "HDTV with standard cable" 1
## 250 "Honest Company body soap" 1
## 251 "Honest Company conditioner" 1
## 252 "Honest Company shampoo" 1
## 253 "Lockbox" 1
## 254 "Marina view" 1
## 255 "Multiple brands body soap" 1
## 256 "Multiple brands conditioner" 1
## 257 "Multiple brands shampoo" 1
## 258 "Outdoor kitchen" 1
## 259 "Outdoor shower" 1
## 260 "Outlet covers" 1
## 261 "Pack \\u2019n play/Travel crib - available upon request" 1
## 262 "Paid parking garage off premises" 1
## 263 "Paid parking garage on premises \\u2013 1 space" 1
## 264 "Paid resort access" 1
## 265 "Paid valet parking on premises \\u2013 1 space" 1
## 266 "Pool view" 1
## 267 "Portable air conditioning" 1
## 268 "Private entrance"] 1
## 269 "Private gym in building" 1
## 270 "Private hot tub" 1
## 271 "Public or shared beach access" 1
## 272 "Record player"] 1
## 273 "Samsung gas stove" 1
## 274 "Shared BBQ grill: gas" 1
## 275 "Shared indoor pool - heated 1
## 276 "Shared outdoor kitchen" 1
## 277 "Shared pool - available all year 1
## 278 "Shared pool" 1
## 279 "Single oven" 1
## 280 "Smoke alarm"] 1
## 281 "Sonos Bluetooth sound system" 1
## 282 "Sonos Surround Sound Bluetooth sound system" 1
## 283 "Sound system with Bluetooth and aux" 1
## 284 "Standalone high chair - always at the listing" 1
## 285 "Sub zero refrigerator" 1
## 286 "TV with Netflix 1
## 287 "TV with Netflix" 1
## 288 "Valley view" 1
## 289 "Washer \\u2013\\u00a0In building" 1
## 290 "Wifi \\u2013 43 Mbps" 1
## 291 "Window guards" 1
## 292 "Wolf gas stove" 1
## 293 "Wolf oven" 1
## 294 "Wolf refrigerator" 1
## 295 "Wolf stainless steel oven" 1
## 296 ["Building staff" 1
## 297 ["Coffee maker" 1
## 298 ["Crib" 1
## 299 ["Fast wifi \\u2013 322 Mbps" 1
## 300 ["HDTV with Hulu 1
## 301 ["Pool table" 1
## 302 ["Shared sauna" 1
## 303 ["Wifi \\u2013 43 Mbps" 1
## 304 2-5 years old 1
## 305 Amazon Prime Video" 1
## 306 and 5-10 years old" 1
## 307 and closet" 1
## 308 Chromecast 1
## 309 Disney+" 1
## 310 DVD player"] 1
## 311 Fire TV" 1
## 312 french press 1
## 313 Hulu" 1
## 314 Keurig coffee machine" 1
## 315 lap pool 1
## 316 rooftop" 1
## 317 saltwater" 1
## 318 standard cable 1
## 319 treadmill 1
## 320 treadmill" 1
## 321 walk-in closet 1
#write data frame to excel
write_xlsx(ammentiyCounts, "amenity_counts_new_version.xlsx")
# import same airbnb data as above
airbnb_data_2 <- read_csv(file = 'up_to_date_clean_data_version_one_bedroom.csv')
## Rows: 3789 Columns: 76
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (32): host_name, listing_url, name, description, neighborhood_overview, ...
## dbl (43): id, host_id, scrape_id, host_is_superhost_bin, host_listings_count...
## lgl (1): has_availability
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# add row id
airbnb_data_2 <- airbnb_data_2 %>%
mutate(unique_id = row_number())
# Create list of amenities of interest (top 10 amenities based on count of ammentity listed for top revenue listings)
essentials <- c("Smoke", "Wifi", "Hangers", "Carbon", "Hair", "Water", "Iron", "Essentials", "Shampoo", "Heating")
# Duplicate airbnb_data
essential_amenities_df <- data.frame(airbnb_data_2)
# Iterate over amenities in essentials
for (amenity in essentials) {
essential_amenities_df <- essential_amenities_df %>%
# Create boolean column
mutate(!!amenity :=
str_detect(
amenities,
regex(amenity, ignore_case = TRUE)
))
}
# Group by listing
listing_essential_amenities <- essential_amenities_df %>%
group_by(unique_id, id, full_year_expected_revenue) %>%
summarise(
Smoke_Alarm = any(Smoke),
Wifi = any(Wifi),
Hangers = any(Hangers),
Carbon_Monoxide_Alarm = any(Carbon),
Hair_Dryer = any(Hair),
Hot_Water = any(Water),
Iron = any(Iron),
Essentials = any(Essentials),
Shampoo = any(Shampoo),
Heating = any(Heating)
) %>%
pivot_longer(
cols = -unique_id,
names_to = "amenities",
values_to = "listed"
)
## `summarise()` has grouped output by 'unique_id', 'id'. You can override using the `.groups` argument.
# Merge price data
listing_essential_amenities <- listing_essential_amenities %>%
left_join(essential_amenities_df %>% select(id, full_year_expected_revenue, unique_id), by = "unique_id")
write_xlsx(listing_essential_amenities, "amenity_ListingId_revenue_new_updated.xlsx")
# update column type to numeric for listed column
listing_essential_amenities$listed <- as.numeric(listing_essential_amenities$listed)
# Aggregate the data using summarizing function
df_summarized <- listing_essential_amenities %>%
group_by(unique_id, id, full_year_expected_revenue, amenities) %>%
summarise(listed = max(as.integer(listed))) %>%
ungroup()
## `summarise()` has grouped output by 'unique_id', 'id', 'full_year_expected_revenue'. You can override using the `.groups` argument.
# Spread the dataframe to create new columns for each ammentiy
df_spread <- spread(df_summarized, key = amenities, value = listed, fill = 0)
# Print the spread dataframe
print(df_spread)
## # A tibble: 3,789 × 13
## unique_id Carbon_Monoxide_A… Essentials full_year_expecte… Hair_Dryer Hangers
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 47874 1 1
## 2 2 1 0 0 0 1
## 3 3 1 1 22660 1 1
## 4 4 1 1 3400 0 1
## 5 5 1 1 0 1 1
## 6 6 1 1 17050 1 1
## 7 7 1 1 3055 1 1
## 8 8 1 0 0 0 0
## 9 9 1 1 7420 1 1
## 10 10 1 1 66220 1 1
## # … with 3,779 more rows, and 7 more variables: Heating <dbl>, Hot_Water <dbl>,
## # id <dbl>, Iron <dbl>, Shampoo <dbl>, Smoke_Alarm <dbl>, Wifi <dbl>
# Sort the dataset by expected revenue in descending order
sortedairbnb_data2 <- df_spread %>% arrange(desc(full_year_expected_revenue))
# add row id
sortedairbnb_data2 <- sortedairbnb_data2 %>%
mutate(unique_id = row_number())
write_xlsx(sortedairbnb_data2, "ammenties_final_pivot.xlsx")
# Example data frames
airbnb_data3 <- read_csv(file = 'one_bedroom_amenities_joined_Validation_binary_models_sentiment3.csv')
## Rows: 3789 Columns: 117
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (32): host_name, listing_url, name, description, neighborhood_overview, ...
## dbl (84): listing_id, host_id, scrape_id, host_is_superhost_bin, host_listin...
## lgl (1): has_availability
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# change listing_id to not be scientific notation
options(scipen = 999)
airbnb_data3$listing_id <- as.numeric(airbnb_data3$listing_id )
# add groupings for expected revenue
airbnb_data3 <- airbnb_data3 %>%
mutate(Expected_Revenue_Grouping = case_when(
airbnb_data3$full_year_expected_revenue <= 5000 ~ "Low Expected Revenue (< 5,000)",
airbnb_data3$full_year_expected_revenue > 5000 &
airbnb_data3$full_year_expected_revenue <= 10000 ~ "Medium Expected Revenue (< 10,000)",
airbnb_data3$full_year_expected_revenue > 10000 &
airbnb_data3$full_year_expected_revenue <= 50000 ~ "High Expected Revenue (< 50,000)",
airbnb_data3$full_year_expected_revenue > 50000 &
airbnb_data3$full_year_expected_revenue <= 100000 ~ "Very High Expected Revenue ( < 100,000)",
TRUE ~ "Ultra High Expected Revenue ( > 100,000)"
))
reviews_data <- read_csv(file = 'reviews.csv')
## Rows: 332540 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): reviewer_name, comments
## dbl (3): listing_id, id, reviewer_id
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# reviews data frame with comments
reviews_data_new <- data.frame(listing_id = reviews_data$listing_id,
Comments = reviews_data$comments)
# Combine comments based on listing_id, all comments combined by each listing id
combined_df <- reviews_data_new %>%
group_by(listing_id) %>%
summarise(Comments = paste(Comments, collapse = " "))
# Join data frames based on the listing_id column
merged_df <- merge(combined_df, airbnb_data3, by = "listing_id", all.x = TRUE, all.y = TRUE)
# remove breaking space character
merged_df <- merged_df %>%
mutate_all(~ gsub("<br/>", "", .))
write.csv(merged_df, file = "new_revenue_grouping_comments_NEW.csv")
##write_xlsx(combined_df, "new_revenue_grouping_comments.xlsx")