# Some of the packages are not available on CRAN and can only be installed from GitHub
# Not all of these are needed at this point - need to include only relevant packages
#___________________________________________________________________
# Make sure pacman is available, then attach it. requireNamespace() only
# checks whether the package is installed; library() errors loudly if the
# installation did not succeed instead of silently returning FALSE.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# plyr has to be attached BEFORE dplyr. When it is attached afterwards, plyr's
# mutate(), summarise(), arrange(), ... mask the dplyr verbs of the same name
# and dplyr-only helpers such as across() stop working inside mutate().
pacman::p_load(
  plyr, dplyr, kohonen, datasets, grid, tidyverse, ggplot2, lubridate, arules
)
pacman::p_load(
  here, data.table, purrrlyr,
  tidyverse, simplevis, gt, gtsummary, flextable, SmartEDA, DataExplorer, DT,
  expss, ggcal, vtree, inspectdf,
  lubridate, janitor, forcats, fastDummies, units,
  tsibble, feasts, fable,
  sf, raster, mapproj, tmap, tmaptools, mapdeck, leaflet, leafgl, rgeoda,
  osmplotr, osmdata,
  exactextractr, geomerge, hereR, ggmap,
  kableExtra, knitr,
  colourvalues, viridis,
  readxl, rio, fst,
  tictoc, beepr,
  ggfortify, gganimate,
  grateful
)
# Read the gzip-compressed listings file and show the class of the result.
# NOTE(review): the directory below is an absolute, machine-specific path;
# it has to be adjusted before the script can run on another computer.
cpt_listings_in <- read_csv(
  here(
    "/media/sebnem/E2B08206B081E201/Users/01438475/Google Drive/UCTcourses/datasets/Datasets/spatial",
    "listings.csv.gz"
  )
)
class(cpt_listings_in)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
At this stage, the following code cleans the dataset and generates new variables from the existing ones. For example, if you look at the price variable, you will see that it contains a $ sign and some commas. If you are going to use the price variable, you need to remove these characters and convert it to numeric.
# Clean the raw listings and derive the helper columns used further down:
#   price_clean      price without "$" and ",", as numeric
#   bathrooms        numeric value parsed from bathrooms_text
#   haccrclean       host_acceptance_rate without "%", as numeric
#   hresponserclean  host_response_rate without "%", as numeric
#   WARDNO           numeric second part of neighbourhood_cleansed
#   fullyBooked      TRUE when availability_365 equals 0
cpt_listings_in <- cpt_listings_in %>%
  mutate(
    # Clean price: remove the dollar sign and the commas.
    price_clean = as.numeric(str_remove_all(price, "\\$|,")),
    # The half-bath labels contain no digit, so map them to 0.5 up front
    # (stored as the string "0.5" because bathrooms_text is character).
    bathrooms_text = replace(
      bathrooms_text,
      bathrooms_text %in% c("Half-bath", "Shared half-bath", "Private half-bath"),
      0.5
    ),
    # Reduce e.g. "2 shared baths" to "2 bath".
    # NOTE(review): the "|s" alternative deletes EVERY lowercase "s", not just
    # the plural ending; kept as-is to leave the results unchanged.
    bathrooms_text = str_remove_all(bathrooms_text, "private |shared |s"),
    bathrooms = as.numeric(str_remove_all(bathrooms_text, " bath")),
    # Percentages: drop the "%" sign; non-numeric entries become NA.
    haccrclean = as.numeric(str_remove_all(host_acceptance_rate, "\\%")),
    hresponserclean = as.numeric(str_remove_all(host_response_rate, "\\%"))
  ) %>%
  # Logical columns become character here; fullyBooked is created afterwards
  # and therefore stays logical.
  mutate(across(where(is.logical), as.character)) %>%
  # NOTE(review): the first new column is literally named "Ward " (with a
  # trailing space); kept unchanged in case later code refers to it.
  separate(neighbourhood_cleansed, c("Ward ", "number")) %>%
  mutate(
    WARDNO = as.numeric(number),
    fullyBooked = availability_365 == 0
  )
The following code generates dummy variables for some of the categorical variables. You may or may not need to do this; you can also choose to work with the data you have obtained so far after cleaning.
# Append 0/1 indicator (dummy) columns for every distinct value of `column`.
#
# For each distinct value v of data[[column]] (sorted, NA last) a numeric
# column named "<column>_<v>" is added after the existing columns, and
# `column` itself is moved to the very end. This yields the same column layout
# as the former
#   mutate(var = 1) %>% spread(<column>, var, fill = 0, sep = "_") %>%
#     left_join(<original data>)
# idiom, but without the superseded spread() and without a natural join on
# every shared column. Unlike spread(), the row order is left untouched.
#
# data:   a data frame / tibble.
# column: name of the categorical column, as a single string.
# Returns `data` with the indicator columns added.
add_dummy_columns <- function(data, column) {
  values <- data[[column]]
  lvls <- sort(unique(values), na.last = TRUE)
  # %in% (rather than ==) so that the NA level gets a proper 0/1 column too.
  indicators <- lapply(lvls, function(lvl) as.numeric(values %in% lvl))
  names(indicators) <- paste(column, lvls, sep = "_")
  out <- dplyr::mutate(data, !!!indicators)
  dplyr::relocate(out, dplyr::all_of(column), .after = dplyr::last_col())
}

# Columns to expand, in the order the dummy blocks should be appended.
dummy_source_columns <- c(
  "room_type", "property_type", "instant_bookable", "fullyBooked",
  "host_is_superhost"
)
# Reduce() feeds the running data frame through add_dummy_columns() once per
# column name, starting from cpt_listings_in.
cpt_listings_in <- Reduce(
  add_dummy_columns, dummy_source_columns, cpt_listings_in
)
names(cpt_listings_in)
[1] “id”
[2] “listing_url”
[3] “scrape_id”
[4] “last_scraped”
[5] “name”
[6] “description”
[7] “neighborhood_overview”
[8] “picture_url”
[9] “host_id”
[10] “host_url”
[11] “host_name”
[12] “host_since”
[13] “host_location”
[14] “host_about”
[15] “host_response_time”
[16] “host_response_rate”
[17] “host_acceptance_rate”
[18] “host_thumbnail_url”
[19] “host_picture_url”
[20] “host_neighbourhood”
[21] “host_listings_count”
[22] “host_total_listings_count”
[23] “host_verifications”
[24] “host_has_profile_pic”
[25] “host_identity_verified”
[26] “neighbourhood”
[27] “Ward”
[28] “number”
[29] “neighbourhood_group_cleansed”
[30] “latitude”
[31] “longitude”
[32] “accommodates”
[33] “bathrooms”
[34] “bathrooms_text”
[35] “bedrooms”
[36] “beds”
[37] “amenities”
[38] “price”
[39] “minimum_nights”
[40] “maximum_nights”
[41] “minimum_minimum_nights”
[42] “maximum_minimum_nights”
[43] “minimum_maximum_nights”
[44] “maximum_maximum_nights”
[45] “minimum_nights_avg_ntm”
[46] “maximum_nights_avg_ntm”
[47] “calendar_updated”
[48] “has_availability”
[49] “availability_30”
[50] “availability_60”
[51] “availability_90”
[52] “availability_365”
[53] “calendar_last_scraped”
[54] “number_of_reviews”
[55] “number_of_reviews_ltm”
[56] “number_of_reviews_l30d”
[57] “first_review”
[58] “last_review”
[59] “review_scores_rating”
[60] “review_scores_accuracy”
[61] “review_scores_cleanliness”
[62] “review_scores_checkin”
[63] “review_scores_communication”
[64] “review_scores_location”
[65] “review_scores_value”
[66] “license”
[67] “calculated_host_listings_count”
[68] “calculated_host_listings_count_entire_homes”
[69] “calculated_host_listings_count_private_rooms”
[70] “calculated_host_listings_count_shared_rooms”
[71] “reviews_per_month”
[72] “price_clean”
[73] “haccrclean”
[74] “hresponserclean”
[75] “WARDNO”
[76] “room_type_Entire home/apt”
[77] “room_type_Hotel room”
[78] “room_type_Private room”
[79] “room_type_Shared room”
[80] “room_type”
[81] “property_type_Barn”
[82] “property_type_Boat”
[83] “property_type_Camper/RV”
[84] “property_type_Casa particular”
[85] “property_type_Castle”
[86] “property_type_Earth house”
[87] “property_type_Entire bed and breakfast”
[88] “property_type_Entire bungalow”
[89] “property_type_Entire cabin”
[90] “property_type_Entire chalet”
[91] “property_type_Entire condominium (condo)”
[92] “property_type_Entire cottage”
[93] “property_type_Entire guest suite”
[94] “property_type_Entire guesthouse”
[95] “property_type_Entire hostel”
[96] “property_type_Entire loft”
[97] “property_type_Entire place”
[98] “property_type_Entire rental unit”
[99] “property_type_Entire residential home”
[100] “property_type_Entire resort”
[101] “property_type_Entire serviced apartment”
[102] “property_type_Entire townhouse”
[103] “property_type_Entire villa”
[104] “property_type_Farm stay”
[105] “property_type_Floor”
[106] “property_type_Houseboat”
[107] “property_type_Island”
[108] “property_type_Private room”
[109] “property_type_Private room in barn”
[110] “property_type_Private room in bed and breakfast”
[111] “property_type_Private room in boat”
[112] “property_type_Private room in bungalow”
[113] “property_type_Private room in cabin”
[114] “property_type_Private room in camper/rv”
[115] “property_type_Private room in casa particular”
[116] “property_type_Private room in chalet”
[117] “property_type_Private room in condominium (condo)”
[118] “property_type_Private room in cottage”
[119] “property_type_Private room in dome house”
[120] “property_type_Private room in earth house”
[121] “property_type_Private room in farm stay”
[122] “property_type_Private room in guest suite”
[123] “property_type_Private room in guesthouse”
[124] “property_type_Private room in hostel”
[125] “property_type_Private room in hut”
[126] “property_type_Private room in lighthouse”
[127] “property_type_Private room in loft”
[128] “property_type_Private room in minsu”
[129] “property_type_Private room in nature lodge”
[130] “property_type_Private room in rental unit”
[131] “property_type_Private room in residential home”
[132] “property_type_Private room in resort”
[133] “property_type_Private room in serviced apartment”
[134] “property_type_Private room in tent”
[135] “property_type_Private room in tiny house”
[136] “property_type_Private room in townhouse”
[137] “property_type_Private room in villa”
[138] “property_type_Room in aparthotel”
[139] “property_type_Room in bed and breakfast”
[140] “property_type_Room in boutique hotel”
[141] “property_type_Room in guesthouse”
[142] “property_type_Room in heritage hotel”
[143] “property_type_Room in hostel”
[144] “property_type_Room in hotel”
[145] “property_type_Room in nature lodge”
[146] “property_type_Room in serviced apartment”
[147] “property_type_Shared room”
[148] “property_type_Shared room in bed and breakfast”
[149] “property_type_Shared room in boutique hotel”
[150] “property_type_Shared room in bungalow”
[151] “property_type_Shared room in dome house”
[152] “property_type_Shared room in earth house”
[153] “property_type_Shared room in guest suite”
[154] “property_type_Shared room in guesthouse”
[155] “property_type_Shared room in hostel”
[156] “property_type_Shared room in loft”
[157] “property_type_Shared room in nature lodge”
[158] “property_type_Shared room in rental unit”
[159] “property_type_Shared room in residential home”
[160] “property_type_Shared room in serviced apartment”
[161] “property_type_Shared room in villa”
[162] “property_type_Tent”
[163] “property_type_Tiny house”
[164] “property_type_Yurt”
[165] “property_type”
[166] “instant_bookable_FALSE”
[167] “instant_bookable_TRUE”
[168] “instant_bookable”
[169] “fullyBooked_FALSE”
[170] “fullyBooked_TRUE”
[171] “fullyBooked”
[172] “host_is_superhost_FALSE”
[173] “host_is_superhost_TRUE”
[174] “host_is_superhost_NA”
[175] “host_is_superhost”
# Scatter plots of price: log(price_clean) against bedrooms, then
# price_clean against accommodates.
ggplot(cpt_listings_in, aes(x = bedrooms, y = log(price_clean))) +
  geom_point()
ggplot(cpt_listings_in, aes(x = accommodates, y = price_clean)) +
  geom_point()
The following lines of code generate ward-level averages for some of the variables, not all. If there are other variables you want to use at this stage, you are welcome to add them by continuing from the very last line of the code chunk.
# Ward-level averages of selected listing attributes: one row per WARDNO.
#
# The columns taken straight from the listings data (price_clean, accommodates,
# bedrooms, beds) are averaged with na.rm = TRUE; without it a single missing
# value would turn the whole ward's average into NA. A ward in which every
# value is missing yields NaN. The dummy columns were created with 0 as fill
# value, so they contain no NA and their mean is the share of listings in the
# ward that fall into that category.
# dplyr::summarise is spelled out so that plyr::summarise cannot be picked up.
wardLevelAverages <- cpt_listings_in %>%
  group_by(WARDNO) %>%
  dplyr::summarise(
    averageprice = mean(price_clean, na.rm = TRUE),
    # Listings with an unknown superhost status count as non-superhosts here.
    hsuperhostratio = mean(host_is_superhost_TRUE),
    avertype_enthome = mean(`room_type_Entire home/apt`),
    avertype_Hotelroom = mean(`room_type_Hotel room`),
    avertype_Privateroom = mean(`room_type_Private room`),
    avertype_Sharedroom = mean(`room_type_Shared room`),
    aveaccommodates = mean(accommodates, na.rm = TRUE),
    avebedrooms = mean(bedrooms, na.rm = TRUE),
    avebeds = mean(beds, na.rm = TRUE),
    aveinsbook = mean(instant_bookable_TRUE),
    avefbooked = mean(fullyBooked_TRUE)
  )
#devtools::session_info()
# cite_packages() # Write get bib of R packages used