MappingHotelData <- read_excel_allsheets("~/Michal/agoda/mappinghotelsdataset_v2.xlsx")
names(MappingHotelData)
## [1] "Partner1" "Partner2" "examples"
colnames(MappingHotelData$Partner1)
## [1] "p1.key" "p1.hotel_name" "p1.city_name"
## [4] "p1.country_code" "p1.hotel_address" "p1.star_rating"
## [7] "p1.postal_code"
head(MappingHotelData$Partner1)
## # A tibble: 6 × 7
## p1.key p1.hotel_name
## <chr> <chr>
## 1 5E876BFEA81A39E42E3019FE17303D52 Elite Grande Hotel
## 2 4F315989358CC0F3F7869F569887743D Quality Inn West Chester
## 3 A4EEBCBB9932DADE591248DFFFBDC068 MAP5 Village Resort
## 4 2833BE9FD49A063A36D3DE1E5E28ABC4 Hampton Inn & Suites San Jose Hotel
## 5 F7C20B50AE5C6C807BAABB65B8926F07 Favehotel Daeng Tompo
## 6 C517E32BB18C1E2ED859947B25629103 Art Cottage Hotel
## # ... with 5 more variables: p1.city_name <chr>, p1.country_code <chr>,
## # p1.hotel_address <chr>, p1.star_rating <dbl>, p1.postal_code <chr>
Partner1 <- MappingHotelData$Partner1 %>%
mutate(p1.new_hotel_name = tolower(p1.hotel_name)) %>%
mutate(p1.new_hotel_name = gsub(" ","",p1.new_hotel_name,ignore.case = TRUE)) %>%
mutate(p1.new_hotel_name = gsub("/","",p1.new_hotel_name)) %>%
mutate(p1.new_hotel_name = gsub("@","",p1.new_hotel_name)) %>%
mutate(p1.new_hotel_name = gsub("\\|","",p1.new_hotel_name))%>%
mutate(p1.new_hotel_name = gsub("&","",p1.new_hotel_name))%>%
mutate(p1.new_hotel_name = gsub("[0-9]","",p1.new_hotel_name)) %>%
mutate(p1.new_hotel_name = gsub(",","",p1.new_hotel_name)) %>%
mutate(p1.new_hotel_name = gsub("-","",p1.new_hotel_name)) %>%
mutate(p1.new_hotel_name = gsub("(","",p1.new_hotel_name,fixed = T)) %>%
mutate(p1.new_hotel_name = gsub(")","",p1.new_hotel_name,fixed = T))
Partner2 <- MappingHotelData$Partner2 %>%
mutate(p2.new_hotel_name = tolower(p2.hotel_name)) %>%
mutate(p2.new_hotel_name = gsub(" ","",p2.new_hotel_name,ignore.case = TRUE)) %>%
mutate(p2.new_hotel_name = gsub("/","",p2.new_hotel_name)) %>%
mutate(p2.new_hotel_name = gsub("@","",p2.new_hotel_name)) %>%
mutate(p2.new_hotel_name = gsub("\\|","",p2.new_hotel_name))%>%
mutate(p2.new_hotel_name = gsub("&","",p2.new_hotel_name))%>%
mutate(p2.new_hotel_name = gsub("[0-9]","",p2.new_hotel_name)) %>%
mutate(p2.new_hotel_name = gsub(",","",p2.new_hotel_name)) %>%
mutate(p2.new_hotel_name = gsub("-","",p2.new_hotel_name)) %>%
mutate(p2.new_hotel_name = gsub("(","",p2.new_hotel_name,fixed = T)) %>%
mutate(p2.new_hotel_name = gsub(")","",p2.new_hotel_name,fixed = T))
New hotels name:
## [1] "elitegrandehotel" "qualityinnwestchester"
## [3] "mapvillageresort" "hamptoninnsuitessanjosehotel"
## [5] "favehoteldaengtompo" "artcottagehotel"
Partner1 <- Partner1 %>%
mutate(p1.new_hotel_add = tolower(p1.hotel_address)) %>%
mutate(p1.new_hotel_add = gsub(" ","",p1.new_hotel_add)) %>%
mutate(p1.new_hotel_add = gsub("/","",p1.new_hotel_add)) %>%
mutate(p1.new_hotel_add = gsub("@","",p1.new_hotel_add)) %>%
mutate(p1.new_hotel_add = gsub("\\|","",p1.new_hotel_add))%>%
mutate(p1.new_hotel_add = gsub("&","",p1.new_hotel_add))%>%
mutate(p1.new_hotel_add = gsub(",","",p1.new_hotel_add)) %>%
mutate(p1.new_hotel_add = gsub("-","",p1.new_hotel_add)) %>%
mutate(p1.new_hotel_add = gsub("(","",p1.new_hotel_add,fixed = T)) %>%
mutate(p1.new_hotel_add = gsub(")","",p1.new_hotel_add,fixed = T)) %>%
mutate(p1.new_hotel_add = gsub(".","",p1.new_hotel_add,fixed = T)) %>%
mutate(p1.new_hotel_add = gsub("\\","",p1.new_hotel_add,fixed=T))
Partner2 <- Partner2 %>%
mutate(p2.new_hotel_add = tolower(p2.hotel_address)) %>%
mutate(p2.new_hotel_add = gsub(" ","",p2.new_hotel_add)) %>%
mutate(p2.new_hotel_add = gsub("/","",p2.new_hotel_add)) %>%
mutate(p2.new_hotel_add = gsub("@","",p2.new_hotel_add)) %>%
mutate(p2.new_hotel_add = gsub("\\|","",p2.new_hotel_add))%>%
mutate(p2.new_hotel_add = gsub("&","",p2.new_hotel_add))%>%
mutate(p2.new_hotel_add = gsub(",","",p2.new_hotel_add)) %>%
mutate(p2.new_hotel_add = gsub("-","",p2.new_hotel_add)) %>%
mutate(p2.new_hotel_add = gsub("(","",p2.new_hotel_add,fixed = T)) %>%
mutate(p2.new_hotel_add = gsub(")","",p2.new_hotel_add,fixed = T)) %>%
mutate(p2.new_hotel_add = gsub(".","",p2.new_hotel_add,fixed = T)) %>%
mutate(p2.new_hotel_add = gsub("\\","",p2.new_hotel_add,fixed=T))
New hotels address:
## [1] "bldg3378road2845area428" "8567cincinnatidaytonroad"
## [3] "vithaldaswadoo" "55oldtullyroad"
## [5] "daengtompostreetnumber28losari" "1517jalanmedanipoh1b1"
I have noticed that the cities names in Partner1 include the state name whilst in Partner2 the state name not included. I removed the state name in parentheses from Partner1 list for cities that had only one state( the dataset contains 952 hotels that located in the US only 76 cities appears in more than one state, all the other 876 cities names can reduce to just city name without state name)
Partner1 <- Partner1 %>%
mutate(p1.new_city = tolower(p1.city_name)) %>%
mutate(p1.new_city = gsub(" ","",p1.new_city)) %>%
mutate(p1.new_city = gsub("/","",p1.new_city)) %>%
mutate(p1.new_city = gsub("@","",p1.new_city)) %>%
mutate(p1.new_city = gsub("\\|","",p1.new_city))%>%
mutate(p1.new_city = gsub("&","",p1.new_city))%>%
mutate(p1.new_city = gsub(",","",p1.new_city)) %>%
mutate(p1.new_city = gsub("-","",p1.new_city)) %>%
mutate(p1.new_city = gsub("(","",p1.new_city,fixed = T)) %>%
mutate(p1.new_city = gsub(")","",p1.new_city,fixed = T)) %>%
mutate(p1.new_city = gsub(".","",p1.new_city,fixed = T)) %>%
mutate(p1.new_city = gsub("[0-9]","",p1.new_city)) %>%
mutate(p1.new_city = gsub("\\","",p1.new_city,fixed=T))
Partner2 <- Partner2 %>%
mutate(p2.new_city = tolower(p2.city_name)) %>%
mutate(p2.new_city = gsub(" ","",p2.new_city)) %>%
mutate(p2.new_city = gsub("/","",p2.new_city)) %>%
mutate(p2.new_city = gsub("@","",p2.new_city)) %>%
mutate(p2.new_city = gsub("\\|","",p2.new_city))%>%
mutate(p2.new_city = gsub("&","",p2.new_city))%>%
mutate(p2.new_city = gsub(",","",p2.new_city)) %>%
mutate(p2.new_city = gsub("-","",p2.new_city)) %>%
mutate(p2.new_city = gsub("(","",p2.new_city,fixed = T)) %>%
mutate(p2.new_city = gsub(")","",p2.new_city,fixed = T)) %>%
mutate(p2.new_city = gsub(".","",p2.new_city,fixed = T)) %>%
mutate(p2.new_city = gsub("[0-9]","",p2.new_city)) %>%
mutate(p2.new_city = gsub("\\","",p2.new_city,fixed=T))
Partner1 <- Partner1 %>%
mutate(p1.city_postal_code =
tolower(paste(p1.new_city,p1.postal_code,sep="_")))
Partner2 <- Partner2 %>%
mutate(p2.city_postal_code =
tolower(paste(p2.new_city,p2.postal_code,sep="_")))
city and postal code:
## [1] "manama_5458" "westchester_45069" "goa_403512"
## [4] "sanjose_95111" "makassar_na" "ipoh_31400"
Words that appear in 20% or more of the hotels names are being removed, same goes for words in the hotels addresses.
# Finds frequent word in hotel names
corp_hotel_name <- Corpus(VectorSource(c(Partner1$p1.hotel_name,Partner2$p2.hotel_name)))
dtm_hotel_name <- DocumentTermMatrix(corp_hotel_name)
freq_hotel_name <- colSums(as.matrix(dtm_hotel_name))
ord_hotel_name <- order(freq_hotel_name)
freq_hotel_names <-
names(freq_hotel_name[ord_hotel_name][freq_hotel_name[ord_hotel_name] >=0.2*nrow(Partner1)])
# Finds frequent word in hotel address
corp_hotel_add <- Corpus(VectorSource(c(Partner2$p2.hotel_address,Partner2$p2.hotel_address)))
dtm_hotel_add <- DocumentTermMatrix(corp_hotel_add)
freq_hotel_add <- colSums(as.matrix(dtm_hotel_add))
ord_hotel_add <- order(freq_hotel_add)
freq_hotel_add <-
names(freq_hotel_add[ord_hotel_add][freq_hotel_add[ord_hotel_add] >=0.2*nrow(Partner1)])
for ( i in freq_hotel_names){
Partner1 <- Partner1 %>%
mutate(p1.new_hotel_name = gsub(i,"",p1.new_hotel_name,ignore.case = TRUE))
Partner2 <- Partner2 %>%
mutate(p2.new_hotel_name = gsub(i,"",p2.new_hotel_name,ignore.case = TRUE))}
for ( i in freq_hotel_add){
Partner1 <- Partner1 %>%
mutate(p1.new_hotel_add = gsub(i,"",p1.new_hotel_add,ignore.case = TRUE))
Partner2 <- Partner2 %>%
mutate(p2.new_hotel_add = gsub(i,"",p2.new_hotel_add,ignore.case = TRUE))}
sort(table(Partner2$p2.country_code))
##
## AZ BO BT BW CV DK DO EE IS JM LT MD MK MT NC NF NG PG
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## RE UG UY WS XK CI FI GH GU LU ME MG MN MZ PK TO ZW BA
## 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3
## CL ET KZ LB LV MP PE PW RS SI SK AL CK CR CY PR SE VU
## 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4
## BG NA PA BN BR TN MO NO BD KW QA BE HR PF PL RO IE FJ
## 5 5 5 6 6 6 7 7 9 9 9 10 10 10 10 10 11 12
## KE TZ CO EG UA BH OM AR SC JO MU MX PT MV CH MA AT NL
## 12 12 13 13 13 14 15 17 17 18 22 22 24 27 30 33 35 35
## CZ NP SG HK CA IL HU LA MM SA AE RU GR KH TR DE PH NZ
## 36 38 39 41 47 49 55 56 65 66 68 74 76 112 121 152 166 185
## ZA LK KR TW ES MY FR IT GB VN AU JP ID IN CN TH US
## 189 214 226 231 251 264 267 287 360 364 539 557 599 788 878 948 962
For each hotel from Partner1 check equality of new_hotel_name, new_hotel_address and new_ct_postal to Partner2. If 2 of new_hotel_name, new_hotel_address and new_ct_postalcan are equal in Partner1 and Partner2 there there ia a match
results <- list()
for ( country in names(sort(table(Partner2$p2.country_code)))){
p1_by_country <- Partner1 %>% filter(p1.country_code==country)
p2_by_country <- Partner2 %>% filter(p2.country_code==country)
if(length(p1_by_country$p1.key)>1){
tmp1 <- p1_by_country$p1.new_hotel_name
tmp2 <- p2_by_country$p2.new_hotel_name
matches <- lapply(tmp1, function(x) {grep(x,tmp2)})
matches_len <- lapply(matches, function(x) length(x))
tmp_result <- as.data.frame(t(sapply(which(matches_len==1), function(x){
c(p1_by_country[x,"p1.key"],
p2_by_country[matches[[x]],"p2.key"])
})))
rownames(tmp_result) <- NULL
add1 <- p1_by_country$p1.new_hotel_add
add2 <- p2_by_country$p2.new_hotel_add
matches_add <- lapply(add1, function(x) {grep(x,add2)})
matches_add_len <- lapply(matches_add, function(x) length(x))
add_result <- as.data.frame(t(sapply(which(matches_add_len==1), function(x){
c(p1_by_country[x,"p1.key"],
p2_by_country[matches_add[[x]],"p2.key"])
})))
rownames(add_result) <- NULL
ct_post1 <- p1_by_country$p1.city_postal_code
ct_post2 <- p2_by_country$p2.city_postal_code
matches_ct_post <- lapply(ct_post1, function(x) {grep(x,ct_post2)})
matches_ct_post_len <- lapply(matches_ct_post, function(x) length(x))
ct_post_result <- as.data.frame(t(sapply(which(matches_ct_post_len ==1), function(x){
c(p1_by_country[x,"p1.key"],
p2_by_country[matches_ct_post[[x]],"p2.key"])
})))
rownames(ct_post_result) <- NULL
all_matches <- rbind(rbind(tmp_result,add_result),ct_post_result)
all_matches$p1.key = as.character(all_matches$p1.key); all_matches$p2.key = as.character(all_matches$p2.key)
results[[country]] <- all_matches %>% arrange(desc(p1.key)) %>%
group_by(p1.key,p2.key) %>%
summarise(c=n()) %>%
filter(c>1) %>%
select(p1.key,p2.key)
} else{
results[[country]] <- as.data.frame(t(c(p1.key=p1_by_country$p1.key,p2.key=p2_by_country$p2.key)))
results[[country]]$p1.key = as.character(results[[country]]$p1.key)
results[[country]]$p2.key = as.character(results[[country]]$p2.key)}
}
matches_tbl <- bind_rows(results)
head(matches_tbl)
#write.csv(matches_tbl,file = "~/Michal/agoda/mappings.csv")