Agoda challenge

Loading mapping hotel data

MappingHotelData <- read_excel_allsheets("~/Michal/agoda/mappinghotelsdataset_v2.xlsx")

names(MappingHotelData)

## [1] "Partner1" "Partner2" "examples"

colnames(MappingHotelData$Partner1)

## [1] "p1.key"           "p1.hotel_name"    "p1.city_name"    
## [4] "p1.country_code"  "p1.hotel_address" "p1.star_rating"  
## [7] "p1.postal_code"

head(MappingHotelData$Partner1)

## # A tibble: 6 × 7
##                             p1.key                       p1.hotel_name
##                              <chr>                               <chr>
## 1 5E876BFEA81A39E42E3019FE17303D52                  Elite Grande Hotel
## 2 4F315989358CC0F3F7869F569887743D            Quality Inn West Chester
## 3 A4EEBCBB9932DADE591248DFFFBDC068                 MAP5 Village Resort
## 4 2833BE9FD49A063A36D3DE1E5E28ABC4 Hampton Inn & Suites San Jose Hotel
## 5 F7C20B50AE5C6C807BAABB65B8926F07               Favehotel Daeng Tompo
## 6 C517E32BB18C1E2ED859947B25629103                   Art Cottage Hotel
## # ... with 5 more variables: p1.city_name <chr>, p1.country_code <chr>,
## #   p1.hotel_address <chr>, p1.star_rating <dbl>, p1.postal_code <chr>

Cleaning hotels names

Partner1 <- MappingHotelData$Partner1 %>% 
  mutate(p1.new_hotel_name = tolower(p1.hotel_name)) %>% 
  mutate(p1.new_hotel_name = gsub(" ","",p1.new_hotel_name,ignore.case = TRUE)) %>% 
  mutate(p1.new_hotel_name = gsub("/","",p1.new_hotel_name)) %>% 
  mutate(p1.new_hotel_name = gsub("@","",p1.new_hotel_name)) %>% 
  mutate(p1.new_hotel_name = gsub("\\|","",p1.new_hotel_name))%>%
  mutate(p1.new_hotel_name = gsub("&","",p1.new_hotel_name))%>%
  mutate(p1.new_hotel_name = gsub("[0-9]","",p1.new_hotel_name)) %>% 
  mutate(p1.new_hotel_name = gsub(",","",p1.new_hotel_name)) %>% 
  mutate(p1.new_hotel_name = gsub("-","",p1.new_hotel_name)) %>%
  mutate(p1.new_hotel_name = gsub("(","",p1.new_hotel_name,fixed = T)) %>% 
  mutate(p1.new_hotel_name = gsub(")","",p1.new_hotel_name,fixed = T))
  
Partner2 <- MappingHotelData$Partner2 %>% 
  mutate(p2.new_hotel_name = tolower(p2.hotel_name)) %>% 
  mutate(p2.new_hotel_name = gsub(" ","",p2.new_hotel_name,ignore.case = TRUE)) %>% 
  mutate(p2.new_hotel_name = gsub("/","",p2.new_hotel_name)) %>% 
  mutate(p2.new_hotel_name = gsub("@","",p2.new_hotel_name)) %>% 
  mutate(p2.new_hotel_name = gsub("\\|","",p2.new_hotel_name))%>%
  mutate(p2.new_hotel_name = gsub("&","",p2.new_hotel_name))%>%
  mutate(p2.new_hotel_name = gsub("[0-9]","",p2.new_hotel_name)) %>% 
  mutate(p2.new_hotel_name = gsub(",","",p2.new_hotel_name)) %>% 
  mutate(p2.new_hotel_name = gsub("-","",p2.new_hotel_name)) %>%
  mutate(p2.new_hotel_name = gsub("(","",p2.new_hotel_name,fixed = T)) %>% 
  mutate(p2.new_hotel_name = gsub(")","",p2.new_hotel_name,fixed = T))

New hotels name:

## [1] "elitegrandehotel"             "qualityinnwestchester"       
## [3] "mapvillageresort"             "hamptoninnsuitessanjosehotel"
## [5] "favehoteldaengtompo"          "artcottagehotel"

Cleaning hotels address

Partner1 <- Partner1 %>% 
  mutate(p1.new_hotel_add = tolower(p1.hotel_address)) %>% 
  mutate(p1.new_hotel_add = gsub(" ","",p1.new_hotel_add)) %>% 
  mutate(p1.new_hotel_add = gsub("/","",p1.new_hotel_add)) %>% 
  mutate(p1.new_hotel_add = gsub("@","",p1.new_hotel_add)) %>% 
  mutate(p1.new_hotel_add = gsub("\\|","",p1.new_hotel_add))%>%
  mutate(p1.new_hotel_add = gsub("&","",p1.new_hotel_add))%>%
  mutate(p1.new_hotel_add = gsub(",","",p1.new_hotel_add)) %>% 
  mutate(p1.new_hotel_add = gsub("-","",p1.new_hotel_add)) %>%
  mutate(p1.new_hotel_add = gsub("(","",p1.new_hotel_add,fixed = T)) %>% 
  mutate(p1.new_hotel_add = gsub(")","",p1.new_hotel_add,fixed = T)) %>% 
  mutate(p1.new_hotel_add = gsub(".","",p1.new_hotel_add,fixed = T)) %>% 
  mutate(p1.new_hotel_add = gsub("\\","",p1.new_hotel_add,fixed=T))
 
Partner2 <- Partner2 %>% 
  mutate(p2.new_hotel_add = tolower(p2.hotel_address)) %>% 
  mutate(p2.new_hotel_add = gsub(" ","",p2.new_hotel_add)) %>% 
  mutate(p2.new_hotel_add = gsub("/","",p2.new_hotel_add)) %>% 
  mutate(p2.new_hotel_add = gsub("@","",p2.new_hotel_add)) %>% 
  mutate(p2.new_hotel_add = gsub("\\|","",p2.new_hotel_add))%>%
  mutate(p2.new_hotel_add = gsub("&","",p2.new_hotel_add))%>%
  mutate(p2.new_hotel_add = gsub(",","",p2.new_hotel_add)) %>% 
  mutate(p2.new_hotel_add = gsub("-","",p2.new_hotel_add)) %>%
  mutate(p2.new_hotel_add = gsub("(","",p2.new_hotel_add,fixed = T)) %>% 
  mutate(p2.new_hotel_add = gsub(")","",p2.new_hotel_add,fixed = T)) %>% 
  mutate(p2.new_hotel_add = gsub(".","",p2.new_hotel_add,fixed = T)) %>% 
  mutate(p2.new_hotel_add = gsub("\\","",p2.new_hotel_add,fixed=T))

New hotels address:

## [1] "bldg3378road2845area428"        "8567cincinnatidaytonroad"      
## [3] "vithaldaswadoo"                 "55oldtullyroad"                
## [5] "daengtompostreetnumber28losari" "1517jalanmedanipoh1b1"

Preprocessing cities names only for cities in the US

I have noticed that the cities names in Partner1 include the state name whilst in Partner2 the state name not included. I removed the state name in parentheses from Partner1 list for cities that had only one state( the dataset contains 952 hotels that located in the US only 76 cities appears in more than one state, all the other 876 cities names can reduce to just city name without state name)

Cleaning city names

Partner1 <- Partner1 %>% 
  mutate(p1.new_city = tolower(p1.city_name)) %>% 
  mutate(p1.new_city = gsub(" ","",p1.new_city)) %>% 
  mutate(p1.new_city = gsub("/","",p1.new_city)) %>% 
  mutate(p1.new_city = gsub("@","",p1.new_city)) %>% 
  mutate(p1.new_city = gsub("\\|","",p1.new_city))%>%
  mutate(p1.new_city = gsub("&","",p1.new_city))%>%
  mutate(p1.new_city = gsub(",","",p1.new_city)) %>% 
  mutate(p1.new_city = gsub("-","",p1.new_city)) %>%
  mutate(p1.new_city = gsub("(","",p1.new_city,fixed = T)) %>% 
  mutate(p1.new_city = gsub(")","",p1.new_city,fixed = T)) %>% 
  mutate(p1.new_city = gsub(".","",p1.new_city,fixed = T)) %>%
  mutate(p1.new_city = gsub("[0-9]","",p1.new_city)) %>% 
  mutate(p1.new_city = gsub("\\","",p1.new_city,fixed=T))
 
Partner2 <- Partner2 %>% 
  mutate(p2.new_city = tolower(p2.city_name)) %>% 
  mutate(p2.new_city = gsub(" ","",p2.new_city)) %>% 
  mutate(p2.new_city = gsub("/","",p2.new_city)) %>% 
  mutate(p2.new_city = gsub("@","",p2.new_city)) %>% 
  mutate(p2.new_city = gsub("\\|","",p2.new_city))%>%
  mutate(p2.new_city = gsub("&","",p2.new_city))%>%
  mutate(p2.new_city = gsub(",","",p2.new_city)) %>% 
  mutate(p2.new_city = gsub("-","",p2.new_city)) %>%
  mutate(p2.new_city = gsub("(","",p2.new_city,fixed = T)) %>% 
  mutate(p2.new_city = gsub(")","",p2.new_city,fixed = T)) %>% 
  mutate(p2.new_city = gsub(".","",p2.new_city,fixed = T)) %>%
  mutate(p2.new_city = gsub("[0-9]","",p2.new_city)) %>%
  mutate(p2.new_city = gsub("\\","",p2.new_city,fixed=T))

Concatenate the city name and postal code

Partner1 <- Partner1 %>% 
  mutate(p1.city_postal_code = 
           tolower(paste(p1.new_city,p1.postal_code,sep="_"))) 

Partner2 <- Partner2 %>% 
  mutate(p2.city_postal_code = 
           tolower(paste(p2.new_city,p2.postal_code,sep="_")))

city and postal code:

## [1] "manama_5458"       "westchester_45069" "goa_403512"       
## [4] "sanjose_95111"     "makassar_na"       "ipoh_31400"

Remove frequent words in hotels name and hotel address

Words that appear in 20% or more of the hotels names are being removed, same goes for words in the hotels addresses.

  # Finds  frequent word in hotel names
  corp_hotel_name <- Corpus(VectorSource(c(Partner1$p1.hotel_name,Partner2$p2.hotel_name)))
  dtm_hotel_name <- DocumentTermMatrix(corp_hotel_name)
  freq_hotel_name <- colSums(as.matrix(dtm_hotel_name))
  ord_hotel_name <- order(freq_hotel_name)
  freq_hotel_names <- 
    names(freq_hotel_name[ord_hotel_name][freq_hotel_name[ord_hotel_name] >=0.2*nrow(Partner1)])
  
  # Finds  frequent word in hotel address
  corp_hotel_add <- Corpus(VectorSource(c(Partner2$p2.hotel_address,Partner2$p2.hotel_address)))
  dtm_hotel_add <- DocumentTermMatrix(corp_hotel_add)
  freq_hotel_add <- colSums(as.matrix(dtm_hotel_add))
  ord_hotel_add <- order(freq_hotel_add)
  freq_hotel_add <- 
   names(freq_hotel_add[ord_hotel_add][freq_hotel_add[ord_hotel_add] >=0.2*nrow(Partner1)])

  for ( i in freq_hotel_names){
    Partner1 <- Partner1 %>% 
    mutate(p1.new_hotel_name = gsub(i,"",p1.new_hotel_name,ignore.case = TRUE))
    Partner2 <- Partner2 %>% 
    mutate(p2.new_hotel_name = gsub(i,"",p2.new_hotel_name,ignore.case = TRUE))} 
  

  for ( i in freq_hotel_add){
    Partner1 <- Partner1 %>% 
    mutate(p1.new_hotel_add = gsub(i,"",p1.new_hotel_add,ignore.case = TRUE))
    Partner2 <- Partner2 %>% 
    mutate(p2.new_hotel_add = gsub(i,"",p2.new_hotel_add,ignore.case = TRUE))}

Hotel number in every country:

sort(table(Partner2$p2.country_code))

## 
##  AZ  BO  BT  BW  CV  DK  DO  EE  IS  JM  LT  MD  MK  MT  NC  NF  NG  PG 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  RE  UG  UY  WS  XK  CI  FI  GH  GU  LU  ME  MG  MN  MZ  PK  TO  ZW  BA 
##   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2   2   2   3 
##  CL  ET  KZ  LB  LV  MP  PE  PW  RS  SI  SK  AL  CK  CR  CY  PR  SE  VU 
##   3   3   3   3   3   3   3   3   3   3   3   4   4   4   4   4   4   4 
##  BG  NA  PA  BN  BR  TN  MO  NO  BD  KW  QA  BE  HR  PF  PL  RO  IE  FJ 
##   5   5   5   6   6   6   7   7   9   9   9  10  10  10  10  10  11  12 
##  KE  TZ  CO  EG  UA  BH  OM  AR  SC  JO  MU  MX  PT  MV  CH  MA  AT  NL 
##  12  12  13  13  13  14  15  17  17  18  22  22  24  27  30  33  35  35 
##  CZ  NP  SG  HK  CA  IL  HU  LA  MM  SA  AE  RU  GR  KH  TR  DE  PH  NZ 
##  36  38  39  41  47  49  55  56  65  66  68  74  76 112 121 152 166 185 
##  ZA  LK  KR  TW  ES  MY  FR  IT  GB  VN  AU  JP  ID  IN  CN  TH  US 
## 189 214 226 231 251 264 267 287 360 364 539 557 599 788 878 948 962

Both partners share the same amount of hotel in every counry.

identical(sort(table(Partner2$p2.country_code)),sort(table(Partner2$p2.country_code)))

## [1] TRUE

Results

For each hotel from Partner1 check equality of new_hotel_name, new_hotel_address and new_ct_postal to Partner2. If 2 of new_hotel_name, new_hotel_address and new_ct_postalcan are equal in Partner1 and Partner2 there there ia a match

results <- list()


for ( country in names(sort(table(Partner2$p2.country_code)))){
  
  p1_by_country <- Partner1 %>% filter(p1.country_code==country) 
  p2_by_country <- Partner2 %>% filter(p2.country_code==country)
  if(length(p1_by_country$p1.key)>1){
    tmp1 <- p1_by_country$p1.new_hotel_name
    tmp2 <- p2_by_country$p2.new_hotel_name
 
    matches <- lapply(tmp1, function(x) {grep(x,tmp2)}) 
    matches_len <- lapply(matches, function(x) length(x)) 
 
    tmp_result <- as.data.frame(t(sapply(which(matches_len==1), function(x){
      c(p1_by_country[x,"p1.key"],
      p2_by_country[matches[[x]],"p2.key"])
      })))
 
    rownames(tmp_result) <- NULL

    add1 <- p1_by_country$p1.new_hotel_add
    add2 <- p2_by_country$p2.new_hotel_add

    matches_add <- lapply(add1, function(x) {grep(x,add2)})
    matches_add_len <- lapply(matches_add, function(x) length(x))

    add_result <- as.data.frame(t(sapply(which(matches_add_len==1), function(x){
      c(p1_by_country[x,"p1.key"],
      p2_by_country[matches_add[[x]],"p2.key"])
      })))
  
    rownames(add_result) <- NULL
    ct_post1 <- p1_by_country$p1.city_postal_code
    ct_post2 <- p2_by_country$p2.city_postal_code

    matches_ct_post <- lapply(ct_post1, function(x) {grep(x,ct_post2)})
    matches_ct_post_len <- lapply(matches_ct_post, function(x) length(x))
  
    ct_post_result <- as.data.frame(t(sapply(which(matches_ct_post_len ==1), function(x){
      c(p1_by_country[x,"p1.key"],
      p2_by_country[matches_ct_post[[x]],"p2.key"])
      })))
  
    rownames(ct_post_result) <- NULL
    all_matches <- rbind(rbind(tmp_result,add_result),ct_post_result)
    all_matches$p1.key = as.character(all_matches$p1.key); all_matches$p2.key = as.character(all_matches$p2.key)
    
    results[[country]] <- all_matches %>% arrange(desc(p1.key)) %>%  
      group_by(p1.key,p2.key) %>% 
      summarise(c=n()) %>%
      filter(c>1) %>%
      select(p1.key,p2.key)

    } else{
      results[[country]] <- as.data.frame(t(c(p1.key=p1_by_country$p1.key,p2.key=p2_by_country$p2.key)))
      results[[country]]$p1.key = as.character(results[[country]]$p1.key)
      results[[country]]$p2.key = as.character(results[[country]]$p2.key)}
  
}

matches_tbl <- bind_rows(results)

head(matches_tbl)

#write.csv(matches_tbl,file = "~/Michal/agoda/mappings.csv")