515KHotelReviews

Load libraries, data, and create derived variables
Group and Count
Exploratory Charts

Load libraries, data, and create derived variables

# Default figure dimensions (width 9, height 8) applied to every knitr chunk.
knitr::opts_chunk$set(fig.width = 9, fig.height = 8)

# Load the raw hotel review data from the working directory into `full`.
full <- read.csv("Hotel_Reviews.csv")

library(dplyr)
library(tidyr)
library(ggplot2)
library(leaflet)
library(leaflet.extras)
library(grid)
library(gridExtra)
library(scales)

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#Function to extract country from lat/lon (will be switching to getting name from address field)

# from:https://stackoverflow.com/a/14342127
library(sp)
library(rworldmap)

#' Look up a country-level attribute for longitude/latitude points.
#'
#' Each point is overlaid on the rworldmap country polygons and the requested
#' attribute of the containing polygon is returned.
#'
#' @param points Two-column matrix or data.frame of coordinates:
#'   column 1 contains the longitude in degrees,
#'   column 2 contains the latitude in degrees.
#' @param field Name of the map attribute column to return. Defaults to
#'   "ADMIN" (the country name). Other columns include "ISO3" (ISO3 code),
#'   "continent" (6 continent model) and "REGION" (7 continent model).
#' @param resolution Map resolution passed to getMap(). Defaults to "low";
#'   "high" uses the more detailed map from the rworldxtra package.
#' @return One value of `field` per input point, in input order.
#'   NOTE(review): points that fall inside no country polygon (e.g. open
#'   ocean) are expected to come back as NA -- confirm against sp::over().
coords2country <- function(points, field = "ADMIN", resolution = "low") {
    stopifnot(is.character(field), length(field) == 1L)

    countries_sp <- getMap(resolution = resolution)

    # Convert the points to a SpatialPoints object, taking the CRS directly
    # from the rworldmap polygons so both layers share one projection.
    points_sp <- SpatialPoints(points,
                               proj4string = CRS(proj4string(countries_sp)))

    # For each point, fetch the attribute row of the polygon that contains it.
    matched <- over(points_sp, countries_sp)

    if (!field %in% names(matched)) {
        stop("Unknown map attribute: ", field, call. = FALSE)
    }

    matched[[field]]
}
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---


# Create Interval column

# Bin every review score into unit-wide intervals with break points
# 1.5, 2.5, ..., 10.5; include.lowest closes the first interval on the left.
full$Review_Range <- cut(full$Reviewer_Score,
                         breaks = seq(1.5, 10.5, by = 1),
                         include.lowest = TRUE)

# Replace missing coordinates with 0 so every row has a numeric lng/lat pair
# before the country lookup. The 0 values act as a "no coordinates" marker
# that the per-hotel summary later filters on.
full$lng[is.na(full$lng)] <- 0
full$lat[is.na(full$lat)] <- 0
full$Country <- coords2country(cbind(full$lng, full$lat))

# Word counts per review. The rate is NaN for a review with zero words in
# total (0 / 0).
full$Total_Words <- full$Review_Total_Negative_Word_Counts +
    full$Review_Total_Positive_Word_Counts
full$Positive_Word_Rate <- full$Review_Total_Positive_Word_Counts /
    full$Total_Words

Group and Count

#Get stats by unique hotel

# One row per unique hotel with its total positive/negative word counts and
# the share of each, formatted as percent labels (character strings).
hotel.names <- full %>%
    select(Hotel_Name, Hotel_Address, lat, lng, Country, Average_Score,
           Total_Number_of_Reviews, Review_Total_Positive_Word_Counts,
           Review_Total_Negative_Word_Counts) %>%
    # Remove the records without geo coordinates (their missing lat/lng were
    # replaced with 0 when the derived variables were created).
    filter(lat != 0 & lng != 0) %>%
    group_by(Hotel_Name, Hotel_Address, lat, lng, Country, Average_Score,
             Total_Number_of_Reviews) %>%
    summarise(Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
              Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
              Total_Words = Tot_Pos_Words + Tot_Neg_Words,
              Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
              Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words)) %>%
    # summarise() only peels off the last grouping variable; drop the rest so
    # later select()/summarise() calls on hotel.names are not silently grouped.
    ungroup()
# Country is already present on `full` (derived from lat/lng), so it is
# carried through the grouping above instead of being recomputed here.


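# Summarise reviews by hotel country and the derived Review_Range variable.
# Step 1: aggregate word counts and number of reviews per Country / Review_Range
# Step 2: country/NumberHotels (find the distinct count of Hotel_Name)
# Step 3: ggplot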
#__________________________________________________________________________________________
# One row per Country x Review_Range combination: total positive/negative
# words, their shares as percent labels (character strings), and the number
# of reviews in the bucket.
# NOTE: unlike hotel.names, rows with placeholder 0 coordinates are not
# filtered out here; they keep whatever Country the lookup assigned to them.
country.review_range <- full %>%
    select(Country, Review_Range, Total_Number_of_Reviews,
           Review_Total_Positive_Word_Counts,
           Review_Total_Negative_Word_Counts, Hotel_Name) %>%
    group_by(Country, Review_Range) %>%
    summarise(Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
              Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
              Total_Words = Tot_Pos_Words + Tot_Neg_Words,
              Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
              Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words),
              Num_Reviews = n()) %>%
    # summarise() leaves the result grouped by Country; drop that grouping so
    # downstream verbs operate on the whole table.
    ungroup()

# Summary per Review_Range bucket across all reviews: word totals, positive
# and negative word shares (percent labels), review count, and the average
# number of words per review formatted to 4 significant digits.
review_range <- full %>%
    select(
        Review_Range,
        Total_Number_of_Reviews,
        Review_Total_Positive_Word_Counts,
        Review_Total_Negative_Word_Counts,
        Hotel_Name
    ) %>%
    group_by(Review_Range) %>%
    summarise(
        Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
        Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
        Total_Words = Tot_Pos_Words + Tot_Neg_Words,
        Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
        Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words),
        Num_Reviews = n(),
        Avg_Words_Per_Review = format(Total_Words / Num_Reviews, digits = 4)
    )

#Get stats by country
# Country-level roll-up of the per-hotel table: mean hotel score, word totals,
# positive/negative word shares (percent labels), hotel count and the summed
# Total_Number_of_Reviews.
country.stats <- hotel.names %>%
    # hotel.names may still carry grouping from its own summarise(); without
    # this, select() re-adds the grouping columns ("Adding missing grouping
    # variables: ...").
    ungroup() %>%
    select(Country, Average_Score, Total_Number_of_Reviews, Tot_Pos_Words,
           Tot_Neg_Words, Total_Words) %>%
    group_by(Country) %>%
    summarise(Avg_Hotel_Review = mean(Average_Score),
              Positive_Words = sum(Tot_Pos_Words),
              Negative_Words = sum(Tot_Neg_Words),
              # From here on Total_Words refers to the country total, which
              # the two rates below rely on.
              Total_Words = sum(Total_Words),
              Pos_Word_Rate = percent(Positive_Words / Total_Words),
              Neg_Word_Rate = percent(Negative_Words / Total_Words),
              Number_Hotels = n(),
              Total_Number_of_Reviews = sum(Total_Number_of_Reviews))

## Adding missing grouping variables: `Hotel_Name`, `Hotel_Address`, `lat`, `lng`

515KHotelReviews

Neil Kutty

9/16/2017

Load libraries, data, and create derived variables

Group and Count

Exploratory Charts

By Review Score Range

Country Stats

Leaflet Map