# Load libraries, data, and create derived variables
# Default figure size for every knitr chunk in this report.
knitr::opts_chunk$set(fig.width = 9, fig.height = 8)
# Read the raw hotel review data from the working directory.
full <- read.csv("Hotel_Reviews.csv")
library(dplyr)
library(tidyr)
library(ggplot2)
library(leaflet)
library(leaflet.extras)
library(grid)
library(gridExtra)
library(scales)
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#Function to extract country from lat/lon (will be switching to getting name from address field)
# from:https://stackoverflow.com/a/14342127
library(sp)
library(rworldmap)
# Look up the country that contains each point.
#
# Args:
#   points: a data.frame (the caller in this file passes a two-column matrix)
#           in which column 1 is the longitude in degrees and column 2 is the
#           latitude in degrees.
#
# Returns:
#   The ADMIN (country name) attribute of the rworldmap polygon matched to
#   each point.
coords2country <- function(points) {
  # Low-resolution world map; getMap(resolution = "high") from rworldxtra is
  # an alternative when more detail is needed.
  world_map <- getMap(resolution = "low")

  # Reuse the map's own CRS for the points instead of spelling out a
  # proj4 string by hand.
  map_crs <- CRS(proj4string(world_map))
  located_points <- SpatialPoints(points, proj4string = map_crs)

  # over() matches every point against the country polygons and returns the
  # attributes of the polygon containing it.
  matched <- over(located_points, world_map)

  # Other attributes available on the match: ISO3 (ISO3 code), continent
  # (6 continent model) and REGION (7 continent model).
  matched$ADMIN
}
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Derived columns on `full`.

# Review_Range: reviewer score bucketed into unit-wide intervals with breaks
# at 1.5, 2.5, ..., 10.5; the lowest break is included in the first interval.
full[["Review_Range"]] <- cut(
  full[["Reviewer_Score"]],
  breaks = seq(1.5, 10.5, by = 1),
  include.lowest = TRUE
)

# Replace missing coordinates with 0 before the country lookup (presumably
# because the spatial lookup cannot take NA coordinates -- confirm). The
# per-hotel summary later treats 0 as "no coordinates".
full[["lng"]] <- replace(full[["lng"]], is.na(full[["lng"]]), 0)
full[["lat"]] <- replace(full[["lat"]], is.na(full[["lat"]]), 0)

# Country: looked up from longitude (column 1) and latitude (column 2).
full[["Country"]] <- coords2country(cbind(full[["lng"]], full[["lat"]]))

# Word counts: total words per review and the share that is positive.
full[["Total_Words"]] <- full[["Review_Total_Negative_Word_Counts"]] +
  full[["Review_Total_Positive_Word_Counts"]]
full[["Positive_Word_Rate"]] <- full[["Review_Total_Positive_Word_Counts"]] /
  full[["Total_Words"]]
# Group and Count
# Per-hotel statistics: one row per unique combination of hotel name,
# address, coordinates, country, average score and total review count, with
# positive/negative word totals summed over that hotel's reviews.
hotel.names <- full %>%
  select(Hotel_Name, Hotel_Address, lat, lng, Country, Average_Score,
         Total_Number_of_Reviews, Review_Total_Positive_Word_Counts,
         Review_Total_Negative_Word_Counts) %>%
  # Remove the records without geo coordinates (17 per the original note);
  # their missing lat/lng were replaced with 0 earlier in the script.
  filter(lat != 0 & lng != 0) %>%
  group_by(Hotel_Name, Hotel_Address, lat, lng, Country, Average_Score,
           Total_Number_of_Reviews) %>%
  summarise(Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
            Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
            # Both totals are already one value per group at this point, so
            # no further sum() is needed.
            Total_Words = Tot_Pos_Words + Tot_Neg_Words,
            # percent() (scales) formats each ratio as a character string.
            Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
            Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words)) %>%
  # summarise() only drops the last grouping level. Remove the remaining
  # grouping so later select()/summarise() calls on hotel.names do not
  # silently re-add the grouping columns.
  ungroup()
# Country is already derived on `full` before this step, so no per-hotel
# coords2country() lookup is needed here.
# Review counts and word totals for every Country x Review_Range combination.
# Follow-up steps noted by the author:
#   Step 2: country/NumberHotels (find count distinct Hotel_Name)
#   Step 3: ggplot
# ------------------------------------------------------------------------------
country.review_range <- full %>%
  select(
    Country, Review_Range, Total_Number_of_Reviews,
    Review_Total_Positive_Word_Counts, Review_Total_Negative_Word_Counts,
    Hotel_Name
  ) %>%
  # The zero-coordinate filter (lat != 0 & lng != 0) is left disabled here,
  # so reviews without geo coordinates are still counted.
  group_by(Country, Review_Range) %>%
  summarize(
    Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
    Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
    Total_Words = sum(Tot_Pos_Words + Tot_Neg_Words),
    # percent() (scales) formats each ratio as a character string.
    Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
    Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words),
    # Number of reviews (rows) in the group.
    Num_Reviews = n()
  )
# Review counts and word totals for every Review_Range bucket, across all
# countries.
review_range <- full %>%
  select(
    Review_Range, Total_Number_of_Reviews,
    Review_Total_Positive_Word_Counts, Review_Total_Negative_Word_Counts,
    Hotel_Name
  ) %>%
  # The zero-coordinate filter (lat != 0 & lng != 0) is left disabled here,
  # so reviews without geo coordinates are still counted.
  group_by(Review_Range) %>%
  summarize(
    Tot_Pos_Words = sum(Review_Total_Positive_Word_Counts),
    Tot_Neg_Words = sum(Review_Total_Negative_Word_Counts),
    Total_Words = sum(Tot_Pos_Words + Tot_Neg_Words),
    # percent() (scales) formats each ratio as a character string.
    Pos_Word_Rate = percent(Tot_Pos_Words / Total_Words),
    Neg_Word_Rate = percent(Tot_Neg_Words / Total_Words),
    # Number of reviews (rows) in the bucket.
    Num_Reviews = n(),
    # Mean words per review, formatted as text with 4 significant digits.
    Avg_Words_Per_Review = format(Total_Words / Num_Reviews, digits = 4)
  )
# Per-country statistics aggregated from the per-hotel table: mean hotel
# score, word totals and rates, number of hotels and total number of reviews.
country.stats <- hotel.names %>%
  # hotel.names may still carry the grouping left over from its summarise().
  # Without this ungroup(), select() re-adds the grouping columns and emits
  # "Adding missing grouping variables: ...".
  ungroup() %>%
  select(Country, Average_Score, Total_Number_of_Reviews, Tot_Pos_Words,
         Tot_Neg_Words, Total_Words) %>%
  group_by(Country) %>%
  summarize(Avg_Hotel_Review = mean(Average_Score),
            Positive_Words = sum(Tot_Pos_Words),
            Negative_Words = sum(Tot_Neg_Words),
            # From here on Total_Words is the per-country total, which is
            # what the two rates below divide by.
            Total_Words = sum(Total_Words),
            # percent() (scales) formats each ratio as a character string.
            Pos_Word_Rate = percent(Positive_Words / Total_Words),
            Neg_Word_Rate = percent(Negative_Words / Total_Words),
            # One row of hotel.names per hotel, so n() counts hotels.
            Number_Hotels = n(),
            Total_Number_of_Reviews = sum(Total_Number_of_Reviews))