library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggpmisc)
## Loading required package: ggpp
## 
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.5     ✓ purrr   0.3.4
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggpp::annotate()         masks ggplot2::annotate()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(wordcloud)
## Loading required package: RColorBrewer
library(leaflet)
library(tidytext)
library(treemap)
#Reading the data 
Hotels<-read_csv("/Users/shuaibismail/Documents copy/804/week3/Hotels.csv", show_col_types = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
HotelReviews<-read_csv("/Users/shuaibismail/Documents copy/804/week3/HotelReviews.csv", show_col_types = FALSE)
view(Hotels)
glimpse(Hotels)
## Rows: 6,413
## Columns: 58
## $ business_id                           <chr> "mKTq1T_IAplDpHUcMzOXkw", "FrMRH…
## $ name                                  <chr> "Tipsy Ryde", "Sin City Party Bu…
## $ address                               <chr> NA, "4005 W Reno Ave, Ste C", "3…
## $ city                                  <chr> "Gastonia", "Las Vegas", "Montré…
## $ state                                 <chr> "NC", "NV", "QC", "QC", "AZ", "A…
## $ postal_code                           <chr> "28054", "89118", "H2X 1H6", "H2…
## $ latitude                              <dbl> 35.25284, 36.09688, 45.51495, 45…
## $ longitude                             <dbl> -81.15270, -115.19290, -73.56571…
## $ stars                                 <dbl> 3.5, 4.0, 4.5, 3.5, 4.0, 4.5, 2.…
## $ review_count                          <dbl> 3, 18, 4, 45, 3, 12, 10, 17, 441…
## $ is_open                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ categories                            <chr> "Hotels & Travel, Transportation…
## $ attributes.BusinessAcceptsCreditCards <lgl> TRUE, TRUE, NA, NA, TRUE, NA, NA…
## $ attributes.BikeParking                <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForKids                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BusinessParking            <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.ByAppointmentOnly          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsPriceRange2     <dbl> NA, NA, 2, 2, NA, NA, NA, 1, 3, …
## $ attributes.DogsAllowed                <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.WiFi                       <chr> NA, NA, "'free'", "u'free'", NA,…
## $ attributes.RestaurantsAttire          <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsTakeOut         <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.NoiseLevel                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsReservations    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsGoodForGroups   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HasTV                      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Alcohol                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsDelivery        <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.OutdoorSeating             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Caters                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.WheelchairAccessible       <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.AcceptsInsurance           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsTableService    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Ambience                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForMeal                <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HappyHour                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BusinessAcceptsBitcoin     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BYOB                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Corkage                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForDancing             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.CoatCheck                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BestNights                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Music                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Smoking                    <chr> NA, NA, NA, NA, NA, "u'no'", NA,…
## $ attributes.DietaryRestrictions        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.DriveThru                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HairSpecializesIn          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BYOBCorkage                <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.AgesAllowed                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsCounterService  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Open24Hours                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ hours.Monday                          <chr> "0:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Tuesday                         <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Wednesday                       <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Thursday                        <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Friday                          <chr> "9:0-2:30", "8:0-20:0", NA, "0:0…
## $ hours.Saturday                        <chr> "9:0-2:30", "10:0-20:0", NA, "0:…
## $ hours.Sunday                          <chr> "12:0-0:0", "10:0-20:0", NA, "0:…

Creating a tibble from the Hotels data that excludes irrelevant columns

Hotels <- as_tibble(Hotels)

Hotels %>%
  select(-starts_with("hours"), -starts_with("attribute"))

Creating sub-categories for easy analysis

Hotels_pivot<-Hotels%>%
       mutate(categories = strsplit(categories, ", ")) %>%
       unnest(categories) %>% 
       arrange(categories) %>%  
       pivot_wider(names_from = categories,
            names_prefix = "category_",
            names_repair = "universal", 
            values_from = categories, 
            values_fill = 0, 
            values_fn = length)
## New names:
## * `category_Active Life` -> category_Active.Life
## * `category_Adult Education` -> category_Adult.Education
## * `category_Adult Entertainment` -> category_Adult.Entertainment
## * `category_Aerial Tours` -> category_Aerial.Tours
## * `category_Aircraft Dealers` -> category_Aircraft.Dealers
## * ...

Number of organizations are categorized as “Car Rental” in the State of Nevada?

Hotels_pivot%>%
  filter(category_Car.Rental==1)%>%
  filter(state=="NV")%>%
  select(name, state )%>%
  group_by(state)%>%
  count(state)%>%
  arrange(desc(n))

Number of organizations in each state that have the word “Car Rental” as one of their categories?

 Hotels_pivot%>%
  filter(category_Car.Rental==1)%>%
  select(name, state)%>%
  group_by(state)%>%
  count(state)%>%
  arrange(desc(n))

The number and proportion of Car Rental organizations that are open?

Hotels_pivot %>%
  filter(category_Car.Rental==1)%>%
  select(name, is_open) %>%
  group_by(is_open)%>%
  summarise(Count = n())%>%
  mutate(is_open=as.factor(is_open), Proportion = round(Count/sum(Count)*100,2)) %>%
  arrange(desc(is_open))

Top 5 states in terms of Car Rental average star review scores.

Hotels_pivot %>%
  filter(category_Car.Rental==1)%>%
  type_convert(cols(stars = col_double(), review_count = col_integer()))%>% 
  select(state, stars) %>%
  group_by(state)%>%
  summarize(Average_Stars=mean(stars))%>%
  arrange(desc(Average_Stars))%>%
  head(5)

Show the names of the organizations (top 10) that have received the greatest number of 5-star reviews. Also show the total number of 5-star reviews that they have received.

Hotels_pivot %>%
  filter(stars==5)%>%
  type_convert(cols(stars = col_double(), review_count = col_integer()))%>% 
  select(name, review_count, stars) %>%
  group_by(name)%>%
  summarize(Number_of_5Stars=sum(stars))%>%
  arrange(desc(Number_of_5Stars))%>%
  head(10)

Organizations(top 10) appear the greatest number of times in the Hotels dataset Order the businesses by the number of time they appear

Hotels_pivot %>%
  group_by(name)%>%
  summarize(Count= n())%>%
  arrange(desc(Count))%>%
  head(10)

Created a tibble dataset from the hotels tibble that includes only the records from Enterprise Rent-A-Car.

Enterprise_A_Car<- Hotels%>%
  filter(name=="Enterprise Rent-A-Car")%>%
  select(-starts_with("hours"), -starts_with("attribute"))

glimpse(Enterprise_A_Car)
## Rows: 235
## Columns: 12
## $ business_id  <chr> "xo0_NBXWjRWuyAEyDbnZ7Q", "AucmLoCNGDkdg8sn6nxv1Q", "LuVF…
## $ name         <chr> "Enterprise Rent-A-Car", "Enterprise Rent-A-Car", "Enterp…
## $ address      <chr> "4000 International Ln", "9773 State Route 14", "1000 Row…
## $ city         <chr> "Madison", "Streetsboro", "Woodbridge", "Tempe", "Madison…
## $ state        <chr> "WI", "OH", "ON", "AZ", "WI", "PA", "NV", "OH", "ON", "NV…
## $ postal_code  <chr> "53704", "44241", "L4L 5X3", "85282", "53719", "15231", "…
## $ latitude     <dbl> 43.13633, 41.25053, 43.78431, 33.40703, 43.05610, 40.4960…
## $ longitude    <dbl> -89.34579, -81.36543, -79.56811, -111.95651, -89.49433, -…
## $ stars        <dbl> 3.0, 3.5, 2.0, 4.0, 4.0, 3.5, 4.0, 1.0, 4.5, 3.0, 1.5, 2.…
## $ review_count <dbl> 23, 4, 4, 33, 8, 82, 41, 3, 10, 6, 6, 18, 39, 19, 14, 8, …
## $ is_open      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ categories   <chr> "Car Rental, Hotels & Travel", "Hotels & Travel, Car Rent…

Joined the Enterprise Rent-A-Car dataset with the HotelReviews dataset. The dataset includes all the Enterprise Rent-A-Car records and their corresponding reviews. All other non-Enterprise Rent-A-Car reviews were excluded from the new dataset

Enterprise_A_Car_Joined<- tibble(inner_join(Enterprise_A_Car, HotelReviews))
## Joining, by = c("business_id", "stars")
glimpse(Enterprise_A_Car_Joined)
## Rows: 271
## Columns: 19
## $ business_id  <chr> "LuVF0lyyZ23yx8iH2PrDdA", "7A9CrMSBbXu_KEDRkp4dWA", "7A9C…
## $ name         <chr> "Enterprise Rent-A-Car", "Enterprise Rent-A-Car", "Enterp…
## $ address      <chr> "1000 Rowntree Dairy Road, Suite 5", "1131 W Broadway Rd"…
## $ city         <chr> "Woodbridge", "Tempe", "Tempe", "Tempe", "Madison", "Madi…
## $ state        <chr> "ON", "AZ", "AZ", "AZ", "WI", "WI", "WI", "NV", "OH", "OH…
## $ postal_code  <chr> "L4L 5X3", "85282", "85282", "85282", "53719", "53719", "…
## $ latitude     <dbl> 43.78431, 33.40703, 33.40703, 33.40703, 43.05610, 43.0561…
## $ longitude    <dbl> -79.56811, -111.95651, -111.95651, -111.95651, -89.49433,…
## $ stars        <dbl> 2, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 2, 5, 5, 5, 5, 5, 5, 4, …
## $ review_count <dbl> 4, 33, 33, 33, 8, 8, 8, 41, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8,…
## $ is_open      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ categories   <chr> "Car Rental, Hotels & Travel", "Car Rental, Hotels & Trav…
## $ review_id    <chr> "OJHF8iWWnyS3FSVVbn88YQ", "bj8qZWyaBdGmsMPQnAmP7g", "n5JA…
## $ user_id      <chr> "h8-0FvOdlgebgDyUJY3iIQ", "EXezs0rAdBzt2v1TUZR4OQ", "ahxI…
## $ useful       <dbl> 0, 0, 2, 0, 0, 1, 0, 0, 2, 2, 0, 2, 1, 8, 0, 6, 1, 1, 0, …
## $ funny        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 9, 0, 0, 0, …
## $ cool         <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 6, 1, 6, 0, 0, 0, …
## $ text         <chr> "Annoyed with the representative I had. Never told me the…
## $ date         <chr> "4/29/15 15:08", "7/25/14 2:02", "10/15/15 15:10", "8/16/…

Top 10 Enterprise Rent-A-Car locations that have received the most number of reviews. Be sure to show the business_id, address, city, state and number of reviews for each location.

Enterprise_A_Car_Joined%>%
  select(business_id, address, city, date, state, review_count)%>%
  group_by(business_id,address, city, state, review_count)%>%
  #distinct(business_id, .keep_all = TRUE)%>%
  summarize(count= n())%>%
  #arrange(desc())%>%
  head(10)
## `summarise()` has grouped output by 'business_id', 'address', 'city', 'state'. You can override using the `.groups` argument.

Top 10 Ten most frequently used words(other than Stop words “enterprise”, “rental” , “car” or “cars”) on Enterprise Rent-A-Car reviews.

Enterprise_A_Car_Joined%>%
  unnest_tokens(word, text) %>% ##Split a text column into words/tokens
  filter(!word %in% stop_words$word)%>%
  filter(!word %in% c("enterprise", "rental", "car", "cars"))%>%
  count(word,sort = TRUE) %>%
  ungroup() %>%
  mutate(word=factor(word, levels = rev(unique(word))))%>%
  head(10)

Enterprise Rent-A-Car locations that have received reviews with negative sentiment scores.

Review_sentiment<-Enterprise_A_Car_Joined %>%
  unnest_tokens(word, text) %>%  #Splitting the text column into words/tokens
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(review_id) %>%
  summarize(sentiment = mean(value),words = n()) %>%
  ungroup()%>%
  filter(words >= 5)
Review_sentiment%>%
  arrange(desc(sentiment)) %>%
  top_n(-10, sentiment) %>% ## get the highest sentiment scores
  inner_join(Enterprise_A_Car_Joined, by = "review_id") %>%
  select(address, city, sentiment,date, text) %>%
  head(10)

Enterprise Rent-A-Car locations that have received reviews with positive sentiment scores.

Review_sentiment%>%
  arrange(desc(sentiment)) %>%
  top_n(10, sentiment) %>% ## get the highest sentiment scores
  inner_join(Enterprise_A_Car_Joined, by = "review_id") %>%
  select(address, city, sentiment,date, text) %>%
  head(10)

Number of reviews for Enterprise Rent-A-Car over time(years) using a line chart.

Enterprise_A_Car_Date<-Enterprise_A_Car_Joined %>%
 mutate(date_formatted = mdy_hm(date),
 month_formatted=month(date_formatted),
 day_formatted=day(date_formatted),
 year_formatted=year(date_formatted),
 hour_formatted=hour(date_formatted))
Enterprise_A_Car_Date%>%
  select(year_formatted)%>%
  group_by(year_formatted)%>%
  summarise(NumberofReviews = n())%>%
  ggplot(aes (x=year_formatted,y=NumberofReviews)) +
  geom_line()+
  xlim(c(2006, 2020))+
  xlab("Year") + ylab("Number of Reviews")

Number of reviews in each State using a column/bar chart.

Enterprise_A_Car_Date%>%
  select(state)%>%
  group_by(state)%>%
  ggplot()+
  geom_bar(mapping = aes(state, color=state))

A donut chart that Shows the proportion of businesses that are open vs those that are closed.

#Enterprise_A_Car_Date$label <- paste0(Enterprise_A_Car_Date$is_open, "\n value: ", Enterprise_A_Car_Date$Count)

Enterprise_A_Car_Date%>%
  select(is_open)%>%
  group_by(is_open)%>%
  summarise(Count = n())%>%
  mutate(is_open=as.factor(is_open), Proportion = round(Count/sum(Count)*100),CummProp = cumsum(Proportion),ymin = c(0, head(CummProp, n=-1))) %>%
  ggplot()+
  geom_rect(mapping = aes(ymax=CummProp, ymin=ymin,xmax=4, xmin=3, fill=is_open))+
  geom_label(aes(x = 2,y=Proportion, label=Proportion))+
  coord_polar(theta="y")+ # Try to remove that to understand how the chart is built initially
  xlim(c(-1, 4))+
  theme_void() 

A column chart that shows the median number of words used in Enterprise Rent-A-Car reviews each month.

Enterprise_A_Car_Date%>%
  mutate(NumberOfWords=str_count(text, boundary("word"))) %>%
  mutate(MonthName=month(date_formatted, label=TRUE, abbr=FALSE))%>%
  group_by(MonthName)%>%
  summarize(NumberOfWords=median(NumberOfWords))%>%
  ggplot(aes (x=MonthName, y=NumberOfWords, fill=MonthName)) +
  geom_col() +
  coord_flip()+
  xlab("Month") + ylab("Number of Words")

A chart that shows the relationship between Enterprise Rent-A-Car’s funny and useful reviews. Your chart must include values for Adj R2 (or R2), Intercept and slope

formula <- y ~ x
Enterprise_A_Car_Date%>%
ggplot(aes(x=funny, y=useful)) + 
geom_point() +
geom_smooth(method='lm', se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

formula <- y ~ poly(x, 3, raw = TRUE)

Enterprise_A_Car_Date%>%
ggplot(aes(x=funny, y=useful)) + 
  stat_smooth(aes(fill = useful), method = "lm", formula = formula) +
  stat_poly_eq(formula = formula, 
                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                parse = TRUE) +
  geom_abline(aes(intercept=0.574, slope=1.74))+
  geom_point()

A wordcloud with to shows the 50 words that are used the most frequently in Enterprise Rent-A-Car reviews.

Word_Cloud = function(x)
{
  Enterprise_A_Car_Date %>%
    unnest_tokens(word, text) %>%
    filter(!word %in% stop_words$word) %>%
    filter(!word %in% c("enterprise", "rental", "car", "cars"))%>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    head(50) %>%
    with(wordcloud(word, n, max.words = 50,colors=brewer.pal(9, "Set1")))
}

Word_Cloud(text)

A chart that shows the relative contribution of at least 30 words(15 positive, 15 negative) towards the negative or positive sentiment of Enterprise Rent-A-Car

WordsBarGraph <- function(w) {
  word_contribution <- w %>%
    unnest_tokens(word, text) %>%
    count(word,sort = TRUE) %>%
    ungroup() %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    group_by(word) %>%
    summarize(word_contribution = sum(value), n=n())
  
  word_contribution %>%
    top_n(30, abs(word_contribution)) %>%
    mutate(word = reorder(word, word_contribution)) %>%
    head(30) %>%
    ggplot(aes(word, word_contribution, fill = word_contribution > 0)) +
    geom_segment( aes(x=word, xend=word, y=word_contribution, yend=word_contribution), size=1.3, alpha=0.9) +
    geom_col(show.legend = FALSE) +
    theme_light() +
    theme(
    legend.position = "none",
    panel.border = element_blank(),
  ) +
    coord_flip() + 
    theme_bw()
}

  WordsBarGraph(Enterprise_A_Car_Joined)

A Map that shows all the Enterprise-Rent-A-Car Locations.

pal <- colorFactor(c("red","gray","orange","white","blue"),
                   domain = unique(Enterprise_A_Car$stars))
### this draws the map
location_map <- leaflet(Enterprise_A_Car) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    color = ~pal(Enterprise_A_Car$stars),
    stroke = FALSE, fillOpacity = 0.5,
    lat = Enterprise_A_Car$latitude,
    lng = Enterprise_A_Car$longitude,
    clusterOptions = markerClusterOptions(),
    popup = as.character(Enterprise_A_Car$address))

location_map

Books with 5 star reviews

Hotels_pivot%>%
  filter(category_Books==1)%>%
  filter(stars==5)%>%
  type_convert(cols(stars = col_double(), review_count = col_integer()))%>% 
  select(name, review_count, stars) %>%
  group_by(name)%>%
  summarize(Books_with_5Stars=sum(review_count))%>%
  arrange(desc(Books_with_5Stars))%>%
  treemap(Hotels_pivot,
            index="name",
            vSize="Books_with_5Stars",
            type="index"
            )

Top 10 Hotels with 5 star reviews

Hotels_pivot%>%
  filter(category_Hotels==1)%>%
  filter(stars==5)%>%
  type_convert(cols(stars = col_double(), review_count = col_integer()))%>% 
  select(name, review_count, stars) %>%
  group_by(name)%>%
  summarize(Number_of_5Stars=sum(review_count))%>%
  arrange(desc(Number_of_5Stars))%>%
  head(5)%>%
  ggplot()+
  geom_point(aes(x = Number_of_5Stars, y=name))+
  geom_label(aes(x = Number_of_5Stars, y=name, label=Number_of_5Stars))+
  ggtitle("Top 10 Hotels with 5 Star Ratings")+
  xlab("Number 5 Stars Recieved") + ylab("Name")