library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggpmisc)
## Loading required package: ggpp
##
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.5 ✓ purrr 0.3.4
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggpp::annotate() masks ggplot2::annotate()
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(wordcloud)
## Loading required package: RColorBrewer
library(leaflet)
library(tidytext)
library(treemap)
#Reading the data
Hotels<-read_csv("/Users/shuaibismail/Documents copy/804/week3/Hotels.csv", show_col_types = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
HotelReviews<-read_csv("/Users/shuaibismail/Documents copy/804/week3/HotelReviews.csv", show_col_types = FALSE)
view(Hotels)
glimpse(Hotels)
## Rows: 6,413
## Columns: 58
## $ business_id <chr> "mKTq1T_IAplDpHUcMzOXkw", "FrMRH…
## $ name <chr> "Tipsy Ryde", "Sin City Party Bu…
## $ address <chr> NA, "4005 W Reno Ave, Ste C", "3…
## $ city <chr> "Gastonia", "Las Vegas", "Montré…
## $ state <chr> "NC", "NV", "QC", "QC", "AZ", "A…
## $ postal_code <chr> "28054", "89118", "H2X 1H6", "H2…
## $ latitude <dbl> 35.25284, 36.09688, 45.51495, 45…
## $ longitude <dbl> -81.15270, -115.19290, -73.56571…
## $ stars <dbl> 3.5, 4.0, 4.5, 3.5, 4.0, 4.5, 2.…
## $ review_count <dbl> 3, 18, 4, 45, 3, 12, 10, 17, 441…
## $ is_open <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ categories <chr> "Hotels & Travel, Transportation…
## $ attributes.BusinessAcceptsCreditCards <lgl> TRUE, TRUE, NA, NA, TRUE, NA, NA…
## $ attributes.BikeParking <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForKids <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BusinessParking <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.ByAppointmentOnly <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsPriceRange2 <dbl> NA, NA, 2, 2, NA, NA, NA, 1, 3, …
## $ attributes.DogsAllowed <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.WiFi <chr> NA, NA, "'free'", "u'free'", NA,…
## $ attributes.RestaurantsAttire <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsTakeOut <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.NoiseLevel <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsReservations <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsGoodForGroups <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HasTV <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Alcohol <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsDelivery <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.OutdoorSeating <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Caters <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.WheelchairAccessible <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.AcceptsInsurance <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsTableService <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Ambience <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForMeal <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HappyHour <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BusinessAcceptsBitcoin <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BYOB <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Corkage <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.GoodForDancing <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.CoatCheck <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BestNights <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Music <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Smoking <chr> NA, NA, NA, NA, NA, "u'no'", NA,…
## $ attributes.DietaryRestrictions <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.DriveThru <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.HairSpecializesIn <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.BYOBCorkage <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.AgesAllowed <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.RestaurantsCounterService <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ attributes.Open24Hours <lgl> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ hours.Monday <chr> "0:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Tuesday <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Wednesday <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Thursday <chr> "9:0-0:0", "8:0-20:0", NA, "0:0-…
## $ hours.Friday <chr> "9:0-2:30", "8:0-20:0", NA, "0:0…
## $ hours.Saturday <chr> "9:0-2:30", "10:0-20:0", NA, "0:…
## $ hours.Sunday <chr> "12:0-0:0", "10:0-20:0", NA, "0:…
Creating a tibble from the Hotels data that excludes irrelevant columns
Hotels <- as_tibble(Hotels)
Hotels %>%
select(-starts_with("hours"), -starts_with("attribute"))
Creating sub-categories for easy analysis
Hotels_pivot<-Hotels%>%
mutate(categories = strsplit(categories, ", ")) %>%
unnest(categories) %>%
arrange(categories) %>%
pivot_wider(names_from = categories,
names_prefix = "category_",
names_repair = "universal",
values_from = categories,
values_fill = 0,
values_fn = length)
## New names:
## * `category_Active Life` -> category_Active.Life
## * `category_Adult Education` -> category_Adult.Education
## * `category_Adult Entertainment` -> category_Adult.Entertainment
## * `category_Aerial Tours` -> category_Aerial.Tours
## * `category_Aircraft Dealers` -> category_Aircraft.Dealers
## * ...
Number of organizations are categorized as “Car Rental” in the State of Nevada?
Hotels_pivot%>%
filter(category_Car.Rental==1)%>%
filter(state=="NV")%>%
select(name, state )%>%
group_by(state)%>%
count(state)%>%
arrange(desc(n))
Number of organizations in each state that have the word “Car Rental” as one of their categories?
Hotels_pivot%>%
filter(category_Car.Rental==1)%>%
select(name, state)%>%
group_by(state)%>%
count(state)%>%
arrange(desc(n))
The number and proportion of Car Rental organizations that are open?
Hotels_pivot %>%
filter(category_Car.Rental==1)%>%
select(name, is_open) %>%
group_by(is_open)%>%
summarise(Count = n())%>%
mutate(is_open=as.factor(is_open), Proportion = round(Count/sum(Count)*100,2)) %>%
arrange(desc(is_open))
Top 5 states in terms of Car Rental average star review scores.
Hotels_pivot %>%
filter(category_Car.Rental==1)%>%
type_convert(cols(stars = col_double(), review_count = col_integer()))%>%
select(state, stars) %>%
group_by(state)%>%
summarize(Average_Stars=mean(stars))%>%
arrange(desc(Average_Stars))%>%
head(5)
Show the names of the organizations (top 10) that have received the greatest number of 5-star reviews. Also show the total number of 5-star reviews that they have received.
Hotels_pivot %>%
filter(stars==5)%>%
type_convert(cols(stars = col_double(), review_count = col_integer()))%>%
select(name, review_count, stars) %>%
group_by(name)%>%
summarize(Number_of_5Stars=sum(stars))%>%
arrange(desc(Number_of_5Stars))%>%
head(10)
Organizations(top 10) appear the greatest number of times in the Hotels dataset Order the businesses by the number of time they appear
Hotels_pivot %>%
group_by(name)%>%
summarize(Count= n())%>%
arrange(desc(Count))%>%
head(10)
Created a tibble dataset from the hotels tibble that includes only the records from Enterprise Rent-A-Car.
Enterprise_A_Car<- Hotels%>%
filter(name=="Enterprise Rent-A-Car")%>%
select(-starts_with("hours"), -starts_with("attribute"))
glimpse(Enterprise_A_Car)
## Rows: 235
## Columns: 12
## $ business_id <chr> "xo0_NBXWjRWuyAEyDbnZ7Q", "AucmLoCNGDkdg8sn6nxv1Q", "LuVF…
## $ name <chr> "Enterprise Rent-A-Car", "Enterprise Rent-A-Car", "Enterp…
## $ address <chr> "4000 International Ln", "9773 State Route 14", "1000 Row…
## $ city <chr> "Madison", "Streetsboro", "Woodbridge", "Tempe", "Madison…
## $ state <chr> "WI", "OH", "ON", "AZ", "WI", "PA", "NV", "OH", "ON", "NV…
## $ postal_code <chr> "53704", "44241", "L4L 5X3", "85282", "53719", "15231", "…
## $ latitude <dbl> 43.13633, 41.25053, 43.78431, 33.40703, 43.05610, 40.4960…
## $ longitude <dbl> -89.34579, -81.36543, -79.56811, -111.95651, -89.49433, -…
## $ stars <dbl> 3.0, 3.5, 2.0, 4.0, 4.0, 3.5, 4.0, 1.0, 4.5, 3.0, 1.5, 2.…
## $ review_count <dbl> 23, 4, 4, 33, 8, 82, 41, 3, 10, 6, 6, 18, 39, 19, 14, 8, …
## $ is_open <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ categories <chr> "Car Rental, Hotels & Travel", "Hotels & Travel, Car Rent…
Joined the Enterprise Rent-A-Car dataset with the HotelReviews dataset. The dataset includes all the Enterprise Rent-A-Car records and their corresponding reviews. All other non-Enterprise Rent-A-Car reviews were excluded from the new dataset
Enterprise_A_Car_Joined<- tibble(inner_join(Enterprise_A_Car, HotelReviews))
## Joining, by = c("business_id", "stars")
glimpse(Enterprise_A_Car_Joined)
## Rows: 271
## Columns: 19
## $ business_id <chr> "LuVF0lyyZ23yx8iH2PrDdA", "7A9CrMSBbXu_KEDRkp4dWA", "7A9C…
## $ name <chr> "Enterprise Rent-A-Car", "Enterprise Rent-A-Car", "Enterp…
## $ address <chr> "1000 Rowntree Dairy Road, Suite 5", "1131 W Broadway Rd"…
## $ city <chr> "Woodbridge", "Tempe", "Tempe", "Tempe", "Madison", "Madi…
## $ state <chr> "ON", "AZ", "AZ", "AZ", "WI", "WI", "WI", "NV", "OH", "OH…
## $ postal_code <chr> "L4L 5X3", "85282", "85282", "85282", "53719", "53719", "…
## $ latitude <dbl> 43.78431, 33.40703, 33.40703, 33.40703, 43.05610, 43.0561…
## $ longitude <dbl> -79.56811, -111.95651, -111.95651, -111.95651, -89.49433,…
## $ stars <dbl> 2, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 2, 5, 5, 5, 5, 5, 5, 4, …
## $ review_count <dbl> 4, 33, 33, 33, 8, 8, 8, 41, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8,…
## $ is_open <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ categories <chr> "Car Rental, Hotels & Travel", "Car Rental, Hotels & Trav…
## $ review_id <chr> "OJHF8iWWnyS3FSVVbn88YQ", "bj8qZWyaBdGmsMPQnAmP7g", "n5JA…
## $ user_id <chr> "h8-0FvOdlgebgDyUJY3iIQ", "EXezs0rAdBzt2v1TUZR4OQ", "ahxI…
## $ useful <dbl> 0, 0, 2, 0, 0, 1, 0, 0, 2, 2, 0, 2, 1, 8, 0, 6, 1, 1, 0, …
## $ funny <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1, 9, 0, 0, 0, …
## $ cool <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 6, 1, 6, 0, 0, 0, …
## $ text <chr> "Annoyed with the representative I had. Never told me the…
## $ date <chr> "4/29/15 15:08", "7/25/14 2:02", "10/15/15 15:10", "8/16/…
Top 10 Enterprise Rent-A-Car locations that have received the most number of reviews. Be sure to show the business_id, address, city, state and number of reviews for each location.
Enterprise_A_Car_Joined%>%
select(business_id, address, city, date, state, review_count)%>%
group_by(business_id,address, city, state, review_count)%>%
#distinct(business_id, .keep_all = TRUE)%>%
summarize(count= n())%>%
#arrange(desc())%>%
head(10)
## `summarise()` has grouped output by 'business_id', 'address', 'city', 'state'. You can override using the `.groups` argument.
Top 10 Ten most frequently used words(other than Stop words “enterprise”, “rental” , “car” or “cars”) on Enterprise Rent-A-Car reviews.
Enterprise_A_Car_Joined%>%
unnest_tokens(word, text) %>% ##Split a text column into words/tokens
filter(!word %in% stop_words$word)%>%
filter(!word %in% c("enterprise", "rental", "car", "cars"))%>%
count(word,sort = TRUE) %>%
ungroup() %>%
mutate(word=factor(word, levels = rev(unique(word))))%>%
head(10)
Enterprise Rent-A-Car locations that have received reviews with negative sentiment scores.
Review_sentiment<-Enterprise_A_Car_Joined %>%
unnest_tokens(word, text) %>% #Splitting the text column into words/tokens
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(review_id) %>%
summarize(sentiment = mean(value),words = n()) %>%
ungroup()%>%
filter(words >= 5)
Review_sentiment%>%
arrange(desc(sentiment)) %>%
top_n(-10, sentiment) %>% ## get the highest sentiment scores
inner_join(Enterprise_A_Car_Joined, by = "review_id") %>%
select(address, city, sentiment,date, text) %>%
head(10)
Enterprise Rent-A-Car locations that have received reviews with positive sentiment scores.
Review_sentiment%>%
arrange(desc(sentiment)) %>%
top_n(10, sentiment) %>% ## get the highest sentiment scores
inner_join(Enterprise_A_Car_Joined, by = "review_id") %>%
select(address, city, sentiment,date, text) %>%
head(10)
Number of reviews for Enterprise Rent-A-Car over time(years) using a line chart.
Enterprise_A_Car_Date<-Enterprise_A_Car_Joined %>%
mutate(date_formatted = mdy_hm(date),
month_formatted=month(date_formatted),
day_formatted=day(date_formatted),
year_formatted=year(date_formatted),
hour_formatted=hour(date_formatted))
Enterprise_A_Car_Date%>%
select(year_formatted)%>%
group_by(year_formatted)%>%
summarise(NumberofReviews = n())%>%
ggplot(aes (x=year_formatted,y=NumberofReviews)) +
geom_line()+
xlim(c(2006, 2020))+
xlab("Year") + ylab("Number of Reviews")
Number of reviews in each State using a column/bar chart.
Enterprise_A_Car_Date%>%
select(state)%>%
group_by(state)%>%
ggplot()+
geom_bar(mapping = aes(state, color=state))
A donut chart that Shows the proportion of businesses that are open vs those that are closed.
#Enterprise_A_Car_Date$label <- paste0(Enterprise_A_Car_Date$is_open, "\n value: ", Enterprise_A_Car_Date$Count)
Enterprise_A_Car_Date%>%
select(is_open)%>%
group_by(is_open)%>%
summarise(Count = n())%>%
mutate(is_open=as.factor(is_open), Proportion = round(Count/sum(Count)*100),CummProp = cumsum(Proportion),ymin = c(0, head(CummProp, n=-1))) %>%
ggplot()+
geom_rect(mapping = aes(ymax=CummProp, ymin=ymin,xmax=4, xmin=3, fill=is_open))+
geom_label(aes(x = 2,y=Proportion, label=Proportion))+
coord_polar(theta="y")+ # Try to remove that to understand how the chart is built initially
xlim(c(-1, 4))+
theme_void()
A column chart that shows the median number of words used in Enterprise Rent-A-Car reviews each month.
Enterprise_A_Car_Date%>%
mutate(NumberOfWords=str_count(text, boundary("word"))) %>%
mutate(MonthName=month(date_formatted, label=TRUE, abbr=FALSE))%>%
group_by(MonthName)%>%
summarize(NumberOfWords=median(NumberOfWords))%>%
ggplot(aes (x=MonthName, y=NumberOfWords, fill=MonthName)) +
geom_col() +
coord_flip()+
xlab("Month") + ylab("Number of Words")
A chart that shows the relationship between Enterprise Rent-A-Car’s funny and useful reviews. Your chart must include values for Adj R2 (or R2), Intercept and slope
formula <- y ~ x
Enterprise_A_Car_Date%>%
ggplot(aes(x=funny, y=useful)) +
geom_point() +
geom_smooth(method='lm', se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
formula <- y ~ poly(x, 3, raw = TRUE)
Enterprise_A_Car_Date%>%
ggplot(aes(x=funny, y=useful)) +
stat_smooth(aes(fill = useful), method = "lm", formula = formula) +
stat_poly_eq(formula = formula,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
geom_abline(aes(intercept=0.574, slope=1.74))+
geom_point()
A wordcloud with to shows the 50 words that are used the most frequently in Enterprise Rent-A-Car reviews.
Word_Cloud = function(x)
{
Enterprise_A_Car_Date %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
filter(!word %in% c("enterprise", "rental", "car", "cars"))%>%
count(word,sort = TRUE) %>%
ungroup() %>%
head(50) %>%
with(wordcloud(word, n, max.words = 50,colors=brewer.pal(9, "Set1")))
}
Word_Cloud(text)
A chart that shows the relative contribution of at least 30 words(15 positive, 15 negative) towards the negative or positive sentiment of Enterprise Rent-A-Car
WordsBarGraph <- function(w) {
word_contribution <- w %>%
unnest_tokens(word, text) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(word) %>%
summarize(word_contribution = sum(value), n=n())
word_contribution %>%
top_n(30, abs(word_contribution)) %>%
mutate(word = reorder(word, word_contribution)) %>%
head(30) %>%
ggplot(aes(word, word_contribution, fill = word_contribution > 0)) +
geom_segment( aes(x=word, xend=word, y=word_contribution, yend=word_contribution), size=1.3, alpha=0.9) +
geom_col(show.legend = FALSE) +
theme_light() +
theme(
legend.position = "none",
panel.border = element_blank(),
) +
coord_flip() +
theme_bw()
}
WordsBarGraph(Enterprise_A_Car_Joined)
A Map that shows all the Enterprise-Rent-A-Car Locations.
pal <- colorFactor(c("red","gray","orange","white","blue"),
domain = unique(Enterprise_A_Car$stars))
### this draws the map
location_map <- leaflet(Enterprise_A_Car) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(
color = ~pal(Enterprise_A_Car$stars),
stroke = FALSE, fillOpacity = 0.5,
lat = Enterprise_A_Car$latitude,
lng = Enterprise_A_Car$longitude,
clusterOptions = markerClusterOptions(),
popup = as.character(Enterprise_A_Car$address))
location_map
Books with 5 star reviews
Hotels_pivot%>%
filter(category_Books==1)%>%
filter(stars==5)%>%
type_convert(cols(stars = col_double(), review_count = col_integer()))%>%
select(name, review_count, stars) %>%
group_by(name)%>%
summarize(Books_with_5Stars=sum(review_count))%>%
arrange(desc(Books_with_5Stars))%>%
treemap(Hotels_pivot,
index="name",
vSize="Books_with_5Stars",
type="index"
)
Top 10 Hotels with 5 star reviews
Hotels_pivot%>%
filter(category_Hotels==1)%>%
filter(stars==5)%>%
type_convert(cols(stars = col_double(), review_count = col_integer()))%>%
select(name, review_count, stars) %>%
group_by(name)%>%
summarize(Number_of_5Stars=sum(review_count))%>%
arrange(desc(Number_of_5Stars))%>%
head(5)%>%
ggplot()+
geom_point(aes(x = Number_of_5Stars, y=name))+
geom_label(aes(x = Number_of_5Stars, y=name, label=Number_of_5Stars))+
ggtitle("Top 10 Hotels with 5 Star Ratings")+
xlab("Number 5 Stars Recieved") + ylab("Name")