Code
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(readr)
library(purrr)
library(tibble)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(outliers)
library(KernSmooth)
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
# Load the review log (columns listing_id and date) and hold it as a plain
# data.frame instead of the tibble that read_csv() returns.
reviewsWC <- as.data.frame(
  read_csv("D:/Master of Analytics/Sem 1 2018/MATH2270 - Data Visualisation/Assignment 3/reviews_WC.csv")
)
## Parsed with column specification:
## cols(
## listing_id = col_integer(),
## date = col_date(format = "")
## )
# Load the listings table (one row per listing id) and hold it as a plain
# data.frame instead of the tibble that read_csv() returns.
listingsWC <- as.data.frame(
  read_csv("D:/Master of Analytics/Sem 1 2018/MATH2270 - Data Visualisation/Assignment 3/listings_WC.csv")
)
## Parsed with column specification:
## cols(
## id = col_integer(),
## name = col_character(),
## host_id = col_integer(),
## host_name = col_character(),
## neighbourhood_group = col_character(),
## neighbourhood = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## room_type = col_character(),
## price = col_integer(),
## minimum_nights = col_integer(),
## number_of_reviews = col_integer(),
## last_review = col_character(),
## reviews_per_month = col_double(),
## calculated_host_listings_count = col_integer(),
## availability_365 = col_integer()
## )
# ---- Data preparation -------------------------------------------------------
# Rename the review columns so the join key has the same name ("id") in both
# tables.
colnames(reviewsWC) <- c("id", "review_date")

# Identifier columns are labels, not quantities: store them as factors.
for (col_name in c("id", "host_id")) {
  listingsWC[[col_name]] <- as.factor(listingsWC[[col_name]])
}

# Coerce the measurement columns to numeric, writing each column back to
# itself. The same loop variable must be used on both sides of the assignment:
# a mismatched name would silently overwrite whatever column the previous loop
# left in `col_name` (host_id) and leave these columns unconverted.
numeric_cols <- c(
  "price", "minimum_nights", "number_of_reviews",
  "reviews_per_month", "calculated_host_listings_count",
  "availability_365"
)
for (col_name in numeric_cols) {
  listingsWC[[col_name]] <- as.numeric(listingsWC[[col_name]])
}

listingsWC$room_type <- factor(listingsWC$room_type)
listingsWC$neighbourhood <- factor(listingsWC$neighbourhood)

# NOTE(review): read_csv() parsed last_review as character and this conversion
# has been observed to yield NA, so the values in the CSV are probably not in
# "%Y-%m-%d" format -- confirm against the file and adjust the format string.
listingsWC$last_review <- as.Date(listingsWC$last_review, "%Y-%m-%d")

reviewsWC$review_date <- as.Date(reviewsWC$review_date, "%Y-%m-%d")
reviewsWC$id <- as.factor(reviewsWC$id)

# Left join: keep every review and attach the attributes of its listing.
# Reviews whose id has no match in listingsWC get NA in the listing columns.
data <- merge(
  x = reviewsWC,
  y = listingsWC,
  by = "id",
  all.x = TRUE
)

# neighbourhood_group is not used in the analysis; drop it.
data <- data[, !colnames(data) %in% c("neighbourhood_group")]
str(data)
## 'data.frame': 152548 obs. of 16 variables:
## $ id : Factor w/ 5660 levels "2476","3362",..: 2537 2538 2538 2538 2538 2538 2538 2538 2539 2539 ...
## $ review_date : Date, format: "2016-05-14" "2016-02-14" ...
## $ name : chr "Washington Gorgeous 2 BR Apartment" "Washington Furnished 2 Bedroom Apt." "Washington Furnished 2 Bedroom Apt." "Washington Furnished 2 Bedroom Apt." ...
## $ host_id : num 172 361 361 361 361 361 361 361 361 361 ...
## $ host_name : chr "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" ...
## $ neighbourhood : Factor w/ 39 levels "Brightwood Park, Crestwood, Petworth",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ latitude : num 38.9 38.9 38.9 38.9 38.9 ...
## $ longitude : num -77 -77 -77 -77 -77 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ price : int 299 299 299 299 299 299 299 299 299 299 ...
## $ minimum_nights : int 3 3 3 3 3 3 3 3 1 1 ...
## $ number_of_reviews : int 1 7 7 7 7 7 7 7 2 2 ...
## $ last_review : Date, format: NA NA ...
## $ reviews_per_month : num 0.08 0.46 0.46 0.46 0.46 0.46 0.46 0.46 0.16 0.16 ...
## $ calculated_host_listings_count: int 27 27 27 27 27 27 27 27 27 27 ...
## $ availability_365 : int 172 361 361 361 361 361 361 361 361 361 ...
# Restrict to reviews dated within calendar year 2017. Both bounds are
# inclusive so that 1 January is kept and any later-year reviews are excluded,
# matching the "2017" wording used in the plot titles.
data2017 <- data %>%
  dplyr::filter(
    review_date >= as.Date("2017-01-01"),
    review_date <= as.Date("2017-12-31")
  )

# boxplot() draws the price box plot and also returns its statistics; `$out`
# holds the prices lying beyond the whiskers.
price_boxplt <- boxplot(data2017$price)

# Drop the rows whose price was flagged as an outlier by the box plot.
data_clean <- subset(data2017, !(price %in% price_boxplt$out))
# Price distribution: density-scaled histogram (bin width picked by dpih()),
# a kernel density overlay and a rug, with the x axis limited to 0-300.
p1 <- ggplot(data = data_clean, aes(x = price)) +
  theme_minimal() +
  geom_histogram(
    mapping = aes(x = price, y = ..density..),
    binwidth = dpih(data_clean$price),
    color = "white"
  ) +
  geom_density(fill = "dodgerblue", color = "white", alpha = 0.5) +
  geom_rug() +
  labs(title = "Airbnb price in WC in 2017", x = "Price") +
  scale_x_continuous(limits = c(0, 300))

# Single box plot of price on the same 0-300 range.
p2 <- ggplot(data_clean, aes(x = factor(1), y = price)) +
  geom_boxplot(width = 0.5) +
  scale_y_continuous(limits = c(0, 300))

# Stack the histogram above the box plot. The box plot is flipped to run
# horizontally and its (now meaningless) categorical axis is hidden.
plot_grid(
  p1,
  p2 +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ),
  ncol = 1,
  align = "v",
  rel_heights = c(2, 1)
)
## Warning: Removed 1 rows containing missing values (geom_bar).

# Interactive box plots of price, one box per room type.
p3 <- plot_ly(
  data = data_clean,
  y = ~price,
  color = ~room_type,
  type = "box"
)
p3
# Number of rows in data_clean per neighbourhood, plus each neighbourhood's
# share of all rows as a proportion and as a percentage.
data_clean_sum <- data_clean %>%
  group_by(neighbourhood) %>%
  summarise(count = n())
data_clean_sum$Proportion <- data_clean_sum$count / nrow(data_clean)
data_clean_sum$Percent <- data_clean_sum$Proportion * 100

# Five neighbourhoods with the highest counts. Sorting and taking the first
# five rows works for any number of neighbourhoods and always returns at most
# five rows, unlike a fixed rank threshold, which assumes a specific row count
# and can return more or fewer rows when counts are tied.
data_clean_sum_top5 <- data_clean_sum %>%
  arrange(desc(count)) %>%
  head(n = 5)

# Pie chart of the top five; the axis settings only hide the empty cartesian
# grid that plotly would otherwise draw behind the pie.
p4 <- plot_ly(
  data_clean_sum_top5,
  labels = ~neighbourhood,
  values = ~Percent,
  type = "pie"
) %>%
  layout(
    title = "Top 5 Airbnb Location/Area in WC 2017",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
p4
# Donut chart (pie with a 0.6 hole) of every neighbourhood's percentage share.
# Both axes use the same settings, which hide the grid, zero line and tick
# labels.
p5_axis <- list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
p5 <- plot_ly(data_clean_sum, labels = ~neighbourhood, values = ~Percent) %>%
  add_pie(hole = 0.6) %>%
  layout(
    title = "Percentage of Airbnb Location/Area in WC 2017",
    xaxis = p5_axis,
    yaxis = p5_axis
  )
p5