MATH2270/MATH2237 Assignment 3

Code

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(readr)
library(purrr)
library(tibble)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(outliers)
library(KernSmooth)

## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:MASS':
## 
##     select

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:plotly':
## 
##     subplot

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(cowplot)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

# Load the per-review table from the local CSV export and convert the result
# of read_csv() into a plain data.frame.
reviewsWC <- as.data.frame(
  read_csv("D:/Master of Analytics/Sem 1 2018/MATH2270 - Data Visualisation/Assignment 3/reviews_WC.csv")
)

## Parsed with column specification:
## cols(
##   listing_id = col_integer(),
##   date = col_date(format = "")
## )

# Load the per-listing table from the local CSV export and convert the result
# of read_csv() into a plain data.frame.
listingsWC <- as.data.frame(
  read_csv("D:/Master of Analytics/Sem 1 2018/MATH2270 - Data Visualisation/Assignment 3/listings_WC.csv")
)

## Parsed with column specification:
## cols(
##   id = col_integer(),
##   name = col_character(),
##   host_id = col_integer(),
##   host_name = col_character(),
##   neighbourhood_group = col_character(),
##   neighbourhood = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   room_type = col_character(),
##   price = col_integer(),
##   minimum_nights = col_integer(),
##   number_of_reviews = col_integer(),
##   last_review = col_character(),
##   reviews_per_month = col_double(),
##   calculated_host_listings_count = col_integer(),
##   availability_365 = col_integer()
## )

# Rename the two review columns (originally listing_id, date) so the listing
# key is called "id", the same name as the key column in listingsWC.
names(reviewsWC) <- c("id", "review_date")

# Identifier columns are labels rather than quantities: store them as factors.
id_cols <- c("id", "host_id")
listingsWC[id_cols] <- lapply(listingsWC[id_cols], as.factor)

# Measurement columns are coerced to numeric. Each column is written back to
# itself, so no other column (in particular host_id) is overwritten.
numeric_cols <- c(
  "price", "minimum_nights", "number_of_reviews",
  "reviews_per_month", "calculated_host_listings_count",
  "availability_365"
)
listingsWC[numeric_cols] <- lapply(listingsWC[numeric_cols], as.numeric)

# Listings: categorical attributes become factors and the last-review string
# is parsed as a year-month-day date.
# NOTE(review): the str() output further down shows last_review as NA after
# this conversion -- the CSV may not use the %Y-%m-%d layout; confirm.
listingsWC[["room_type"]] <- factor(listingsWC[["room_type"]])
listingsWC[["neighbourhood"]] <- factor(listingsWC[["neighbourhood"]])
listingsWC[["last_review"]] <- as.Date(
  listingsWC[["last_review"]],
  format = "%Y-%m-%d"
)

# Reviews: make sure the review date is a Date and store the listing key as a
# factor.
reviewsWC[["review_date"]] <- as.Date(
  reviewsWC[["review_date"]],
  format = "%Y-%m-%d"
)
reviewsWC[["id"]] <- as.factor(reviewsWC[["id"]])

# Left join: attach the listing attributes to every review row. all.x = TRUE
# keeps reviews whose id has no matching listing (their listing columns are
# NA).
data <- merge(reviewsWC, listingsWC, by = "id", all.x = TRUE)

# Drop the neighbourhood_group column from the merged table.
data <- data[, !(names(data) %in% "neighbourhood_group")]

# Print the structure of the merged table.
str(data)

## 'data.frame':    152548 obs. of  16 variables:
##  $ id                            : Factor w/ 5660 levels "2476","3362",..: 2537 2538 2538 2538 2538 2538 2538 2538 2539 2539 ...
##  $ review_date                   : Date, format: "2016-05-14" "2016-02-14" ...
##  $ name                          : chr  "Washington Gorgeous 2 BR Apartment" "Washington Furnished 2 Bedroom Apt." "Washington Furnished 2 Bedroom Apt." "Washington Furnished 2 Bedroom Apt." ...
##  $ host_id                       : num  172 361 361 361 361 361 361 361 361 361 ...
##  $ host_name                     : chr  "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" "Heaven In USA Furnished Apartments" ...
##  $ neighbourhood                 : Factor w/ 39 levels "Brightwood Park, Crestwood, Petworth",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ latitude                      : num  38.9 38.9 38.9 38.9 38.9 ...
##  $ longitude                     : num  -77 -77 -77 -77 -77 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ price                         : int  299 299 299 299 299 299 299 299 299 299 ...
##  $ minimum_nights                : int  3 3 3 3 3 3 3 3 1 1 ...
##  $ number_of_reviews             : int  1 7 7 7 7 7 7 7 2 2 ...
##  $ last_review                   : Date, format: NA NA ...
##  $ reviews_per_month             : num  0.08 0.46 0.46 0.46 0.46 0.46 0.46 0.46 0.16 0.16 ...
##  $ calculated_host_listings_count: int  27 27 27 27 27 27 27 27 27 27 ...
##  $ availability_365              : int  172 361 361 361 361 361 361 361 361 361 ...

# Keep only reviews dated within calendar year 2017, matching the "2017" plot
# titles: 1 January is included (>=) and anything from 2018 onwards is
# excluded. filter() is namespaced because several attached packages mask
# functions of that name.
data2017 <- data %>%
  dplyr::filter(
    review_date >= as.Date("2017-01-01"),
    review_date < as.Date("2018-01-01")
  )

# boxplot() draws the plot and returns its statistics; the `out` element holds
# the values lying beyond the whiskers.
price_boxplt <- boxplot(data2017$price)

# Remove the rows whose price is one of those boxplot outliers.
data_clean <- subset(data2017, !(price %in% price_boxplt$out))

# Price distribution: density-scaled histogram with a kernel density overlay
# and a rug of the individual observations. The histogram bin width comes
# from KernSmooth::dpih().
p1 <- ggplot(data_clean, aes(x = price)) +
  theme_minimal() +
  geom_histogram(
    aes(x = price, y = ..density..),
    binwidth = dpih(data_clean$price),
    color = "white"
  ) +
  geom_density(fill = "dodgerblue", color = "white", alpha = 0.5) +
  geom_rug() +
  labs(title = "Airbnb price in WC in 2017", x = "Price") +
  # Scale limits remove anything outside 0-300 before the layers are drawn.
  scale_x_continuous(limits = c(0, 300))

# Single box plot of price (constant x so all rows fall in one box), using
# the same 0-300 limits as the histogram.
p2 <- ggplot(data_clean, aes(x = factor(1), y = price)) +
  geom_boxplot(width = 0.5) +
  scale_y_continuous(limits = c(0, 300))

# Stack the histogram above the box plot in one column. The box plot is
# flipped to run horizontally and its y-axis title, labels and ticks are
# hidden; the top panel gets twice the height of the bottom one.
plot_grid(
  p1,
  p2 +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    ),
  ncol = 1,
  align = "v",
  rel_heights = c(2, 1)
)

## Warning: Removed 1 rows containing missing values (geom_bar).

# Interactive box plot of price, one box per room type.
p3 <- plot_ly(
  data_clean,
  y = ~price,
  color = ~room_type,
  type = "box"
)
p3

# Number of 2017 reviews per neighbourhood, plus each neighbourhood's share
# of all rows in data_clean as a proportion and as a percentage.
data_clean_sum <- data_clean %>%
  group_by(neighbourhood) %>%
  summarise(count = n()) %>%
  mutate(
    Proportion = count / nrow(data_clean),
    Percent = Proportion * 100
  )

# Five neighbourhoods with the highest counts. Ranking in descending order
# keeps the selection independent of how many neighbourhoods are present;
# min_rank() gives tied counts the same rank, so a tie at fifth place keeps
# every tied row.
data_clean_sum_top5 <- data_clean_sum %>%
  dplyr::filter(min_rank(desc(count)) <= 5)

# Pie chart of the top-5 neighbourhoods by percentage; axis grid lines, zero
# lines and tick labels are switched off in the layout.
p4 <- data_clean_sum_top5 %>%
  plot_ly(labels = ~neighbourhood, values = ~Percent, type = "pie") %>%
  layout(
    title = "Top 5 Airbnb Location/Area in WC 2017",
    xaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE
    ),
    yaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE
    )
  )
p4

# Donut chart (pie trace with a 0.6 hole) of every neighbourhood's percentage;
# axis grid lines, zero lines and tick labels are switched off in the layout.
p5 <- data_clean_sum %>%
  plot_ly(labels = ~neighbourhood, values = ~Percent) %>%
  add_pie(hole = 0.6) %>%
  layout(
    title = "Percentage of Airbnb Location/Area in WC 2017",
    xaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE
    ),
    yaxis = list(
      showgrid = FALSE,
      zeroline = FALSE,
      showticklabels = FALSE
    )
  )
p5

MATH2270/MATH2237 Assignment 3

Interactive Storytelling

Student Details

Story URL

Visualisation URL

DATA Source

Code