Load packages and data

library(tidyverse)
library(dsbox)

Exercises

Exercise 1

# Exercise 1: preview every column (type and first values) of the listings data.
edibnb %>%
  glimpse()
## Rows: 13,245
## Columns: 10
## $ id                   <dbl> 15420, 24288, 38628, 44552, 47616, 48645, 51505, …
## $ price                <dbl> 80, 115, 46, 32, 100, 71, 175, 150, 139, 190, 85,…
## $ neighbourhood        <chr> "New Town", "Southside", NA, "Leith", "Southside"…
## $ accommodates         <dbl> 2, 4, 2, 2, 2, 3, 5, 5, 6, 10, 2, 4, 3, 2, 2, 4, …
## $ bathrooms            <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0,…
## $ bedrooms             <dbl> 1, 2, 0, 1, 1, 1, 2, 3, 4, 4, 1, 1, 1, 1, 1, 2, 1…
## $ beds                 <dbl> 1, 2, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1, 1, 1, 1, 2, 1…
## $ review_scores_rating <dbl> 99, 92, 94, 93, 98, 97, 100, 92, 96, 99, 77, 98, …
## $ number_of_reviews    <dbl> 283, 199, 52, 184, 32, 762, 7, 28, 222, 142, 14, …
## $ listing_url          <chr> "https://www.airbnb.com/rooms/15420", "https://ww…
# Report the number of rows in the data set.
cat("There are", nrow(edibnb), "observations in dataset.\n")
## There are 13245 observations in dataset.

The dataset has 13,245 observations.

Exercise 2

# Exercise 2: inspect the data in the viewer, then list its column names.
view(edibnb)
colnames(edibnb)
##  [1] "id"                   "price"                "neighbourhood"       
##  [4] "accommodates"         "bathrooms"            "bedrooms"            
##  [7] "beds"                 "review_scores_rating" "number_of_reviews"   
## [10] "listing_url"
# State what a single row of the data stands for.
cat("Each row represents an one Airbnb listing in Edinburgh.\n ")
## Each row represents an one Airbnb listing in Edinburgh.
## 

Exercise 3

# Exercise 3: histogram of listing prices, one panel per neighbourhood.
# scales = "free_y" gives every panel its own count axis, so neighbourhoods
# with few listings are not flattened by the busiest ones.
edibnb %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 10, colour = "black", na.rm = TRUE) +
  facet_wrap(vars(neighbourhood), scales = "free_y", ncol = 4)

# Printed note explaining why the panels are laid out in four columns.
cat("由於圖表有13個,選成分割成四欄的子圖表,相較全部變成一欄或一列較好觀察各街區差異")
## 由於圖表有13個,選成分割成四欄的子圖表,相較全部變成一欄或一列較好觀察各街區差異

Exercise 4

第一管:Use a single pipeline to identify the neighbourhoods with the top five median listing prices. 第二管(先 filter 五街區,然後再畫山嶺圖):In another pipeline filter the data for these five neighbourhoods and make ridge plots of the distributions of listing prices in these five neighbourhoods. 第三管(敘述統計):In a third pipeline calculate the minimum, mean, median, standard deviation, IQR, and maximum listing price in each of these neighbourhoods. Use the visualisation and the summary statistics to describe the distribution of listing prices in the neighbourhoods. (Your answer will include three pipelines, one of which ends in a visualisation, and a narrative.)

# Pipeline 1: the five neighbourhoods with the highest median listing price.
# Rows missing a neighbourhood or a price are dropped before summarising.
top5med_price <- edibnb %>%
  drop_na(neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarise(median_price = median(price, na.rm = TRUE)) %>%
  arrange(desc(median_price)) %>%
  # slice_max(median_price, n = 5) was considered instead, but rejected
  # because of how it treats tied medians.
  slice_head(n = 5)
top5med_price
## # A tibble: 5 × 2
##   neighbourhood median_price
##   <chr>                <dbl>
## 1 New Town               100
## 2 Old Town                90
## 3 West End                90
## 4 Stockbridge             85
## 5 Bruntsfield             80
# Pipeline 2 (data step): keep the listings that have a price and belong to one
# of the top-five neighbourhoods; %in% tests membership in that set of names.
top5_ridge_plot <- edibnb %>%
  drop_na(price) %>%
  filter(neighbourhood %in% pull(top5med_price, neighbourhood))
top5_ridge_plot
## # A tibble: 3,936 × 10
##        id price neighbourhood accommodates bathrooms bedrooms  beds
##     <dbl> <dbl> <chr>                <dbl>     <dbl>    <dbl> <dbl>
##  1  15420    80 New Town                 2         1        1     1
##  2  48645    71 Old Town                 3         1        1     2
##  3  51505   175 New Town                 5         1        2     3
##  4  54188   150 West End                 5         1        3     4
##  5  58682   190 West End                10         2        4     7
##  6  67706    85 New Town                 2         1        1     1
##  7 100285   120 New Town                 2         1        1     1
##  8 131342    80 West End                 4         1        1     1
##  9 138010    80 Old Town                 4         1        1     1
## 10 170756    55 West End                 2         2        1     1
## # ℹ 3,926 more rows
## # ℹ 3 more variables: review_scores_rating <dbl>, number_of_reviews <dbl>,
## #   listing_url <chr>
library(ggridges)
# Ridge plot of listing prices: the y axis carries the neighbourhood, and each
# density ridge is filled with its own colour.
top5_ridge_plot %>%
  ggplot(aes(x = price, y = neighbourhood, fill = neighbourhood)) +
  # A translucent fill makes overlapping ridges easier to compare.
  geom_density_ridges(alpha = 0.3) +
  labs(
    title = "Price Distribution of Top 5 Neighbourhoods",
    x = "Price(£) ",
    y = "Neighbourhood"
  ) +
  # Drop the legend for a cleaner picture (the y axis already names each ridge).
  theme(legend.position = "none")
## Picking joint bandwidth of 13.8

# Ridge plot of listing prices in the top-five neighbourhoods, with the ridges
# ordered by median price so that the most expensive neighbourhood is on top.
edibnb %>%
  # Keep only the top-five neighbourhoods and drop rows with a missing price.
  filter(
    neighbourhood %in% top5med_price$neighbourhood,
    !is.na(price)
  ) %>%
  # fct_reorder() sorts the levels by ascending median price, and the last
  # level is drawn at the top of the y axis. The earlier fct_rev(factor(...))
  # only reversed the alphabetical order, which did not reflect price.
  ggplot(aes(
    x = price,
    y = fct_reorder(neighbourhood, price, .fun = median),
    fill = neighbourhood
  )) +
  # alpha = 0.6 keeps the fill translucent; scale controls the ridge height.
  geom_density_ridges(alpha = 0.6, scale = 0.8) +
  labs(
    title = "Distribution of Airbnb Prices in Top 5 Neighbourhoods",
    x = "Price (£)",
    y = "Neighbourhood"
  ) +
  theme_minimal() +
  # Drop the legend for a cleaner picture (the y axis already names each ridge).
  theme(legend.position = "none")
## Picking joint bandwidth of 13.8

# Pipeline 3: descriptive statistics of listing price for each of the top-five
# neighbourhoods (minimum, mean, median, standard deviation, IQR, maximum).
top5_stats <- edibnb %>%
  # Same filter as before: top-five neighbourhoods only, missing prices removed.
  filter(
    neighbourhood %in% top5med_price$neighbourhood,
    !is.na(price)
  ) %>%
  group_by(neighbourhood) %>%
  # Missing prices are already filtered out, so no na.rm is needed. A stray
  # `na.rm = TRUE` used to follow max(price) as a separate summarise() argument,
  # which created an unwanted logical column named `na.rm`; it is removed here.
  summarise(
    min_price = min(price),
    mean_price = mean(price),
    median_price = median(price),
    sd_price = sd(price),
    IQR_price = IQR(price),
    max_price = max(price)
  )
top5_stats
## # A tibble: 5 × 7
##   neighbourhood min_price mean_price median_price sd_price IQR_price max_price
##   <chr>             <dbl>      <dbl>        <dbl>    <dbl>     <dbl>     <dbl>
## 1 Bruntsfield          10       99.4           80     90.2      72.5       900
## 2 New Town             12      136.           100    109.       86.5       999
## 3 Old Town             15      128.            90    110.       76         999
## 4 Stockbridge          21      104.            85     77.6      66         750
## 5 West End             19      116.            90     93.3      80         999
# Printed narrative comparing the five neighbourhoods on the statistics above.
cat("這前五名中位數價格的街區中,Bruntsfield有最便宜的價格,均價也是最低。New Town平均和中位數價格格皆最貴。標準差方面,Old Town的離散程度最大、Stockbridge最小。而四分位距差方面,Stockbridge差距最小、New Town差距最大")
## 這前五名中位數價格的街區中,Bruntsfield有最便宜的價格,均價也是最低。New Town平均和中位數價格格皆最貴。標準差方面,Old Town的離散程度最大、Stockbridge最小。而四分位距差方面,Stockbridge差距最小、New Town差距最大

Exercise 5

Create a visualization that will help you compare the distribution of review scores (review_scores_rating) across neighbourhoods. 建立一個視覺化圖表來幫助你比較不同街區的評分分佈

You get to decide what type of visualisation to create and there is more than one correct answer! In your answer, include a brief interpretation of how Airbnb guests rate properties in general and how the neighbourhoods compare to each other in terms of their ratings. 請簡單解釋 Airbnb 房客對房源的整體評價,以及不同社區的評分比較情況。

# Ridge plot of review scores: the y axis carries the neighbourhood, and each
# density ridge is filled with its own colour.
top5_ridge_plot %>%
  ggplot(aes(
    x = review_scores_rating,
    y = neighbourhood,
    fill = neighbourhood
  )) +
  # A translucent fill makes overlapping ridges easier to compare.
  geom_density_ridges(alpha = 0.3) +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Review Score ",
    y = "Neighbourhood"
  )
## Picking joint bandwidth of 1.06
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

# Density plot: review-score densities of the five neighbourhoods overlaid on
# one panel, told apart by fill colour.
top5_ridge_plot %>%
  ggplot(aes(x = review_scores_rating, fill = neighbourhood)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(
    title = "Density of Review Scores by Neighbourhood",
    x = "Review Score",
    y = "Density"
  )
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_density()`).

 # Histogram: review scores in bins of width 5; position = "identity" overlays
 # the neighbourhoods on the same bars instead of stacking them.
top5_ridge_plot %>%
  ggplot(aes(x = review_scores_rating, fill = neighbourhood)) +
  geom_histogram(
    position = "identity",
    binwidth = 5,
    alpha = 0.4,
    colour = "white"
  ) +
  theme_minimal() +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Review Score",
    y = "Count"
  )
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Box plot: one box of review scores per neighbourhood.
# The neighbourhood is mapped to x and the score to y directly. The earlier
# version mapped them the other way round and added coord_flip(), which left
# the "Neighbourhood" title on the score axis and "Review Score" on the
# neighbourhood axis. The picture is otherwise unchanged.
ggplot(
  top5_ridge_plot,
  aes(x = neighbourhood, y = review_scores_rating, fill = neighbourhood)
) +
  geom_boxplot(alpha = 0.6, outlier.colour = "gray30") +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Neighbourhood",
    y = "Review Score"
  ) +
  theme_minimal() +
  # Drop the legend; the x axis already names each box.
  theme(legend.position = "none")
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Printed interpretation of the four review-score charts.
cat("比對下來,覺得箱型圖較能更清楚比較各地區的中位數與變異數。而山脊圖與直方圖不適合是因為評分高度集中在接近100,圖形會重疊,難以呈現差異。房客普遍的評分都給予蠻高的分數,中位數都有到90分以上, 中位數: Bruntsfield最高/Old town 最低
   outlier:Stockbridge最少")
## 比對下來,覺得箱型圖較能更清楚比較各地區的中位數與變異數。而山脊圖與直方圖不適合是因為評分高度集中在接近100,圖形會重疊,難以呈現差異。房客普遍的評分都給予蠻高的分數,中位數都有到90分以上, 中位數: Bruntsfield最高/Old town 最低
##    outlier:Stockbridge最少