library(tidyverse)
library(dsbox)
# Print a compact overview of the listings data: row/column counts plus the
# type and first few values of every column.
dplyr::glimpse(edibnb)
## Rows: 13,245
## Columns: 10
## $ id <dbl> 15420, 24288, 38628, 44552, 47616, 48645, 51505, …
## $ price <dbl> 80, 115, 46, 32, 100, 71, 175, 150, 139, 190, 85,…
## $ neighbourhood <chr> "New Town", "Southside", NA, "Leith", "Southside"…
## $ accommodates <dbl> 2, 4, 2, 2, 2, 3, 5, 5, 6, 10, 2, 4, 3, 2, 2, 4, …
## $ bathrooms <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0,…
## $ bedrooms <dbl> 1, 2, 0, 1, 1, 1, 2, 3, 4, 4, 1, 1, 1, 1, 1, 2, 1…
## $ beds <dbl> 1, 2, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1, 1, 1, 1, 2, 1…
## $ review_scores_rating <dbl> 99, 92, 94, 93, 98, 97, 100, 92, 96, 99, 77, 98, …
## $ number_of_reviews <dbl> 283, 199, 52, 184, 32, 762, 7, 28, 222, 142, 14, …
## $ listing_url <chr> "https://www.airbnb.com/rooms/15420", "https://ww…
# Report how many rows (observations) the dataset contains.
cat("There are", nrow(edibnb), "observations in dataset.\n")
## There are 13245 observations in dataset.
The dataset has 13,245 observations.
# Open the data in the viewer for manual inspection.
view(edibnb)
# List the column names of the dataset.
colnames(edibnb)
## [1] "id" "price" "neighbourhood"
## [4] "accommodates" "bathrooms" "bedrooms"
## [7] "beds" "review_scores_rating" "number_of_reviews"
## [10] "listing_url"
# State the unit of observation. The message previously read "an one" and
# ended in "\n ", which printed a stray blank line after the sentence.
cat("Each row represents one Airbnb listing in Edinburgh.\n")
## Each row represents one Airbnb listing in Edinburgh.
# Faceted histogram of listing prices, one panel per neighbourhood.
edibnb %>%
  # Listings with a missing neighbourhood would otherwise be drawn in their
  # own "NA" panel; drop them so every facet is a real neighbourhood.
  filter(!is.na(neighbourhood)) %>%
  ggplot(aes(x = price)) +
  # na.rm = TRUE silently drops listings without a price.
  geom_histogram(binwidth = 10, color = "black", na.rm = TRUE) +
  # Free y scales keep neighbourhoods with few listings from being flattened
  # by the busy ones, so the shape of each distribution stays readable.
  # Four columns are used so the panels form a compact grid instead of a
  # single long row or column.
  facet_wrap(~neighbourhood, scales = "free_y", ncol = 4)
cat("由於圖表有13個,選成分割成四欄的子圖表,相較全部變成一欄或一列較好觀察各街區差異")
## 由於圖表有13個,選成分割成四欄的子圖表,相較全部變成一欄或一列較好觀察各街區差異
第一管:Use a single pipeline to identify the neighbourhoods with the top five median listing prices. 第二管(先 filter 五街區,然後再畫山嶺圖):In another pipeline, filter the data for these five neighbourhoods and make ridge plots of the distributions of listing prices in these five neighbourhoods. 第三管(敘述統計):In a third pipeline, calculate the minimum, mean, median, standard deviation, IQR, and maximum listing price in each of these neighbourhoods. Use the visualisation and the summary statistics to describe the distribution of listing prices in the neighbourhoods. (Your answer will include three pipelines, one of which ends in a visualisation, and a narrative.)
# Pipeline 1: find the five neighbourhoods with the highest median listing
# price. Rows missing either the neighbourhood or the price are discarded
# first, so the median needs no further NA handling.
top5med_price <- edibnb %>%
  drop_na(neighbourhood, price) %>%
  group_by(neighbourhood) %>%
  summarise(median_price = median(price)) %>%
  arrange(desc(median_price)) %>%
  # slice_head() always returns exactly five rows. slice_max(median_price,
  # n = 5) was considered, but it keeps ties by default and could therefore
  # return more than five neighbourhoods.
  slice_head(n = 5)
top5med_price
## # A tibble: 5 × 2
## neighbourhood median_price
## <chr> <dbl>
## 1 New Town 100
## 2 Old Town 90
## 3 West End 90
## 4 Stockbridge 85
## 5 Bruntsfield 80
# Listings that belong to the five top-median-price neighbourhoods and have a
# known price. Despite its name, this object holds the filtered data that the
# plots below are drawn from, not a plot.
top5_ridge_plot <- edibnb %>%
  # %in% keeps a row only when its neighbourhood is one of the five names in
  # top5med_price.
  filter(neighbourhood %in% top5med_price$neighbourhood) %>%
  filter(!is.na(price))
top5_ridge_plot
## # A tibble: 3,936 × 10
## id price neighbourhood accommodates bathrooms bedrooms beds
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 15420 80 New Town 2 1 1 1
## 2 48645 71 Old Town 3 1 1 2
## 3 51505 175 New Town 5 1 2 3
## 4 54188 150 West End 5 1 3 4
## 5 58682 190 West End 10 2 4 7
## 6 67706 85 New Town 2 1 1 1
## 7 100285 120 New Town 2 1 1 1
## 8 131342 80 West End 4 1 1 1
## 9 138010 80 Old Town 4 1 1 1
## 10 170756 55 West End 2 2 1 1
## # ℹ 3,926 more rows
## # ℹ 3 more variables: review_scores_rating <dbl>, number_of_reviews <dbl>,
## # listing_url <chr>
library(ggridges)
# Ridge plot of listing prices: one density ridge per neighbourhood on the
# y axis, each filled with its own colour.
top5_ridge_plot %>%
  ggplot(aes(x = price, y = neighbourhood, fill = neighbourhood)) +
  # A fairly transparent fill keeps overlapping ridges comparable.
  geom_density_ridges(alpha = 0.3) +
  labs(
    title = "Price Distribution of Top 5 Neighbourhoods",
    x = "Price(£) ",
    y = "Neighbourhood"
  ) +
  # The y axis already names every neighbourhood, so the legend is dropped
  # for a cleaner figure.
  theme(legend.position = "none")
## Picking joint bandwidth of 13.8
# Pipeline 2: filter to the five top-median-price neighbourhoods and end in a
# ridge plot of their listing-price distributions.
edibnb %>%
  # Keep only the top-five neighbourhoods and drop listings without a price.
  filter(
    neighbourhood %in% top5med_price$neighbourhood,
    !is.na(price)
  ) %>%
  ggplot(aes(
    x = price,
    # Order the ridges by median price. Factor levels are drawn bottom to top,
    # so the neighbourhood with the highest median ends up at the top.
    # (Reversing the alphabetical order, as before, did not achieve this.)
    y = fct_reorder(neighbourhood, price, .fun = median),
    fill = neighbourhood
  )) +
  # alpha makes the fill semi-transparent; scale < 1 shrinks the ridges so
  # neighbouring ridges do not overlap.
  geom_density_ridges(alpha = 0.6, scale = 0.8) +
  labs(
    title = "Distribution of Airbnb Prices in Top 5 Neighbourhoods",
    x = "Price (£)",
    y = "Neighbourhood"
  ) +
  theme_minimal() +
  # The y axis already names every neighbourhood, so the legend is removed.
  theme(legend.position = "none")
## Picking joint bandwidth of 13.8
# Pipeline 3: summary statistics of listing price for each of the five
# top-median-price neighbourhoods.
top5_stats <- edibnb %>%
  # Same subset as the ridge plot: top-five neighbourhoods, known prices only.
  filter(
    neighbourhood %in% top5med_price$neighbourhood,
    !is.na(price)
  ) %>%
  group_by(neighbourhood) %>%
  # Missing prices are already filtered out, so no na.rm is needed. Passing
  # na.rm = TRUE to summarise() itself (as before) created an extra logical
  # column named "na.rm" instead of affecting any statistic.
  summarise(
    min_price = min(price),
    mean_price = mean(price),
    median_price = median(price),
    sd_price = sd(price),
    IQR_price = IQR(price),
    max_price = max(price)
  )
top5_stats
## # A tibble: 5 × 7
## neighbourhood min_price mean_price median_price sd_price IQR_price max_price
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bruntsfield 10 99.4 80 90.2 72.5 900
## 2 New Town 12 136. 100 109. 86.5 999
## 3 Old Town 15 128. 90 110. 76 999
## 4 Stockbridge 21 104. 85 77.6 66 750
## 5 West End 19 116. 90 93.3 80 999
# Print the written interpretation of the price statistics (in Chinese).
# Summary of the text: among the five neighbourhoods with the highest median
# price, Bruntsfield has the cheapest listing and the lowest mean price; New
# Town has the highest mean and median; Old Town has the largest standard
# deviation and Stockbridge the smallest; the IQR is smallest in Stockbridge
# and largest in New Town.
cat("這前五名中位數價格的街區中,Bruntsfield有最便宜的價格,均價也是最低。New Town平均和中位數價格格皆最貴。標準差方面,Old Town的離散程度最大、Stockbridge最小。而四分位距差方面,Stockbridge差距最小、New Town差距最大")
## 這前五名中位數價格的街區中,Bruntsfield有最便宜的價格,均價也是最低。New Town平均和中位數價格格皆最貴。標準差方面,Old Town的離散程度最大、Stockbridge最小。而四分位距差方面,Stockbridge差距最小、New Town差距最大
Create a visualization that will help you compare the distribution of review scores (review_scores_rating) across neighbourhoods. 建立一個視覺化圖表來幫助你比較不同街區的評分分佈。
You get to decide what type of visualisation to create and there is more than one correct answer! In your answer, include a brief interpretation of how Airbnb guests rate properties in general and how the neighbourhoods compare to each other in terms of their ratings. 請簡單解釋 Airbnb 房客對房源的整體評價,以及不同社區的評分比較情況。
# Ridge plot of review scores: one density ridge per neighbourhood on the
# y axis, each filled with its own colour.
top5_ridge_plot %>%
  ggplot(aes(
    x = review_scores_rating,
    y = neighbourhood,
    fill = neighbourhood
  )) +
  # A fairly transparent fill keeps overlapping ridges comparable.
  geom_density_ridges(alpha = 0.3) +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Review Score ",
    y = "Neighbourhood"
  )
## Picking joint bandwidth of 1.06
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
# Density plot of review scores: the neighbourhoods' curves are overlaid on
# shared axes and told apart by fill colour.
top5_ridge_plot %>%
  ggplot(aes(x = review_scores_rating, fill = neighbourhood)) +
  geom_density(alpha = 0.3) +
  labs(
    title = "Density of Review Scores by Neighbourhood",
    x = "Review Score",
    y = "Density"
  ) +
  theme_minimal()
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_density()`).
# Histogram of review scores in 5-point bins. position = "identity" overlays
# the neighbourhoods' bars instead of stacking them, hence the low alpha.
top5_ridge_plot %>%
  ggplot(aes(x = review_scores_rating, fill = neighbourhood)) +
  geom_histogram(
    binwidth = 5,
    color = "white",
    alpha = 0.4,
    position = "identity"
  ) +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Review Score",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Box plot of review scores, one box per neighbourhood.
# The neighbourhood is mapped to x and the score to y directly. The earlier
# version mapped them the other way round and then applied coord_flip(), but
# labs() names aesthetics rather than screen positions, so the axis showing
# scores was titled "Neighbourhood" and vice versa.
ggplot(
  top5_ridge_plot,
  aes(x = neighbourhood, y = review_scores_rating, fill = neighbourhood)
) +
  geom_boxplot(alpha = 0.6, outlier.color = "gray30") +
  labs(
    title = "Review Score Distribution of Top 5 Neighbourhoods",
    x = "Neighbourhood",
    y = "Review Score"
  ) +
  theme_minimal() +
  # The x axis already names every neighbourhood, so the legend is removed.
  theme(legend.position = "none")
## Warning: Removed 549 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Print the written interpretation of the review-score plots (in Chinese).
# Summary of the text: the box plot is judged the clearest way to compare the
# neighbourhoods' medians and spread; the ridge plot and histogram are less
# suitable because scores cluster near 100 and the shapes overlap; guests
# rate highly overall, with every median above 90; Bruntsfield has the
# highest median and Old Town the lowest; Stockbridge has the fewest outliers.
cat("比對下來,覺得箱型圖較能更清楚比較各地區的中位數與變異數。而山脊圖與直方圖不適合是因為評分高度集中在接近100,圖形會重疊,難以呈現差異。房客普遍的評分都給予蠻高的分數,中位數都有到90分以上, 中位數: Bruntsfield最高/Old town 最低
outlier:Stockbridge最少")
## 比對下來,覺得箱型圖較能更清楚比較各地區的中位數與變異數。而山脊圖與直方圖不適合是因為評分高度集中在接近100,圖形會重疊,難以呈現差異。房客普遍的評分都給予蠻高的分數,中位數都有到90分以上, 中位數: Bruntsfield最高/Old town 最低
## outlier:Stockbridge最少