Introduction

The flats data set contains the specifications of flats in the 23 districts of Budapest, including street address, area, number of rooms, condition, price, floor number and so on. As an example, the following summary table shows the characteristics of the Price, Area and District columns.

library(data.table)
library(ggplot2)
library(modelsummary)
library(kableExtra)
library(dplyr)
library(ggrepel)
library(tidyselect)
library(ggplot2)
library(ggmap)
library(tidygeocoder)
library(scatterpie)

# Read the serialized flats data and turn it into a data.table.
df <- readRDS('flats.rds') %>%
  data.table()
# 95th percentile of `x`, ignoring missing values.
# Returns the (named) result of quantile(). `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
P95 <- function(x) {
  quantile(x, 0.95, na.rm = TRUE)
}
# 5th percentile of `x`, ignoring missing values.
# Returns the (named) result of quantile(). `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
P05 <- function(x) {
  quantile(x, 0.05, na.rm = TRUE)
}
# Descriptive statistics (mean, median, sd, min, max, 5th and 95th percentile)
# for Price, Area and District, styled for LaTeX output.
descriptive_table <- datasummary(
  Price + Area + District ~ Mean + Median + SD + Min + Max + P05 + P95,
  data = df,
  title = 'Descriptive statistics'
)
kable_styling(
  descriptive_table,
  latex_options = c("HOLD_position", "scale_down")
)
Descriptive statistics
Mean Median SD Min Max P05 P95
Price 208621.50 150000.00 152194.83 28000.00 1e+06 87950.00 544050.00
Area 60.34 53.00 30.36 5.00 200.00 26.00 120.00
District 8.41 8.00 4.28 1.00 23.00 2.00 14.00

Task 1

# Histogram of flat areas, binned in steps of 5.
ggplot(data = df) +
  geom_histogram(
    mapping = aes(x = Area),
    binwidth = 5,
    fill = "#006D77",
    color = "white"
  ) +
  labs(title = "Distribution of the Area of flats (m2)") +
  theme_bw()

Task 2

# Price density curves, one per flat condition. Rows without a recorded
# Condition are excluded.
ggplot(
  df[!is.na(Condition), ],
  aes(x = Price, group = Condition, fill = Condition)
) +
  geom_density(alpha = 0.25) +
  theme_bw() +
  ggtitle("Price Distribution for flats in different conditions") +
  theme(legend.position = "top") +
  guides(fill = guide_legend(nrow = 1)) +
  # `labels` is spelled out in full; `label` only worked through partial
  # argument matching.
  scale_x_continuous(
    labels = scales::dollar_format(prefix = "", suffix = " Ft")
  )

Task 3

# Price against area, coloured by condition, with one linear fit per
# condition. Rows without a recorded Condition are excluded.
ggplot(
  df[!is.na(Condition), ],
  aes(x = Area, y = Price, color = Condition)
) +
  geom_point(alpha = 0.3) +
  theme_bw() +
  ggtitle("How the condition of the flats effects price to area") +
  geom_smooth(aes(color = factor(Condition)), method = "lm", se = FALSE) +
  # Single legend title for both the point and the smoother colour mapping.
  labs(color = "Condition") +
  # `labels` is spelled out in full; `label` only worked through partial
  # argument matching.
  scale_x_continuous(
    labels = scales::dollar_format(prefix = "", suffix = " m2")
  ) +
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "", suffix = " Ft")
  ) +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 1))

Task 4

# Mean price per district.
avg_price_dis <- df[, .(avg_price = mean(Price)), by = District]

# Bar chart of the per-district mean price.
ggplot(avg_price_dis, aes(x = factor(District), y = avg_price)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity"). The
  # former `size = 0.5` is dropped: no outline colour is set, so no outline
  # was drawn and the setting had no visible effect.
  geom_col(fill = "#006D77") +
  theme_bw() +
  labs(y = "Average Price", x = "District") +
  # `labels` is spelled out in full; `label` only worked through partial
  # argument matching.
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "", suffix = " Ft")
  )

Task 5

# Violin plot of the price distribution within each district.
ggplot(df, aes(x = factor(District), y = Price)) +
  geom_violin(color = "#006D77", fill = "#66B7B0") +
  theme_bw() +
  labs(x = "District", y = "Price") +
  # `labels` is spelled out in full; `label` only worked through partial
  # argument matching.
  scale_y_continuous(
    labels = scales::dollar_format(prefix = "", suffix = " Ft")
  )

Task 6

Computing the mean and sd of all numeric variables grouped by District:

# Per-district mean and standard deviation of every numeric column.
# `na.rm = TRUE` is kept exactly where the original passed it.
# Note: District is the grouping key, so District.mean simply repeats the
# district number and District.sd is 0 for every group (see the output below).
Districts <- df %>%
  group_by(District) %>%
  summarise(
    District.mean = mean(District), # was misspelled `Distirct.mean`
    District.sd = sd(District),
    Area.mean = mean(Area),
    Area.sd = sd(Area),
    Num_whole_rooms.mean = mean(Num_whole_rooms),
    Num_whole_rooms.sd = sd(Num_whole_rooms),
    Num_half_rooms.mean = mean(Num_half_rooms, na.rm = TRUE),
    Num_half_rooms.sd = sd(Num_half_rooms, na.rm = TRUE),
    Price.mean = mean(Price),
    Price.sd = sd(Price),
    Floor.mean = mean(Floor, na.rm = TRUE),
    Floor.sd = sd(Floor, na.rm = TRUE),
    Floors_in_bdg.mean = mean(Floors_in_bdg, na.rm = TRUE),
    Floors_in_bdg.sd = sd(Floors_in_bdg, na.rm = TRUE),
    Overhead.mean = mean(Overhead, na.rm = TRUE),
    Overhead.sd = sd(Overhead, na.rm = TRUE),
    # The source column name contains parentheses, hence the backticks.
    Parking_fee_monthly.mean = mean(`Parking_fee(monthly)`, na.rm = TRUE),
    Parking_fee_monthly.sd = sd(`Parking_fee(monthly)`, na.rm = TRUE)
  )
str(Districts)
## tibble [23 × 19] (S3: tbl_df/tbl/data.frame)
##  $ District                : num [1:23] 1 2 3 4 5 6 7 8 9 10 ...
##  $ District.mean           : num [1:23] 1 2 3 4 5 6 7 8 9 10 ...
##  $ District.sd             : num [1:23] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Area.mean               : num [1:23] 64.8 83.2 61.2 47.1 73.9 ...
##  $ Area.sd                 : num [1:23] 30.7 38.4 30.1 19.9 32.1 ...
##  $ Num_whole_rooms.mean    : num [1:23] 2 2.62 2.05 1.48 2.27 ...
##  $ Num_whole_rooms.sd      : num [1:23] 0.921 1.195 1.049 0.653 0.992 ...
##  $ Num_half_rooms.mean     : num [1:23] 1.11 1.08 1.32 1.19 1.11 ...
##  $ Num_half_rooms.sd       : num [1:23] 0.346 0.316 0.522 0.422 0.332 ...
##  $ Price.mean              : num [1:23] 262886 354291 203518 134079 297399 ...
##  $ Price.sd                : num [1:23] 178666 227459 132615 47520 194448 ...
##  $ Floor.mean              : num [1:23] 2.27 2.22 2.98 3.53 2.72 ...
##  $ Floor.sd                : num [1:23] 1.46 1.58 2.36 2.42 1.47 ...
##  $ Floors_in_bdg.mean      : num [1:23] 4.15 3.8 4.87 6 4.61 ...
##  $ Floors_in_bdg.sd        : num [1:23] 1.4 1.63 2.93 3.04 1.17 ...
##  $ Overhead.mean           : num [1:23] 28839 30397 26170 24342 30995 ...
##  $ Overhead.sd             : num [1:23] 17414 23015 15015 10928 17987 ...
##  $ Parking_fee_monthly.mean: num [1:23] 670368 551245 612206 702091 20521 ...
##  $ Parking_fee_monthly.sd  : num [1:23] 4420566 3218645 3425745 3193486 27915 ...

Applying MDS on this dataset and visualizing the similarities of the Budapest districts:

# Only genuine flat characteristics enter the distance matrix. Every column
# whose name starts with "Dist" is derived from the district number itself:
# the grouping key, its per-group mean (identical to the key) and its
# per-group sd (always 0, which scale() turns into NaN). Keeping them would
# make districts look similar merely because their numbers are close.
district_features <- dplyr::select(Districts, -dplyr::starts_with("Dist"))

# Classical MDS on the standardized features; X1/X2 are the 2-D coordinates.
mds <- data.frame(cmdscale(dist(scale(district_features))))

# Label each point with its actual district number rather than the row index,
# which only matches by coincidence of ordering.
mds$districts <- as.character(Districts$District)

ggplot(mds, aes(X1, X2, label = districts)) +
  labs(title = "Budapest Districts") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_label_repel()

Bonus: Task 1

Geocoding the 23 districts of Budapest and showing them on a map:

# Number of flats per district, plus a free-text address built from the
# Roman-numeral district number for the geocoder.
Budapest <- df[, .(N = .N), by = District][
  , address := paste0('Budapest ', as.roman(District), ' district')
]

# Look up coordinates for each address; keep the result as a data.table.
Budapest <- Budapest %>%
  geocode('address') %>%
  data.table()


# Download the background map for a bounding box slightly larger than the
# extent of the geocoded districts.
# Fix: the closing parenthesis of c() used to sit before `* 1.001`, so the
# factor multiplied the whole bounding-box vector instead of only `top`.
# `na.rm = TRUE` keeps the box usable if a district failed to geocode.
# NOTE(review): recent ggmap releases reportedly replace get_stamenmap() with
# get_stadiamap(), which needs an API key. Confirm against the installed
# ggmap version.
bp <- get_stamenmap(
  c(
    left = min(Budapest$long, na.rm = TRUE) * 0.995,
    right = max(Budapest$long, na.rm = TRUE) * 1.001,
    bottom = min(Budapest$lat, na.rm = TRUE) * 0.999,
    top = max(Budapest$lat, na.rm = TRUE) * 1.001
  ),
  maptype = 'toner-2011',
  zoom = 12
)

# One orange point per district on the base map, sized by the number of
# flats; the size legend is hidden.
ggmap(bp) +
  geom_point(
    data = Budapest,
    mapping = aes(x = long, y = lat, size = N),
    color = 'orange',
    shape = 19
  ) +
  theme_void() +
  theme(legend.position = 'none')

Bonus: Task 2

Using the location data from above, I now place small pie charts (!) on the map instead of points, using the scatterpie package, to show the distribution of comfort levels in each district:

# Count flats per district and comfort level (rows without a comfort level
# are excluded).
Buda_dists <- df[!is.na(Comfort_lev), .N, by = .(District, Comfort_lev)]

# One column per comfort level. `value.var` is explicit, so dcast() does not
# have to guess it, and `fill = 0L` replaces the separate NA-to-0 step.
Buda_dists <- dcast(
  Buda_dists,
  District ~ Comfort_lev,
  value.var = "N",
  fill = 0L
)

# Reuse the coordinates already geocoded into `Budapest` instead of sending
# the same 23 addresses to the geocoding service a second time.
Buda_dists <- merge(
  Buda_dists,
  Budapest[, .(District, address, lat, long)],
  by = "District"
)

# Hand a plain data.frame to the plotting code.
# NOTE(review): assumes geom_scatterpie() expects standard data.frame column
# subsetting. Confirm.
Buda_dists <- as.data.frame(Buda_dists)

# One pie chart per district on the base map, with slices given by the
# comfort-level count columns; the legend sits on a single row at the top.
ggmap(bp) +
  geom_scatterpie(
    mapping = aes(x = long, y = lat, group = District, r = 0.012),
    data = Buda_dists,
    cols = c("very low", "low", "average", "high", "very high", "luxury"),
    alpha = 0.6,
    color = NA,
    size = 0.4
  ) +
  theme_void() +
  theme(legend.position = 'top') +
  guides(fill = guide_legend(title = 'Comfort level', nrow = 1))