2023-01-15

Homework 1

Alima Dzhanybaeva

Loading the dataset

# Read the serialized flats dataset from the working directory and wrap it
# in a data.table; every task below starts from `df`.
df <- data.table(
  readRDS(file = "flats.rds")
)

Task 1

# Task 1: histogram of flat areas.
# Areas are grouped into bins of width 5 and the bar height is the number of
# flats in each bin. `after_stat(count)` replaces the deprecated `..count..`
# notation; `stat = "bin"` is geom_histogram()'s default, so it is not
# repeated here.
ggplot(df, aes(x = Area, y = after_stat(count))) +
  geom_histogram(binwidth = 5, fill = "#006D77", color = "white") +
  labs(title = "Distribution of the Area of flats (m2)") +
  theme_bw()

Task 2

# Task 2: price density per flat condition.
# Rows without a Condition are dropped first so that NA does not show up as
# its own group in the legend. `df1` is also the input of the Task 3 plot.
df1 <- df %>%
  filter(!is.na(Condition))

ggplot(df1, aes(x = Price, fill = Condition)) +
  # Semi-transparent fills keep overlapping densities readable.
  geom_density(alpha = 0.3) +
  # `labels` is spelled out in full instead of relying on partial argument
  # matching (`label =`).
  scale_x_continuous(
    breaks = c(0, 250000, 500000, 750000, 1000000),
    labels = c("0 Ft", "250,000 Ft", "500,000 Ft",
               "750,000 Ft", "1,000,000 Ft")
  ) +
  theme_bw() +
  theme(legend.position = "top") +
  # Lay the legend keys out on a single row above the panel.
  guides(fill = guide_legend(nrow = 1, byrow = TRUE)) +
  labs(title = "Price distribution for flats in different conditions")

Task 3

# Task 3: price against area, coloured by flat condition.
# One linear trend line (without confidence band) is fitted per condition on
# top of the scatter plot.
ggplot(df1, aes(x = Area, y = Price, color = Condition)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", se = FALSE) +
  # `labels` is spelled out in full instead of relying on partial argument
  # matching (`label =`).
  scale_y_continuous(
    breaks = c(0, 250000, 500000, 750000, 1000000),
    labels = c("0 Ft", "250,000 Ft", "500,000 Ft",
               "750,000 Ft", "1,000,000 Ft")
  ) +
  scale_x_continuous(
    breaks = c(0, 50, 100, 150, 200),
    labels = c("0 m2", "50 m2", "100 m2",
               "150 m2", "200 m2")
  ) +
  theme_bw() +
  theme(legend.position = "bottom") +
  # Lay the legend keys out on a single row below the panel.
  guides(color = guide_legend(nrow = 1, byrow = TRUE)) +
  # Title wording fixed: the verb is "affects", not "effects".
  labs(title = "How the condition of the flats affects price to area")
## `geom_smooth()` using formula = 'y ~ x'

Task 4

# Task 4: average price per district.
# `df2` holds one row per District with the mean Price of its flats.
df2 <- df %>%
  group_by(District) %>%
  summarise(Price = mean(Price))

# The values are already aggregated, so geom_col() (bar height taken directly
# from y) is the right geom; geom_histogram(stat = "identity") draws the same
# bars but abuses the binning geom.
ggplot(df2, aes(x = District, y = Price)) +
  geom_col(fill = "#006D77", color = "white") +
  # One tick per district number, 1 through 23.
  scale_x_continuous(breaks = seq(1, 23, 1)) +
  # `labels` is spelled out in full instead of relying on partial argument
  # matching (`label =`).
  scale_y_continuous(
    breaks = c(0, 100000, 200000, 300000),
    labels = c("0 Ft", "100,000 Ft",
               "200,000 Ft", "300,000 Ft")
  ) +
  ylab("Average price") +
  theme_bw()

Task 5

# Task 5: price distribution within each district as violin plots.
# District is converted to a factor so that every district gets its own
# violin instead of being treated as a continuous x position.
ggplot(df, aes(x = as.factor(District), y = Price)) +
  geom_violin(color = "#006D77", fill = "#66B7B0", linewidth = 0.6) +
  # `labels` is spelled out in full instead of relying on partial argument
  # matching (`label =`).
  scale_y_continuous(
    breaks = c(0, 250000, 500000, 750000, 1000000),
    labels = c("0 Ft", "250,000 Ft", "500,000 Ft",
               "750,000 Ft", "1,000,000 Ft")
  ) +
  theme_bw() +
  xlab("District")

Task 6

# Task 6: multidimensional scaling (MDS) of the districts.
#
# Each district is described by the mean and standard deviation of the
# numeric flat attributes listed below; the districts are then projected to
# two dimensions based on the Euclidean distances between their standardized
# profiles.
#
# Changes compared with the previous version of this chunk:
#   * mean(District) / sd(District) are no longer computed. Inside
#     group_by(District) the mean just repeats the grouping key and the sd is
#     degenerate (0, or NA for a single-row group), which produced an all-NaN
#     column after scale().
#   * The District key itself is excluded from the distance computation: it
#     is the label of the observation, not one of its measurements.
#     NOTE(review): this changes the layout relative to the old plot, where
#     the district number was (implicitly) used as a feature - confirm this
#     matches the task's intent.
#   * Labels are taken from df3$District instead of the row names of the MDS
#     result, which are only row positions.
#   * na.rm = TRUE is applied uniformly to all summaries.
district_features <- c(
  "Area", "Num_whole_rooms", "Num_half_rooms", "Price",
  "Floor", "Floors_in_bdg", "Overhead", "Parking_fee_monthly"
)

df3 <- df %>%
  # Give the parking-fee column a syntactic name so it can be referenced
  # without backticks.
  rename("Parking_fee_monthly" = "Parking_fee(monthly)") %>%
  group_by(District) %>%
  summarise(
    across(
      all_of(district_features),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE)
      ),
      # Yields Area.mean, Area.sd, Num_whole_rooms.mean, ...
      .names = "{.col}.{.fn}"
    )
  )

# Standardize every feature so that no single variable dominates the
# distances because of its unit, then run classical MDS.
district_profile <- scale(select(df3, -District))
mds <- cmdscale(dist(district_profile))
mds <- as.data.frame(mds)
# Rows of `mds` are in the same order as the rows of `df3`.
mds$District <- df3$District

ggplot(mds, aes(V1, V2, label = District)) +
  geom_label(size = 3.5) +
  # The MDS axes have no interpretable unit, so all axis decoration is hidden.
  theme_void() +
  ggtitle("Budapest districts") +
  theme(plot.title = element_text(hjust = 0.5))