Load packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Set working directory

# Point the R session at the folder that holds the input CSV.
# NOTE(review): this is a hard-coded, machine-specific absolute path, and the
# relative file name passed to read_csv() is resolved against it; the script
# will not run elsewhere without editing this line.
setwd("c:\\users\\laptop\\desktop\\data")

Import data

# Read the listings CSV from the current working directory into a tibble;
# column types are guessed by readr.
real_estate <- readr::read_csv(
  file = "processed-real-estate-data-ascii.csv"
)
## Parsed with column specification:
## cols(
##   id = col_double(),
##   district = col_character(),
##   city = col_character(),
##   geo_lat = col_double(),
##   geo_long = col_double(),
##   type = col_character(),
##   price = col_double(),
##   price_unit = col_character(),
##   identity_number = col_integer(),
##   surface_size = col_double(),
##   number_of_rooms = col_integer(),
##   number_of_toilets = col_integer(),
##   month = col_integer(),
##   year = col_integer(),
##   lat = col_double(),
##   lng = col_double()
## )

Check data structure:

# Print a compact overview of the imported columns, without attributes.
str(object = real_estate, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    26763 obs. of  16 variables:
##  $ id               : num  23951 10113 71360 51930 136176 ...
##  $ district         : chr  "Quan 10" "Quan 8" "Quan 2" "Quan 7" ...
##  $ city             : chr  "Ho Chi Minh" "Ho Chi Minh" "Ho Chi Minh" "Ho Chi Minh" ...
##  $ geo_lat          : num  1.08e+16 1.07e+14 1.08e+14 1.07e+14 1.08e+14 ...
##  $ geo_long         : num  1.07e+16 1.07e+16 1.07e+13 1.07e+14 1.07e+14 ...
##  $ type             : chr  "Ban can ho chung cu" "Ban can ho chung cu" "Ban can ho chung cu" "Ban can ho chung cu" ...
##  $ price            : num  47 16 47 33 64 22 25 22 30 21 ...
##  $ price_unit       : chr  "trieu/m2" "trieu/m2" "trieu/m2" "trieu/m2" ...
##  $ identity_number  : int  14705252 14472361 13692400 15327652 15899425 14430046 14977029 15999882 15992253 15045467 ...
##  $ surface_size     : num  103 62 132 68 70 ...
##  $ number_of_rooms  : int  2 NA 3 NA NA 2 2 2 2 2 ...
##  $ number_of_toilets: int  2 NA 3 NA NA NA 2 2 2 2 ...
##  $ month            : int  2 1 3 3 5 4 2 5 5 3 ...
##  $ year             : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ lat              : num  10.8 10.7 10.8 10.7 10.8 ...
##  $ lng              : num  107 107 107 107 107 ...

Data processing

# Keep only district and price, and recode a handful of specific price values.
# The comparison is done on the numeric values directly: matching on
# as.character(price) depends on how R formats doubles (24000000000 is
# rendered as "2.4e+10", so a "24000000000" pattern can never match).
# The compared values are whole numbers that doubles represent exactly, so
# `==` is safe here. Rows with a missing price fall through and stay NA.
# NOTE(review): 166666666667 is mapped to 16666.6666667, which is a different
# scaling from 18055555556 -> 180.55555556; kept as written, please confirm.
real_estate_dta <- real_estate %>%
  select(district, price) %>%
  mutate(price = case_when(
    price == 18055555556 ~ 180.55555556,
    price == 166666666667 ~ 16666.6666667,
    price == 24000000000 ~ 240,
    price == 3731343 ~ 373.1343,
    TRUE ~ price
  ))
# Mean price per district (missing prices ignored), sorted ascending.
real_estate_data <- real_estate_dta %>%
  group_by(district) %>%
  summarise(mean_price = mean(price, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(mean_price) %>%
  mutate(
    # Turn district into a factor whose levels follow the mean price; this
    # must use the un-truncated mean, so it comes before the conversion below.
    district = forcats::fct_reorder(district, mean_price),
    # as.integer() drops the fractional part of the mean.
    mean_price = as.integer(mean_price)
  )

Check data structure:

# Print a compact overview of the per-district summary, without attributes.
str(object = real_estate_data, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    23 obs. of  2 variables:
##  $ district  : Factor w/ 23 levels "Huyen Cu Chi",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ mean_price: int  10 12 23 29 35 37 39 42 49 52 ...

Draw the chart

# Horizontal bar chart of the mean price per district. `district` is a factor
# whose levels are ordered by mean price, so the bars appear sorted.
ggplot(real_estate_data, aes(x = district, y = mean_price)) +
  # Set the bar colour as a constant outside aes(). Inside aes() the string
  # "red" is treated as a data value and drawn in the default palette colour
  # (and produces a legend that then has to be hidden).
  geom_col(fill = "red") +
  coord_flip() +
  theme_minimal() +
  # NOTE(review): the price_unit values shown for the imported data are
  # "trieu/m2"; confirm that "Thousand million" is the intended unit label.
  labs(
    title = "Average house price in Ho Chi Minh City",
    subtitle = "Unit : Thousand million"
  ) +
  # The default labels already print the break values, so no explicit
  # `labels` vector is needed.
  scale_y_continuous(name = "Average price", breaks = seq(0, 285, 15)) +
  scale_x_discrete(name = NULL)