Load packages
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Set working directory
# Switch the working directory to the data folder; the CSV import below uses a
# bare file name, so it presumably relies on this.
# NOTE(review): machine-specific absolute Windows path - will fail elsewhere.
setwd("c:\\users\\laptop\\desktop\\data")
Import data
# Read the listings CSV from the current working directory into a tibble;
# column types are guessed by readr.
real_estate <- readr::read_csv(
  file = "processed-real-estate-data-ascii.csv"
)
## Parsed with column specification:
## cols(
## id = col_double(),
## district = col_character(),
## city = col_character(),
## geo_lat = col_double(),
## geo_long = col_double(),
## type = col_character(),
## price = col_double(),
## price_unit = col_character(),
## identity_number = col_integer(),
## surface_size = col_double(),
## number_of_rooms = col_integer(),
## number_of_toilets = col_integer(),
## month = col_integer(),
## year = col_integer(),
## lat = col_double(),
## lng = col_double()
## )
Check data structure:
# Print a compact overview of the imported columns, without attributes.
utils::str(object = real_estate, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 26763 obs. of 16 variables:
## $ id : num 23951 10113 71360 51930 136176 ...
## $ district : chr "Quan 10" "Quan 8" "Quan 2" "Quan 7" ...
## $ city : chr "Ho Chi Minh" "Ho Chi Minh" "Ho Chi Minh" "Ho Chi Minh" ...
## $ geo_lat : num 1.08e+16 1.07e+14 1.08e+14 1.07e+14 1.08e+14 ...
## $ geo_long : num 1.07e+16 1.07e+16 1.07e+13 1.07e+14 1.07e+14 ...
## $ type : chr "Ban can ho chung cu" "Ban can ho chung cu" "Ban can ho chung cu" "Ban can ho chung cu" ...
## $ price : num 47 16 47 33 64 22 25 22 30 21 ...
## $ price_unit : chr "trieu/m2" "trieu/m2" "trieu/m2" "trieu/m2" ...
## $ identity_number : int 14705252 14472361 13692400 15327652 15899425 14430046 14977029 15999882 15992253 15045467 ...
## $ surface_size : num 103 62 132 68 70 ...
## $ number_of_rooms : int 2 NA 3 NA NA 2 2 2 2 2 ...
## $ number_of_toilets: int 2 NA 3 NA NA NA 2 2 2 2 ...
## $ month : int 2 1 3 3 5 4 2 5 5 3 ...
## $ year : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ lat : num 10.8 10.7 10.8 10.7 10.8 ...
## $ lng : num 107 107 107 107 107 ...
Data processing
# Keep only the columns needed for the chart and rescale a handful of prices
# that are orders of magnitude larger than the rest.
#
# The outliers are matched on their numeric value. Matching on as.character()
# output depends on how R happens to format doubles (24000000000 is printed as
# "2.4e+10", which is why two separate string patterns were needed for it), and
# the character round trip limits every price to 15 significant digits. The
# literals below are whole numbers that a double represents exactly, so `==`
# is safe here.
# NOTE(review): 16666.6666667 is still far above the other corrected values
# (180.6, 240, 373.1) - confirm the intended scale for that record.
real_estate_dta <- real_estate %>%
  select(district, price) %>%
  mutate(price = case_when(
    price == 18055555556 ~ 180.55555556,
    price == 166666666667 ~ 16666.6666667,
    price == 24000000000 ~ 240,
    price == 3731343 ~ 373.1343,
    TRUE ~ price
  ))
# Average price per district (missing prices ignored), sorted ascending.
real_estate_data <- real_estate_dta %>%
  group_by(district) %>%
  summarise(mean_price = mean(price, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(mean_price) %>%
  mutate(
    # Order the factor levels by mean price so the chart keeps this ranking.
    district = forcats::fct_reorder(.f = district, .x = mean_price),
    # as.integer() drops the fractional part (truncation, not rounding).
    mean_price = as.integer(mean_price)
  )
Check data structure:
# Print a compact overview of the per-district summary, without attributes.
utils::str(object = real_estate_data, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 23 obs. of 2 variables:
## $ district : Factor w/ 23 levels "Huyen Cu Chi",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ mean_price: int 10 12 23 29 35 37 39 42 49 52 ...
Draw the chart
# Horizontal bar chart of the mean price per district. Bars follow the factor
# level order of `district`, i.e. ascending mean price.
ggplot(real_estate_data, aes(x = district, y = mean_price)) +
  # `fill` is set as a constant outside aes(). Inside aes() the string "red"
  # is treated as a data value and the bars get the default palette colour
  # (plus a legend) instead of actually being red.
  geom_col(fill = "red") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Average house price in Ho Chi Minh City",
    # NOTE(review): the price_unit values shown by str() are "trieu/m2" -
    # confirm that this subtitle states the right unit.
    subtitle = "Unit : Thousand million"
  ) +
  # One tick every 15 units from 0 to 285; the default labels already print
  # the break values, so no hand-written label vector is needed.
  scale_y_continuous(name = "Average price", breaks = seq(0, 285, 15)) +
  scale_x_discrete(name = NULL)
