library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.3.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(epitools)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'tibble' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Data: read EDAdataset.csv into `data` and print its structure.
# NOTE(review): setwd() with an absolute path ties the script to one machine;
# consider a project-relative path, after confirming that nothing later in the
# script relies on the working directory being changed here.
setwd(dir = "F:/PTDLĐT")
data <- utils::read.csv(file = "EDAdataset.csv")
utils::str(object = data)
## 'data.frame': 153432 obs. of 15 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ property_type: chr "Flat" "Flat" "House" "House" ...
## $ price : int 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : chr "3,367,989" "33,700,993" "33,631,485,999,999,900" "33,707,572,937,012" ...
## $ longitude : chr "7,301,264" "72,971,492" "72,926,559" "7,315,119,934,082" ...
## $ baths : int 2 3 6 4 3 8 8 2 7 5 ...
## $ purpose : chr "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : int 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : chr "04-02-2019" "04-05-2019" "17-07-2019" "05-04-2019" ...
## $ agency : chr "Self" "Self" "Self" "Self" ...
## $ agent : chr "Self" "Self" "Self" "Self" ...
## $ Area_in_Marla: chr "4.00" "5.60" "8.00" "40.00" ...
# 1. Descriptive statistics of the variables
# Absolute frequency of each listing purpose (the blank category is kept).
table(data[["purpose"]])
##
## For Rent For Sale
## 2 43183 110247
# Relative frequency of each listing purpose. prop.table() divides the counts
# by their total, so the column is tabulated once instead of twice.
prop.table(table(data$purpose))
##
## For Rent For Sale
## 1.303509e-05 2.814472e-01 7.185398e-01
library(ggplot2)
# Bar chart of the number of rows per purpose, with each bar labelled by its
# share of all rows. The column is referenced by its bare name inside aes()
# so ggplot2 looks it up in the piped data frame; writing `data$purpose`
# there makes ggplot2 emit a "Use of `data$purpose` is discouraged" warning.
data |>
  ggplot(aes(x = purpose, y = after_stat(count))) +
  geom_bar(fill = "blue") +
  geom_text(
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    color = "green",
    vjust = -0.5 # nudge the percentage label just above the bar
  ) +
  theme_classic() +
  labs(x = "Purpose", y = "Frequency")