library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.3.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(epitools)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'tibble' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Dữ liệu ----

# Load the property-listing CSV into `data` and inspect its structure.
# NOTE(review): setwd() with an absolute path ties the script to one machine
# and changes the working directory for everything that runs afterwards.
# Consider reading via file.path("F:/PTDLĐT", "EDAdataset.csv") instead, once
# it is confirmed that no later code relies on the working directory.
setwd("F:/PTDLĐT")
data <- read.csv(file = "EDAdataset.csv")

# Compact overview of every column: name, type and leading values.
data |> str()
## 'data.frame':    153430 obs. of  15 variables:
##  $ N            : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr  "Flat" "Flat" "House" "House" ...
##  $ price        : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr  "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr  "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr  "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : chr  "3,367,989" "33,700,993" "33,631,485,999,999,900" "33,707,572,937,012" ...
##  $ longitude    : chr  "7,301,264" "72,971,492" "72,926,559" "7,315,119,934,082" ...
##  $ baths        : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr  "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : int  2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : chr  "04-02-19" "04-05-19" "17-07-19" "05-04-19" ...
##  $ agency       : chr  "Self" "Self" "Self" "Self" ...
##  $ agent        : chr  "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: chr  "4" "5.6" "8" "40" ...

# Thống kê mô tả các biến ----

# Absolute frequency of each value of `purpose`.
pur1 <- table(data[["purpose"]])
pur1
## 
## For Rent For Sale 
##    43183   110247
# Relative frequency: each count divided by the table total.
pur1a <- proportions(pur1)
pur1a
## 
##  For Rent  For Sale 
## 0.2814508 0.7185492
# Same counts with the overall total appended as a `Sum` entry.
addmargins(pur1)
## 
## For Rent For Sale      Sum 
##    43183   110247   153430
# Bar chart of the number of listings per purpose; each bar is labelled with
# its share of all listings, formatted as a percentage.
#
# The column is referenced by its bare name inside aes(). The previous
# `data$purpose` form is discouraged by ggplot2 and produced a warning;
# aesthetics should be looked up in the data passed to ggplot().
# ggplot2 is already attached at the top of the file, so the repeated
# library() call is not needed here.
data |>
  ggplot(aes(x = purpose, y = after_stat(count))) +
  geom_bar(fill = "blue") +
  geom_text(
    # after_stat(count) is the per-bar count computed by stat = "count";
    # dividing by the sum of the counts gives each bar's proportion.
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    color = "black",
    vjust = -0.5 # place the label just above the top of the bar
  ) +
  theme_classic() +
  labs(x = "Purpose", y = "Frequency")

# Mô hình hồi quy ----

# Hồi quy logit ----