The variables with missing values are date added 367 area 1092 keywords 863 location 367 rpice, type, description and complete location 1 missing values
#Dataset details The data set has 8414 observations with 13 variables, 10 discrete and 3 continuous variables. Some variables show missing data, price, date added, type, area, location. complete location and keywords
We find that every row in this dataset has missing value, and we will deal with it later.
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(gplots)
## Warning: package 'gplots' was built under R version 4.2.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(repr)
## Warning: package 'repr' was built under R version 4.2.3
#list rows of data with missing values
missing_rows <- kpp[!complete.cases(kpp),]
head(missing_rows)
## # A tibble: 6 × 14
## id title date_added type bedrooms bathrooms location complete_location
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 2 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Al…
## 2 3 4 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Sports Ci…
## 3 4 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Pr…
## 4 7 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Pr…
## 5 8 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Pr…
## 6 11 Chance D… 9 hours a… House 0 0 DHA Def… DHA Phase 4, DHA…
## # ℹ 6 more variables: description <chr>, keywords <chr>, area_size <chr>,
## # area_unit <chr>, sales_price <dbl>, price_unit <chr>
nrow(missing_rows)
## [1] 2182
table(unique(kpp$price_unit))
##
## Crore Lakh
## 1 1 1
kpp %>% select(id, sales_price, price_unit) %>% group_by(sales_price, price_unit) %>% summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'sales_price'. You can override using the
## `.groups` argument.
## # A tibble: 651 × 3
## # Groups: sales_price [628]
## sales_price price_unit count
## <dbl> <chr> <int>
## 1 1.5 Crore 149
## 2 1.75 Crore 149
## 3 2 Crore 122
## 4 2.1 Crore 116
## 5 4.5 Crore 107
## 6 1.9 Crore 104
## 7 6.5 Crore 93
## 8 1.55 Crore 87
## 9 2.25 Crore 85
## 10 65 Lakh 82
## # ℹ 641 more rows
kpp %>% select(id, sales_price, price_unit) %>% group_by(price_unit) %>% filter(price_unit == "Crore") %>%
summarise(count = n()) %>% arrange(desc(count))
## # A tibble: 1 × 2
## price_unit count
## <chr> <int>
## 1 Crore 7079
kpp %>% select(id, sales_price, price_unit) %>% group_by(price_unit) %>% filter(price_unit == "Lakh") %>%
summarise(count = n()) %>% arrange(desc(count))
## # A tibble: 1 × 2
## price_unit count
## <chr> <int>
## 1 Lakh 1334
kpp$sales_price[kpp$price_unit == "Lakh"] <- kpp$sales_price[kpp$price_unit == "Lakh"] * 0.01
kpp$price_unit[kpp$price_unit == "Lakh"] <- kpp$price_unit[kpp$price_unit == "Crore"]
## Warning in kpp$price_unit[kpp$price_unit == "Lakh"] <-
## kpp$price_unit[kpp$price_unit == : number of items to replace is not a multiple
## of replacement length
#check for area
table(kpp$area_unit)
##
## Sq. Yd.
## 1092 7322
colSums(is.na(kpp))
## id title date_added type
## 0 0 367 1
## bedrooms bathrooms location complete_location
## 0 0 367 1
## description keywords area_size area_unit
## 1 863 1092 0
## sales_price price_unit
## 1 0
unique(kpp$area_unit)
## [1] "Sq. Yd." ""
sum(kpp$area_unit == "")
## [1] 1092
sum(kpp$area_size == " ")
## [1] NA
colSums(is.na(kpp))
## id title date_added type
## 0 0 367 1
## bedrooms bathrooms location complete_location
## 0 0 367 1
## description keywords area_size area_unit
## 1 863 1092 0
## sales_price price_unit
## 1 0
kpp %>% select(id, area_size, area_unit) %>% group_by(area_size, area_unit) %>% filter(is.na(area_unit)) %>% count()
## # A tibble: 0 × 3
## # Groups: area_size, area_unit [0]
## # ℹ 3 variables: area_size <chr>, area_unit <chr>, n <int>
In area_size variable 1092 values are missing thefore
#Descritive and Exploratory analysis of Sales price
SalePrice is our target variable and also the dependent variable for prediction. According to the assumptions of Linear Regression, data should be normally distributed. By checking the distribution of SalePrice, we can decide if we need non-linear transformation, like log term, to make better prediction.
##summary of target variable : SalesPrice
#five number summary
summary(kpp$sales_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.250 1.430 2.500 5.556 7.280 99.000 1
#draw a histogram to figure out distribution of sales
#options(scipen = 10000)
ggplot(kpp, aes(x = sales_price, fill = ..count..)) +
geom_histogram(binwidth = 5) +
ggtitle("Figure 1 Histogram of Sales price")+
ylab("Count of Houses") + xlab("Housing Prices") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
From the histogram above, the distribution of our target variable– SalePrice is skewed to right. Thus, a log term of SalePrice should be generated for linear regression. Here, we name it lSalePrice.
#log term saleprice
kpp$logsaleprice <- log(kpp$sales_price)
#draw histogram using logscalesprices
ggplot(kpp, aes(x = logsaleprice, fill = ..count..)) +
geom_histogram(binwidth = 0.5) +
ggtitle("Figure 2 Histogram of Sales price in Log")+
ylab("Count of Houses") + xlab("Housing Prices") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
This looks normally distributed, so will use log term later in the
model
#explore the distribution of sales price using Location
colnames(kpp)
## [1] "id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "area_size" "area_unit"
## [13] "sales_price" "price_unit" "logsaleprice"
When it comes to housing price, the value of house is usually related to two types of elements: internal and external. Internal elements are the key features of house itself, like total area, the number of rooms. As for External elements, environment is one of the key factors.
First, let’s figure out the variable that can indicates housing environment in our dataset. Here, I choose location as this indicator. It’s a dummy variable and the identiciation is:
table(unique(kpp$location))
##
## Bahria Town Karachi, Karachi, Sindh Clifton, Karachi, Sindh
## 1 1
## DHA Defence, Karachi, Sindh
## 1
#three locationa re bahria town, clifton, and DHA defence all in Karachi
#count houses by location
options(repr.plot.height = 4, repr.plot.width = 5)
ggplot(kpp, aes( x = location, fill = location)) +
geom_bar()+
scale_fill_hue(c = 80) +
ggtitle("Figure 3 Location destribution") +
theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",size=0.5, linetype="solid",
colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
t <- kpp %>% select(id, location, complete_location) %>% group_by(location, complete_location) %>%
filter(is.na(location))
t[c('c1', 'c2','c3', 'c4')] <- str_split_fixed(t$complete_location, ' ', 4)
head(t)
## # A tibble: 6 × 7
## # Groups: location, complete_location [6]
## id location complete_location c1 c2 c3 c4
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 91 <NA> Bahria Town - Precinct 10, Bahria Town… Bahr… "Tow… "-" "Pre…
## 2 96 <NA> Bahria Town - Precinct 10-B, Bahria To… Bahr… "Tow… "-" "Pre…
## 3 98 <NA> Bahria Town Karachi, Karachi, Sindh Bahr… "Tow… "Kar… "Kar…
## 4 115 <NA> <NA> <NA> "" "" ""
## 5 125 <NA> Clifton, Karachi, Sindh Clif… "Kar… "Sin… ""
## 6 142 <NA> Bahria Town - Precinct 11-A, Bahria To… Bahr… "Tow… "-" "Pre…
dim(t)
## [1] 367 7
colSums(is.na(t))
## id location complete_location c1
## 0 367 1 1
## c2 c3 c4
## 0 0 0
t$complete_location <- NULL
t$location <- NULL
t$c2 <- NULL
t$c3 <- NULL
t$c4 <- NULL
kpp$id[kpp$id == 7903] <- replace(kpp$location, "NA", "Clifton")
## Warning in kpp$id[kpp$id == 7903] <- replace(kpp$location, "NA", "Clifton"):
## number of items to replace is not a multiple of replacement length
kpp$id[kpp$id == 7903] <- kpp$location[kpp$location == "Clifton"]
kpp$price_unit[kpp$price_unit == "Lakh"] <- kpp$price_unit[kpp$price_unit == "Crore"]
b <- t %>% select(id, c1) %>% group_by(id) %>%
filter(c1 == "Bahria")
c <- t %>% select(id, c1) %>% group_by(id) %>%
filter(c1 == "Clifton")
d <- t %>% select(id, c1) %>% group_by(id) %>%
filter(c1 == "DHA")
colSums(is.na(kpp))
## id title date_added type
## 0 0 367 1
## bedrooms bathrooms location complete_location
## 0 0 367 1
## description keywords area_size area_unit
## 1 863 1092 0
## sales_price price_unit logsaleprice
## 1 0 1
kpp$location[kpp$id == 7903] <- "Clifton, Karachi, Sindh"
kpp$location[kpp$id == 464] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 2226] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 3978] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 4223] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5441] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5527] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5530] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5534] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6566] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6585] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6989] <- "DHA Defence, Karachi, Sindh"
#for bahria in all missing values
kpp$location[kpp$location == "Bahria"] <- "Bahria Town Karachi, Karachi, Sindh"
#one missing value with no information
kpp$location[kpp$id == 115] <- NA
head(kpp)
## # A tibble: 6 × 15
## id title date_added type bedrooms bathrooms location complete_location
## <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 600 Yard… 14 hours … House 5 6 DHA Def… DHA Phase 6, DHA…
## 2 2 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Al…
## 3 3 4 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Sports Ci…
## 4 4 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Pr…
## 5 5 Buying A… 6 hours a… Flat 3 3 Clifton… Clifton - Block …
## 6 6 500 Yard… 6 hours a… House 7 6 DHA Def… DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## # area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>
kpp$area <- NULL
kpp$size <- NULL
#remove one data entry with all variables missing informations
kpp <- subset(kpp, id != 115)
#count houses by location
options(repr.plot.height = 4, repr.plot.width = 5)
ggplot(kpp, aes( x = location, fill = location)) +
geom_bar()+
scale_fill_hue(c = 80) +
ggtitle("Figure 3 Location destribution") +
theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",size=0.5, linetype="solid",
colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))
Most of the houses in this dataset are in Bahria town (3827), followed
by DHA 3953 adn them 634 in Clifton
et’s add our target variable into analysis. How does housing price look like in each category? Here, I use boxplot to show the distribution of prices in each MSZoning.
## NULL
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The prices in dha are highest, followed by clifton and bahria.
Dig deep down to area size and figure out price and size by location
# #find average house sizes in each area
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
colnames(kpp$ar)
## Warning: Unknown or uninitialised column: `ar`.
## NULL
#check missing area by location and sales price
mis_loc <- kpp %>% select(area_size, area_unit, id, sales_price, location) %>% group_by(location) %>% filter(is.na(area_size))
#behria missing
bah_area <- mis_loc %>% select(id, location, sales_price, area_size) %>% group_by(location) %>% filter(location == "Bahria Town Karachi, Karachi, Sindh")
range(bah_area$sales_price) #0.26 - 15.5
## [1] 0.26 15.50
summary(bah_area)
## id location sales_price area_size
## Length:762 Length:762 Min. : 0.260 Length:762
## Class :character Class :character 1st Qu.: 1.035 Class :character
## Mode :character Mode :character Median : 1.750 Mode :character
## Mean : 1.861
## 3rd Qu.: 2.170
## Max. :15.500
#dha missing
dha_area <- mis_loc %>% select(id, location, sales_price) %>% group_by(location) %>% filter(location == "DHA Defence, Karachi, Sindh")
range(dha_area$sales_price) #0.52 - 80.0
## [1] 0.52 80.00
#clifton missing
clift_area <- mis_loc %>% select(id, location, sales_price) %>% group_by(location) %>% filter(location == "Clifton, Karachi, Sindh")
range(clift_area$sales_price) #1.5-36.5
## [1] 1.5 36.5
To impute data as per location area size and sales price
bah_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "Bahria Town Karachi, Karachi, Sindh" & !is.na(area_size))
range(bah_area_price$area_size) #33-8300
## [1] "1,000" "97"
range(bah_area_price$sales_price) #0.25-36.16
## [1] 0.25 22.00
dha_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "DHA Defence, Karachi, Sindh" & !is.na(area_size))
range(dha_area_price$area_size) #33-3600
## [1] "1,000" "993"
range(dha_area_price$sales_price) #0.38-99.00
## [1] 0.38 99.00
clif_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "Clifton, Karachi, Sindh" & !is.na(area_size))
range(clif_area_price$area_size) #67-2400
## [1] "1,000" "94"
range(clif_area_price$sales_price) #0.37-85.0
## [1] 0.37 85.00
summary(bah_area_price)
## id location area_size sales_price
## Length:2709 Length:2709 Length:2709 Min. : 0.250
## Class :character Class :character Class :character 1st Qu.: 0.980
## Mode :character Mode :character Mode :character Median : 1.600
## Mean : 1.802
## 3rd Qu.: 2.140
## Max. :22.000
colnames(kpp)
## [1] "id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "area_size" "area_unit"
## [13] "sales_price" "price_unit" "logsaleprice"
head(kpp)
## # A tibble: 6 × 15
## id title date_added type bedrooms bathrooms location complete_location
## <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 1 600 Yard… 14 hours … House 5 6 DHA Def… DHA Phase 6, DHA…
## 2 2 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Al…
## 3 3 4 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Sports Ci…
## 4 4 3 BEDS L… 5 hours a… House 0 0 Bahria … Bahria Town - Pr…
## 5 5 Buying A… 6 hours a… Flat 3 3 Clifton… Clifton - Block …
## 6 6 500 Yard… 6 hours a… House 7 6 DHA Def… DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## # area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>
table(kpp$area_unit)
##
## Sq. Yd.
## 1091 7322
#location size without missing values
ddply(kpp, .(location), summarize, size = mean(!is.na(area_size)))
## location size
## 1 Bahria Town Karachi, Karachi, Sindh 0.7804667
## 2 Clifton, Karachi, Sindh 0.9589258
## 3 DHA Defence, Karachi, Sindh 0.9248672
## 4 <NA> 0.9831461
ddply(kpp, .(type), summarize, size = mean(!is.na(area_size)))
## type size
## 1 Farm House 0.5000000
## 2 Flat 0.9124204
## 3 House 0.8360584
## 4 Lower Portion 1.0000000
## 5 Penthouse 0.8541667
## 6 Upper Portion 1.0000000
ddply(kpp, .(bedrooms), summarize, size = mean(!is.na(area_size)))
## bedrooms size
## 1 0 0.0000000
## 2 1 0.9691781
## 3 2 0.9786834
## 4 3 0.9897480
## 5 4 0.9907049
## 6 5 0.9924338
## 7 6 0.9859611
## 8 7 0.9824561
## 9 8 0.8780488
## 10 9 0.9090909
## 11 10 1.0000000
## 12 11 0.5000000
ddply(kpp, .(bathrooms), summarize, size = mean(!is.na(area_size)))
## bathrooms size
## 1 0 0.0000000
## 2 1 0.9737991
## 3 2 0.9846797
## 4 3 0.9995015
## 5 4 0.9977011
## 6 5 0.9988739
## 7 6 0.9990934
## 8 7 1.0000000
## 9 8 1.0000000
## 10 9 1.0000000
## 11 10 1.0000000
#location size with missing values
ddply(kpp, .(location), summarize, size = mean(is.na(area_size)))
## location size
## 1 Bahria Town Karachi, Karachi, Sindh 0.21953328
## 2 Clifton, Karachi, Sindh 0.04107425
## 3 DHA Defence, Karachi, Sindh 0.07513281
## 4 <NA> 0.01685393
ddply(kpp, .(type), summarize, size = mean(is.na(area_size)))
## type size
## 1 Farm House 0.50000000
## 2 Flat 0.08757962
## 3 House 0.16394157
## 4 Lower Portion 0.00000000
## 5 Penthouse 0.14583333
## 6 Upper Portion 0.00000000
ddply(kpp, .(bedrooms), summarize, size = mean(is.na(area_size)))
## bedrooms size
## 1 0 1.000000000
## 2 1 0.030821918
## 3 2 0.021316614
## 4 3 0.010252029
## 5 4 0.009295120
## 6 5 0.007566204
## 7 6 0.014038877
## 8 7 0.017543860
## 9 8 0.121951220
## 10 9 0.090909091
## 11 10 0.000000000
## 12 11 0.500000000
ddply(kpp, .(bathrooms), summarize, size = mean(is.na(area_size)))
## bathrooms size
## 1 0 1.0000000000
## 2 1 0.0262008734
## 3 2 0.0153203343
## 4 3 0.0004985045
## 5 4 0.0022988506
## 6 5 0.0011261261
## 7 6 0.0009066183
## 8 7 0.0000000000
## 9 8 0.0000000000
## 10 9 0.0000000000
## 11 10 0.0000000000
ddply(kpp, .(location), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
## location max_price min_price
## 1 Bahria Town Karachi, Karachi, Sindh 22.00 0.25
## 2 Clifton, Karachi, Sindh 85.00 0.37
## 3 DHA Defence, Karachi, Sindh 99.00 0.38
## 4 <NA> 36.16 0.40
ddply(kpp, .(type), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
## type max_price min_price
## 1 Farm House 17.00 5.00
## 2 Flat 28.00 0.25
## 3 House 99.00 0.50
## 4 Lower Portion 3.75 2.95
## 5 Penthouse 37.00 1.40
## 6 Upper Portion 4.00 2.95
ddply(kpp, .(bedrooms), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
## bedrooms max_price min_price
## 1 0 80.00 0.2682
## 2 1 7.00 0.2500
## 3 2 15.00 0.2682
## 4 3 25.00 0.3574
## 5 4 36.16 0.4500
## 6 5 48.00 1.1600
## 7 6 85.00 1.5000
## 8 7 70.00 3.3000
## 9 8 75.00 4.9500
## 10 9 45.00 11.7500
## 11 10 99.00 15.0000
## 12 11 80.00 14.0000
ddply(kpp, .(bathrooms), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
## bathrooms max_price min_price
## 1 0 80.00 0.2682
## 2 1 6.20 0.2500
## 3 2 25.00 0.2682
## 4 3 15.00 0.3574
## 5 4 36.16 0.4500
## 6 5 38.00 1.0500
## 7 6 99.00 1.5000
## 8 7 70.00 4.1500
## 9 8 70.00 3.3000
## 10 9 27.50 11.7500
## 11 10 95.00 13.0000
#Bedrooms and Bathrooms
kpp %>% select(id, bedrooms,area_size, location, sales_price) %>% group_by(location, area_size) %>% filter(bedrooms == 0) %>% summarise(min = min(sales_price), max = max(sales_price), avg = mean(sales_price))
## min max avg
## 1 0.2682 80 4.191381
# historgram of housing price by BldgType
ggplot(kpp, aes(sales_price)) +
geom_histogram(aes(fill = type), binwidth = 5) +
coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))
#Check price per type of house
ggplot(kpp, aes(x = sales_price,fill = as.factor(type))) +
geom_histogram(position = "stack", binwidth = 10) +
ggtitle("Figure 6 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
scale_fill_discrete(name="OverallQual")+
theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))
#check for bedrooms and bathrooms
ggplot(kpp, aes(x = sales_price,fill = as.factor(location))) +
geom_histogram(position = "stack", binwidth = 10) +
ggtitle("Figure 6 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
scale_fill_discrete(name="OverallQual")+
theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))
#What kind of house will be sold for higher price? #correlation
exploration
variable selction for correlaiton exploration sales_price, area_size, bedrooms, bathrooms,
First create correlation heatmapt to plot correlation coefficients
But before plotting heatmap, one more step is needed– feature engineering. In section I, we learn that some variables are factors, in order to buid heatmap we need to convert them into numerics.
I will do feature engineering of type, location
#first convert location and type to factor
kpp$location <- as.factor(kpp$location)
kpp$type <- as.factor(kpp$type)
#convert factor to numeric
kpp$location <- as.numeric(factor(kpp$location, levels = c("Bahria Town Karachi, Karachi, Sindh", "Clifton, Karachi, Sindh", "DHA Defence, Karachi, Sindh"), labels = c(1,2,3), ordered = TRUE))
kpp$type <- as.numeric(factor(kpp$type, levels = c("House","Flat","Penthouse","Farm House","Upper Portion","Lower Portion"), labels = c(1,2,3,4,5,6), ordered = TRUE))
colnames(kpp)
## [1] "id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "area_size" "area_unit"
## [13] "sales_price" "price_unit" "logsaleprice"
head(kpp)
## # A tibble: 6 × 15
## id title date_added type bedrooms bathrooms location complete_location
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 600 Yard… 14 hours … 1 5 6 3 DHA Phase 6, DHA…
## 2 2 3 BEDS L… 5 hours a… 1 0 0 1 Bahria Town - Al…
## 3 3 4 BEDS L… 5 hours a… 1 0 0 1 Bahria Sports Ci…
## 4 4 3 BEDS L… 5 hours a… 1 0 0 1 Bahria Town - Pr…
## 5 5 Buying A… 6 hours a… 2 3 3 2 Clifton - Block …
## 6 6 500 Yard… 6 hours a… 1 7 6 3 DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## # area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>
#Select variables for model building and heatmap
model_var <- c("type", "location", "sales_price", "area_size", "bedrooms", "bathrooms")
heat <- kpp[, model_var]
Next, using ggplot plot correlation heatmap.
#plot correltion for sales price
# options(repr.plot.height = 6, repr.plot.width = 8)
#
# library(ggplot2)
# library(reshape2)
#
# qplot(x = Var1, y = Var2, data = melt(cor(heat, use = "p")), fill = value, geom = "tile") +
# scale_fill_gradient2(low = "green", high = "red", mid = "white",
# midpoint = 0, limit = c(-1,1), space = "Lab", name = "Correlation")+
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1)) +
# coord_fixed() +
# ggtitle("Figure 7 Correlation Heatmap") +
# theme(plot.title = element_text(hjust = 0.4))
In this graph, Red indicates perfect positive correlation and Green indicates perfect negative correlation.location with sales price, location with area size and bedrooms and bathrooms shows positive correlation. The type of home has negative correlation with sales rice and area size.
#Correlation between SalePrice and some numeric variables
In this section, I am going to analyze the correlation between SalePrice and numeric variables, including area size, bedrooms and bathrooms. Different from categorical variables, here I will use scatter plot and trend line to indicate the relationship.
#scatter plot of area size
options(repr.plot.height = 6, repr.plot.width = 9)
p1 <- ggplot(kpp, aes(area_size, y = sales_price))+geom_point(shape = 1)+
geom_smooth(method = lm, color = "red", se = FALSE) +
ggtitle("Figure 8 Scatter plot of Sales price and area size") +
theme(plot.title = element_text(hjust = 0.4))
p2 <- ggplot(kpp, aes(bedrooms, y = sales_price))+geom_point(shape = 1)+
geom_smooth(method = lm, color = "red", se = FALSE) +
ggtitle("Figure 8 Scatter plot of Sales price and No of Bedrooms") +
theme(plot.title = element_text(hjust = 0.4))
p3 <- ggplot(kpp, aes(bathrooms, y = sales_price))+geom_point(shape = 1)+
geom_smooth(method = lm, color = "red", se = FALSE) +
ggtitle("Figure 8 Scatter plot of Sales price and No of Bathrooms") +
theme(plot.title = element_text(hjust = 0.4))
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#MODEL fITTING After Descriptive Analysis we are moving into Predictive Analysis section. There are three models here:
Linear Regresion Model Classification & Regression Trees (CART) Model Random Forest Model
##Linear Regression Model Step 1: choose variables and transfer SalePrice into log term
#prediction of lm
#build model dataset for linear regression
model_lin <- kpp[, model_var]
model_lin$lsales_price <- log(model_lin$sales_price)
Step 2: divide datasets into two parts – training and validation, to prepare for prediction later
# #partition data
# set.seed(10000)
# train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1] * 0.8)
#
# model_lin_train <- model_lin[train.index, ]
# model_lin_test <- model_lin[-train.index, ]
Step 3: Run Regression
#use lm() to run linear regression of sales price on all variables in model dataset
#
# linreg <- lm(lsales_price ~ . -sales_price, data = model_lin_train)
# summary(linreg)
Step 4: forecast and check for model accuracy
# library(forecast)
#
# #use predict() to make prediction on a new set
# pred1 <- predict(linreg, model_lin_test, type = "response")
# residuals <- model_lin_test$lsales_price - pred1
#
# linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_test$lsales_price, "Residuals" = residuals)
#
# accuracy(pred1, model_lin_test$lsales_price)
##CART
#classification tree
# library(rpart)
# library(rpart.plot)
#
# class.tree <- rpart(lsales_price ~. -sales_price,
# data = model_lin_train, control = rpart.control(cp = 0.01))
#
# plotcp(class.tree)
# printcp(class.tree)
#rpart.plot(class.tree, box.palette = "GnBu", branch.lty = 3, shadow.col = "gray", nn = TRUE)
#Random Forest
#library(randomForest)
#RF <- randomForest(lsales_price ~.-sales_price, data = model_lin_train,
# importance = TRUE, ntree = 500, nodesize = 7, na.action = na.roughfix)
#
#variable importance
#options(repr.plot.width = 9, repr.plot.height = 6)
#varImpPlot(RF, type = 1)
#prediction
#rf.pred <- predict(RF, newdata = model_lin_test)
#accuracy(rf.pred, model_lin_test$lsales_price)
#plot(rf.pred, model_lin_test$lsales_price, main = "Figure 9 Predicted vs. Actual log Sale Price")
#abline(0,1)