The variables with missing values are date added 367 area 1092 keywords 863 location 367 rpice, type, description and complete location 1 missing values

#Dataset details The data set has 8414 observations with 13 variables, 10 discrete and 3 continuous variables. Some variables show missing data, price, date added, type, area, location. complete location and keywords

We find that every row in this dataset has missing value, and we will deal with it later.

library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(gplots)
## Warning: package 'gplots' was built under R version 4.2.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(repr)
## Warning: package 'repr' was built under R version 4.2.3
#list rows of data with missing values
missing_rows <- kpp[!complete.cases(kpp),]
head(missing_rows)
## # A tibble: 6 × 14
##      id title     date_added type  bedrooms bathrooms location complete_location
##   <dbl> <chr>     <chr>      <chr>    <dbl>     <dbl> <chr>    <chr>            
## 1     2 3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Al…
## 2     3 4 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Sports Ci…
## 3     4 3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Pr…
## 4     7 3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Pr…
## 5     8 3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Pr…
## 6    11 Chance D… 9 hours a… House        0         0 DHA Def… DHA Phase 4, DHA…
## # ℹ 6 more variables: description <chr>, keywords <chr>, area_size <chr>,
## #   area_unit <chr>, sales_price <dbl>, price_unit <chr>
nrow(missing_rows)
## [1] 2182
table(unique(kpp$price_unit))
## 
##       Crore  Lakh 
##     1     1     1
kpp %>% select(id, sales_price, price_unit) %>% group_by(sales_price, price_unit) %>% summarise(count = n()) %>% arrange(desc(count))
## `summarise()` has grouped output by 'sales_price'. You can override using the
## `.groups` argument.
## # A tibble: 651 × 3
## # Groups:   sales_price [628]
##    sales_price price_unit count
##          <dbl> <chr>      <int>
##  1        1.5  Crore        149
##  2        1.75 Crore        149
##  3        2    Crore        122
##  4        2.1  Crore        116
##  5        4.5  Crore        107
##  6        1.9  Crore        104
##  7        6.5  Crore         93
##  8        1.55 Crore         87
##  9        2.25 Crore         85
## 10       65    Lakh          82
## # ℹ 641 more rows
kpp %>% select(id, sales_price, price_unit) %>% group_by(price_unit) %>% filter(price_unit == "Crore") %>% 
   summarise(count = n()) %>% arrange(desc(count))
## # A tibble: 1 × 2
##   price_unit count
##   <chr>      <int>
## 1 Crore       7079
kpp %>% select(id, sales_price, price_unit) %>% group_by(price_unit) %>% filter(price_unit == "Lakh") %>% 
  summarise(count = n()) %>% arrange(desc(count)) 
## # A tibble: 1 × 2
##   price_unit count
##   <chr>      <int>
## 1 Lakh        1334
kpp$sales_price[kpp$price_unit == "Lakh"] <- kpp$sales_price[kpp$price_unit == "Lakh"] * 0.01

kpp$price_unit[kpp$price_unit == "Lakh"] <- kpp$price_unit[kpp$price_unit == "Crore"]
## Warning in kpp$price_unit[kpp$price_unit == "Lakh"] <-
## kpp$price_unit[kpp$price_unit == : number of items to replace is not a multiple
## of replacement length
#check for area
table(kpp$area_unit)
## 
##         Sq. Yd. 
##    1092    7322
colSums(is.na(kpp))
##                id             title        date_added              type 
##                 0                 0               367                 1 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0               367                 1 
##       description          keywords         area_size         area_unit 
##                 1               863              1092                 0 
##       sales_price        price_unit 
##                 1                 0
unique(kpp$area_unit)
## [1] "Sq. Yd." ""
sum(kpp$area_unit == "")
## [1] 1092
sum(kpp$area_size == " ")
## [1] NA
colSums(is.na(kpp))
##                id             title        date_added              type 
##                 0                 0               367                 1 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0               367                 1 
##       description          keywords         area_size         area_unit 
##                 1               863              1092                 0 
##       sales_price        price_unit 
##                 1                 0
kpp %>% select(id, area_size, area_unit) %>% group_by(area_size, area_unit) %>% filter(is.na(area_unit)) %>% count()
## # A tibble: 0 × 3
## # Groups:   area_size, area_unit [0]
## # ℹ 3 variables: area_size <chr>, area_unit <chr>, n <int>

In area_size variable 1092 values are missing thefore

#Descritive and Exploratory analysis of Sales price

SalePrice is our target variable and also the dependent variable for prediction. According to the assumptions of Linear Regression, data should be normally distributed. By checking the distribution of SalePrice, we can decide if we need non-linear transformation, like log term, to make better prediction.

##summary of target variable : SalesPrice

#five number summary
summary(kpp$sales_price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.250   1.430   2.500   5.556   7.280  99.000       1
#draw a histogram to figure out distribution of sales
#options(scipen = 10000)

ggplot(kpp, aes(x = sales_price, fill = ..count..)) + 
  geom_histogram(binwidth = 5) + 
  ggtitle("Figure 1 Histogram of Sales price")+
  ylab("Count of Houses") + xlab("Housing Prices") + 
  theme(plot.title = element_text(hjust = 0.5)) 
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

From the histogram above, the distribution of our target variable– SalePrice is skewed to right. Thus, a log term of SalePrice should be generated for linear regression. Here, we name it lSalePrice.

#log term saleprice
kpp$logsaleprice <- log(kpp$sales_price)

#draw histogram using logscalesprices
ggplot(kpp, aes(x = logsaleprice, fill = ..count..)) + 
  geom_histogram(binwidth = 0.5) + 
  ggtitle("Figure 2 Histogram of Sales price in Log")+
  ylab("Count of Houses") + xlab("Housing Prices") + 
  theme(plot.title = element_text(hjust = 0.5)) 
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

This looks normally distributed, so will use log term later in the model

#explore the distribution of sales price using Location

colnames(kpp)
##  [1] "id"                "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "area_size"         "area_unit"        
## [13] "sales_price"       "price_unit"        "logsaleprice"

When it comes to housing price, the value of house is usually related to two types of elements: internal and external. Internal elements are the key features of house itself, like total area, the number of rooms. As for External elements, environment is one of the key factors.

First, let’s figure out the variable that can indicates housing environment in our dataset. Here, I choose location as this indicator. It’s a dummy variable and the identiciation is:

table(unique(kpp$location))
## 
## Bahria Town Karachi, Karachi, Sindh             Clifton, Karachi, Sindh 
##                                   1                                   1 
##         DHA Defence, Karachi, Sindh 
##                                   1

#three locationa re bahria town, clifton, and DHA defence all in Karachi

#count houses by location

options(repr.plot.height = 4, repr.plot.width = 5)

ggplot(kpp, aes( x = location, fill = location)) +
  geom_bar()+
  scale_fill_hue(c = 80) + 
  ggtitle("Figure 3 Location destribution") + 
  theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", 
colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

t <- kpp %>% select(id, location, complete_location) %>% group_by(location, complete_location) %>% 
  filter(is.na(location))


t[c('c1', 'c2','c3', 'c4')] <- str_split_fixed(t$complete_location,  ' ', 4)
head(t)
## # A tibble: 6 × 7
## # Groups:   location, complete_location [6]
##      id location complete_location                       c1    c2    c3    c4   
##   <dbl> <chr>    <chr>                                   <chr> <chr> <chr> <chr>
## 1    91 <NA>     Bahria Town - Precinct 10, Bahria Town… Bahr… "Tow… "-"   "Pre…
## 2    96 <NA>     Bahria Town - Precinct 10-B, Bahria To… Bahr… "Tow… "-"   "Pre…
## 3    98 <NA>     Bahria Town Karachi, Karachi, Sindh     Bahr… "Tow… "Kar… "Kar…
## 4   115 <NA>     <NA>                                    <NA>  ""    ""    ""   
## 5   125 <NA>     Clifton, Karachi, Sindh                 Clif… "Kar… "Sin… ""   
## 6   142 <NA>     Bahria Town - Precinct 11-A, Bahria To… Bahr… "Tow… "-"   "Pre…
dim(t)
## [1] 367   7
colSums(is.na(t))
##                id          location complete_location                c1 
##                 0               367                 1                 1 
##                c2                c3                c4 
##                 0                 0                 0
t$complete_location <- NULL
t$location <- NULL
t$c2 <- NULL
t$c3 <- NULL
t$c4 <- NULL

kpp$id[kpp$id == 7903] <- replace(kpp$location, "NA", "Clifton")
## Warning in kpp$id[kpp$id == 7903] <- replace(kpp$location, "NA", "Clifton"):
## number of items to replace is not a multiple of replacement length
kpp$id[kpp$id == 7903] <- kpp$location[kpp$location == "Clifton"] 
kpp$price_unit[kpp$price_unit == "Lakh"] <- kpp$price_unit[kpp$price_unit == "Crore"]

b <- t %>% select(id, c1) %>% group_by(id) %>% 
  filter(c1 == "Bahria")
c <- t %>% select(id, c1) %>% group_by(id) %>% 
  filter(c1 == "Clifton")
d <- t %>% select(id, c1) %>% group_by(id) %>% 
  filter(c1 == "DHA")

colSums(is.na(kpp))
##                id             title        date_added              type 
##                 0                 0               367                 1 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0               367                 1 
##       description          keywords         area_size         area_unit 
##                 1               863              1092                 0 
##       sales_price        price_unit      logsaleprice 
##                 1                 0                 1
kpp$location[kpp$id == 7903] <- "Clifton, Karachi, Sindh"
kpp$location[kpp$id == 464] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 2226] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 3978] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 4223] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5441] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5527] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5530] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 5534] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6566] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6585] <- "DHA Defence, Karachi, Sindh"
kpp$location[kpp$id == 6989] <- "DHA Defence, Karachi, Sindh"


#for bahria in all missing values
kpp$location[kpp$location == "Bahria"] <- "Bahria Town Karachi, Karachi, Sindh"

#one missing value with no information
kpp$location[kpp$id == 115] <- NA

head(kpp)
## # A tibble: 6 × 15
##   id    title     date_added type  bedrooms bathrooms location complete_location
##   <chr> <chr>     <chr>      <chr>    <dbl>     <dbl> <chr>    <chr>            
## 1 1     600 Yard… 14 hours … House        5         6 DHA Def… DHA Phase 6, DHA…
## 2 2     3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Al…
## 3 3     4 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Sports Ci…
## 4 4     3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Pr…
## 5 5     Buying A… 6 hours a… Flat         3         3 Clifton… Clifton - Block …
## 6 6     500 Yard… 6 hours a… House        7         6 DHA Def… DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## #   area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>
kpp$area <- NULL
kpp$size <-  NULL


#remove one data entry with all variables missing informations
kpp <- subset(kpp, id != 115)
#count houses by location

options(repr.plot.height = 4, repr.plot.width = 5)

ggplot(kpp, aes( x = location, fill = location)) +
  geom_bar()+
  scale_fill_hue(c = 80) + 
  ggtitle("Figure 3 Location destribution") + 
  theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", 
colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7))

Most of the houses in this dataset are in Bahria town (3827), followed by DHA 3953 adn them 634 in Clifton

et’s add our target variable into analysis. How does housing price look like in each category? Here, I use boxplot to show the distribution of prices in each MSZoning.

## NULL
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The prices in dha are highest, followed by clifton and bahria.

Dig deep down to area size and figure out price and size by location

# #find average house sizes in each area
 library(plyr)
## Warning: package 'plyr' was built under R version 4.2.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
colnames(kpp$ar)
## Warning: Unknown or uninitialised column: `ar`.
## NULL
#check missing area by location and sales price
mis_loc <- kpp %>% select(area_size, area_unit, id, sales_price, location) %>% group_by(location) %>% filter(is.na(area_size))

#behria missing
bah_area <- mis_loc %>% select(id, location, sales_price, area_size) %>% group_by(location) %>% filter(location == "Bahria Town Karachi, Karachi, Sindh")
 range(bah_area$sales_price) #0.26 - 15.5
## [1]  0.26 15.50
 summary(bah_area)
##       id              location          sales_price      area_size        
##  Length:762         Length:762         Min.   : 0.260   Length:762        
##  Class :character   Class :character   1st Qu.: 1.035   Class :character  
##  Mode  :character   Mode  :character   Median : 1.750   Mode  :character  
##                                        Mean   : 1.861                     
##                                        3rd Qu.: 2.170                     
##                                        Max.   :15.500
 #dha missing
dha_area <- mis_loc %>% select(id, location, sales_price) %>% group_by(location) %>% filter(location == "DHA Defence, Karachi, Sindh")
range(dha_area$sales_price) #0.52 - 80.0
## [1]  0.52 80.00
 #clifton missing
clift_area <- mis_loc %>% select(id, location, sales_price) %>% group_by(location) %>% filter(location == "Clifton, Karachi, Sindh")
range(clift_area$sales_price) #1.5-36.5
## [1]  1.5 36.5

To impute data as per location area size and sales price

bah_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "Bahria Town Karachi, Karachi, Sindh" & !is.na(area_size)) 
range(bah_area_price$area_size) #33-8300
## [1] "1,000" "97"
range(bah_area_price$sales_price) #0.25-36.16
## [1]  0.25 22.00
dha_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "DHA Defence, Karachi, Sindh" & !is.na(area_size)) 
range(dha_area_price$area_size) #33-3600
## [1] "1,000" "993"
range(dha_area_price$sales_price) #0.38-99.00
## [1]  0.38 99.00
clif_area_price <- kpp %>% select(id, location, area_size, sales_price) %>% filter(location == "Clifton, Karachi, Sindh" & !is.na(area_size)) 
range(clif_area_price$area_size) #67-2400
## [1] "1,000" "94"
range(clif_area_price$sales_price) #0.37-85.0
## [1]  0.37 85.00
summary(bah_area_price)
##       id              location          area_size          sales_price    
##  Length:2709        Length:2709        Length:2709        Min.   : 0.250  
##  Class :character   Class :character   Class :character   1st Qu.: 0.980  
##  Mode  :character   Mode  :character   Mode  :character   Median : 1.600  
##                                                           Mean   : 1.802  
##                                                           3rd Qu.: 2.140  
##                                                           Max.   :22.000
colnames(kpp)
##  [1] "id"                "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "area_size"         "area_unit"        
## [13] "sales_price"       "price_unit"        "logsaleprice"
head(kpp)
## # A tibble: 6 × 15
##   id    title     date_added type  bedrooms bathrooms location complete_location
##   <chr> <chr>     <chr>      <chr>    <dbl>     <dbl> <chr>    <chr>            
## 1 1     600 Yard… 14 hours … House        5         6 DHA Def… DHA Phase 6, DHA…
## 2 2     3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Al…
## 3 3     4 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Sports Ci…
## 4 4     3 BEDS L… 5 hours a… House        0         0 Bahria … Bahria Town - Pr…
## 5 5     Buying A… 6 hours a… Flat         3         3 Clifton… Clifton - Block …
## 6 6     500 Yard… 6 hours a… House        7         6 DHA Def… DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## #   area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>

Check size as per location, type, bedrooms and bathrooms

table(kpp$area_unit)
## 
##         Sq. Yd. 
##    1091    7322
#location size without missing values
ddply(kpp, .(location), summarize, size = mean(!is.na(area_size)))
##                              location      size
## 1 Bahria Town Karachi, Karachi, Sindh 0.7804667
## 2             Clifton, Karachi, Sindh 0.9589258
## 3         DHA Defence, Karachi, Sindh 0.9248672
## 4                                <NA> 0.9831461
ddply(kpp, .(type), summarize, size = mean(!is.na(area_size)))
##            type      size
## 1    Farm House 0.5000000
## 2          Flat 0.9124204
## 3         House 0.8360584
## 4 Lower Portion 1.0000000
## 5     Penthouse 0.8541667
## 6 Upper Portion 1.0000000
ddply(kpp, .(bedrooms), summarize, size = mean(!is.na(area_size)))
##    bedrooms      size
## 1         0 0.0000000
## 2         1 0.9691781
## 3         2 0.9786834
## 4         3 0.9897480
## 5         4 0.9907049
## 6         5 0.9924338
## 7         6 0.9859611
## 8         7 0.9824561
## 9         8 0.8780488
## 10        9 0.9090909
## 11       10 1.0000000
## 12       11 0.5000000
ddply(kpp, .(bathrooms), summarize, size = mean(!is.na(area_size)))
##    bathrooms      size
## 1          0 0.0000000
## 2          1 0.9737991
## 3          2 0.9846797
## 4          3 0.9995015
## 5          4 0.9977011
## 6          5 0.9988739
## 7          6 0.9990934
## 8          7 1.0000000
## 9          8 1.0000000
## 10         9 1.0000000
## 11        10 1.0000000
#location size with missing values
ddply(kpp, .(location), summarize, size = mean(is.na(area_size)))
##                              location       size
## 1 Bahria Town Karachi, Karachi, Sindh 0.21953328
## 2             Clifton, Karachi, Sindh 0.04107425
## 3         DHA Defence, Karachi, Sindh 0.07513281
## 4                                <NA> 0.01685393
ddply(kpp, .(type), summarize, size = mean(is.na(area_size)))
##            type       size
## 1    Farm House 0.50000000
## 2          Flat 0.08757962
## 3         House 0.16394157
## 4 Lower Portion 0.00000000
## 5     Penthouse 0.14583333
## 6 Upper Portion 0.00000000
ddply(kpp, .(bedrooms), summarize, size = mean(is.na(area_size)))
##    bedrooms        size
## 1         0 1.000000000
## 2         1 0.030821918
## 3         2 0.021316614
## 4         3 0.010252029
## 5         4 0.009295120
## 6         5 0.007566204
## 7         6 0.014038877
## 8         7 0.017543860
## 9         8 0.121951220
## 10        9 0.090909091
## 11       10 0.000000000
## 12       11 0.500000000
ddply(kpp, .(bathrooms), summarize, size = mean(is.na(area_size)))
##    bathrooms         size
## 1          0 1.0000000000
## 2          1 0.0262008734
## 3          2 0.0153203343
## 4          3 0.0004985045
## 5          4 0.0022988506
## 6          5 0.0011261261
## 7          6 0.0009066183
## 8          7 0.0000000000
## 9          8 0.0000000000
## 10         9 0.0000000000
## 11        10 0.0000000000

Check prices as per location, type, bedrooms and bathrooms

ddply(kpp, .(location), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
##                              location max_price min_price
## 1 Bahria Town Karachi, Karachi, Sindh     22.00      0.25
## 2             Clifton, Karachi, Sindh     85.00      0.37
## 3         DHA Defence, Karachi, Sindh     99.00      0.38
## 4                                <NA>     36.16      0.40
ddply(kpp, .(type), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
##            type max_price min_price
## 1    Farm House     17.00      5.00
## 2          Flat     28.00      0.25
## 3         House     99.00      0.50
## 4 Lower Portion      3.75      2.95
## 5     Penthouse     37.00      1.40
## 6 Upper Portion      4.00      2.95
ddply(kpp, .(bedrooms), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
##    bedrooms max_price min_price
## 1         0     80.00    0.2682
## 2         1      7.00    0.2500
## 3         2     15.00    0.2682
## 4         3     25.00    0.3574
## 5         4     36.16    0.4500
## 6         5     48.00    1.1600
## 7         6     85.00    1.5000
## 8         7     70.00    3.3000
## 9         8     75.00    4.9500
## 10        9     45.00   11.7500
## 11       10     99.00   15.0000
## 12       11     80.00   14.0000
ddply(kpp, .(bathrooms), summarize, max_price = max((sales_price)), min_price = min((sales_price)))
##    bathrooms max_price min_price
## 1          0     80.00    0.2682
## 2          1      6.20    0.2500
## 3          2     25.00    0.2682
## 4          3     15.00    0.3574
## 5          4     36.16    0.4500
## 6          5     38.00    1.0500
## 7          6     99.00    1.5000
## 8          7     70.00    4.1500
## 9          8     70.00    3.3000
## 10         9     27.50   11.7500
## 11        10     95.00   13.0000

#Bedrooms and Bathrooms

kpp %>% select(id, bedrooms,area_size, location, sales_price) %>% group_by(location, area_size) %>% filter(bedrooms == 0) %>% summarise(min = min(sales_price), max = max(sales_price), avg = mean(sales_price))
##      min max      avg
## 1 0.2682  80 4.191381
# historgram of housing price by BldgType 
ggplot(kpp, aes(sales_price)) +
 geom_histogram(aes(fill = type), binwidth = 5) +
 coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
 ylab("Count") +
 xlab("Housing Price") + 
 theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))

#Check price per type of house

ggplot(kpp, aes(x = sales_price,fill = as.factor(type))) +
  geom_histogram(position = "stack", binwidth = 10) +
  ggtitle("Figure 6 Histogram of SalePrice") +
  ylab("Count") +
  xlab("Housing Price") + 
  scale_fill_discrete(name="OverallQual")+
  theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))

#check for bedrooms and bathrooms

ggplot(kpp, aes(x = sales_price,fill = as.factor(location))) +
  geom_histogram(position = "stack", binwidth = 10) +
  ggtitle("Figure 6 Histogram of SalePrice") +
  ylab("Count") +
  xlab("Housing Price") + 
  scale_fill_discrete(name="OverallQual")+
  theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",size=0.5, linetype="solid", colour ="black"))

#What kind of house will be sold for higher price? #correlation exploration

variable selction for correlaiton exploration sales_price, area_size, bedrooms, bathrooms,

First create correlation heatmapt to plot correlation coefficients

But before plotting heatmap, one more step is needed– feature engineering. In section I, we learn that some variables are factors, in order to buid heatmap we need to convert them into numerics.

I will do feature engineering of type, location

#first convert location and type to factor 
kpp$location <- as.factor(kpp$location)
kpp$type <- as.factor(kpp$type)

#convert factor to numeric
kpp$location <- as.numeric(factor(kpp$location, levels = c("Bahria Town Karachi, Karachi, Sindh", "Clifton, Karachi, Sindh", "DHA Defence, Karachi, Sindh"), labels = c(1,2,3), ordered = TRUE))

kpp$type <- as.numeric(factor(kpp$type, levels =  c("House","Flat","Penthouse","Farm House","Upper Portion","Lower Portion"), labels = c(1,2,3,4,5,6), ordered = TRUE))

colnames(kpp)
##  [1] "id"                "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "area_size"         "area_unit"        
## [13] "sales_price"       "price_unit"        "logsaleprice"
head(kpp)
## # A tibble: 6 × 15
##   id    title     date_added  type bedrooms bathrooms location complete_location
##   <chr> <chr>     <chr>      <dbl>    <dbl>     <dbl>    <dbl> <chr>            
## 1 1     600 Yard… 14 hours …     1        5         6        3 DHA Phase 6, DHA…
## 2 2     3 BEDS L… 5 hours a…     1        0         0        1 Bahria Town - Al…
## 3 3     4 BEDS L… 5 hours a…     1        0         0        1 Bahria Sports Ci…
## 4 4     3 BEDS L… 5 hours a…     1        0         0        1 Bahria Town - Pr…
## 5 5     Buying A… 6 hours a…     2        3         3        2 Clifton - Block …
## 6 6     500 Yard… 6 hours a…     1        7         6        3 DHA Phase 7, DHA…
## # ℹ 7 more variables: description <chr>, keywords <chr>, area_size <chr>,
## #   area_unit <chr>, sales_price <dbl>, price_unit <chr>, logsaleprice <dbl>

#Select variables for model building and heatmap

model_var <- c("type", "location", "sales_price", "area_size", "bedrooms", "bathrooms")

heat <- kpp[, model_var]

Next, using ggplot plot correlation heatmap.

#plot correltion for sales price

# options(repr.plot.height = 6, repr.plot.width = 8)
# 
# library(ggplot2)
# library(reshape2)
# 
# qplot(x = Var1, y = Var2, data = melt(cor(heat, use = "p")), fill = value, geom = "tile") +
#   scale_fill_gradient2(low = "green", high = "red", mid = "white",
#     midpoint = 0, limit = c(-1,1), space = "Lab", name = "Correlation")+
#   theme_minimal() + 
#   theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1)) +
#   coord_fixed() +
#   ggtitle("Figure 7 Correlation Heatmap") +
#   theme(plot.title = element_text(hjust = 0.4))

In this graph, Red indicates perfect positive correlation and Green indicates perfect negative correlation.location with sales price, location with area size and bedrooms and bathrooms shows positive correlation. The type of home has negative correlation with sales rice and area size.

#Correlation between SalePrice and some numeric variables

In this section, I am going to analyze the correlation between SalePrice and numeric variables, including area size, bedrooms and bathrooms. Different from categorical variables, here I will use scatter plot and trend line to indicate the relationship.

#scatter plot of area size
options(repr.plot.height = 6, repr.plot.width = 9)

p1 <- ggplot(kpp, aes(area_size, y = sales_price))+geom_point(shape = 1)+
  geom_smooth(method = lm, color = "red", se = FALSE) +
  ggtitle("Figure 8 Scatter plot of Sales price and area size") +
  theme(plot.title = element_text(hjust = 0.4))


p2 <- ggplot(kpp, aes(bedrooms, y = sales_price))+geom_point(shape = 1)+
  geom_smooth(method = lm, color = "red", se = FALSE) +
  ggtitle("Figure 8 Scatter plot of Sales price and No of Bedrooms") +
  theme(plot.title = element_text(hjust = 0.4))

p3 <- ggplot(kpp, aes(bathrooms, y = sales_price))+geom_point(shape = 1)+
  geom_smooth(method = lm, color = "red", se = FALSE) +
  ggtitle("Figure 8 Scatter plot of Sales price and No of Bathrooms") +
  theme(plot.title = element_text(hjust = 0.4))
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(p1, p2, p3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#MODEL fITTING After Descriptive Analysis we are moving into Predictive Analysis section. There are three models here:

Linear Regresion Model Classification & Regression Trees (CART) Model Random Forest Model

##Linear Regression Model Step 1: choose variables and transfer SalePrice into log term

#prediction of lm
#build model dataset for linear regression

model_lin <- kpp[, model_var]
model_lin$lsales_price <- log(model_lin$sales_price)

Step 2: divide datasets into two parts – training and validation, to prepare for prediction later

# #partition data
# set.seed(10000)
# train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1] * 0.8)
# 
# model_lin_train <- model_lin[train.index, ]
# model_lin_test  <- model_lin[-train.index, ]

Step 3: Run Regression

#use lm() to run linear regression of sales price on all variables in model dataset
# 
# linreg <- lm(lsales_price ~ . -sales_price, data = model_lin_train)
# summary(linreg)

Step 4: forecast and check for model accuracy

# library(forecast)
# 
# #use predict() to make prediction on a new set
# pred1     <-   predict(linreg, model_lin_test, type = "response")
# residuals <-    model_lin_test$lsales_price  - pred1
# 
# linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_test$lsales_price, "Residuals" = residuals)
# 
# accuracy(pred1, model_lin_test$lsales_price)

##CART

#classification tree
# library(rpart)
# library(rpart.plot)
# 
# class.tree <- rpart(lsales_price ~. -sales_price,
#                     data = model_lin_train, control = rpart.control(cp = 0.01))
# 
# plotcp(class.tree)
# printcp(class.tree)
#rpart.plot(class.tree, box.palette = "GnBu", branch.lty = 3, shadow.col = "gray", nn = TRUE)

#Random Forest

#library(randomForest)
#RF <- randomForest(lsales_price ~.-sales_price, data = model_lin_train, 
#                   importance = TRUE, ntree = 500, nodesize = 7, na.action = na.roughfix)
#
#variable importance
#options(repr.plot.width = 9, repr.plot.height = 6)
#varImpPlot(RF, type = 1)
#prediction
#rf.pred <- predict(RF, newdata = model_lin_test)
#accuracy(rf.pred, model_lin_test$lsales_price)
#plot(rf.pred, model_lin_test$lsales_price, main = "Figure 9 Predicted vs. Actual log Sale Price")
#abline(0,1)