#load the libraries
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(Rmisc)
#load dataset
k <- read.csv("karachi_property.csv", stringsAsFactors = F)
#data size and structure
dim(k)
## [1] 8414 13
head(k)
## X_id
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## title
## 1 600 Yard Bungalow For Sale In DHA Phase 6
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3 4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4 3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5 Buying A Flat In Clifton - Block 9?
## 6 500 Yards Bungalow Available For Sale In Phase 7 Chance Deal Three Unit
## price date.added type bedrooms bathrooms area
## 1 11.5 Crore 14 hours ago House 5 6 600 Sq. Yd.
## 2 1.45 Crore 5 hours ago House 0 0
## 3 2.12 Crore 5 hours ago House 0 0
## 4 1.5 Crore 5 hours ago House 0 0
## 5 4 Crore 6 hours ago Flat 3 3 200 Sq. Yd.
## 6 9 Crore 6 hours ago House 7 6 500 Sq. Yd.
## location
## 1 DHA Defence, Karachi, Sindh
## 2 Bahria Town Karachi, Karachi, Sindh
## 3 Bahria Town Karachi, Karachi, Sindh
## 4 Bahria Town Karachi, Karachi, Sindh
## 5 Clifton, Karachi, Sindh
## 6 DHA Defence, Karachi, Sindh
## complete.location
## 1 DHA Phase 6, DHA Defence, Karachi, Sindh
## 2 Bahria Town - Ali Block, Bahria Town - Precinct 12, Bahria Town Karachi, Karachi, Sindh
## 3 Bahria Sports City, Bahria Town Karachi, Karachi, Sindh
## 4 Bahria Town - Precinct 31, Bahria Town Karachi, Karachi, Sindh
## 5 Clifton - Block 9, Clifton, Karachi, Sindh
## 6 DHA Phase 7, DHA Defence, Karachi, Sindh
## description
## 1 Chance Deal 600 Yard Bungalow For Sale
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3 4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4 3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5 Apartment for sale
## 6 500 Square Yards Bungalow Available For Sale In Phase 7 Dha 7 Bedrooms 3 Unit Bungalow
## keywords
## 1 Built in year: 1,Parking Spaces: 5,Flooring,Other Main Features,Bedrooms: 5,Bathrooms: 6,Servant Quarters: 1,Drawing Room,Dining Room,Kitchens: 2,Study Room,Prayer Room,Powder Room ,Store Rooms: 1,Lounge or Sitting Room
## 2 Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 3 Bedrooms: 4,Bathrooms: 4,Kitchens: 2
## 4 Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 5 Flooring,Electricity Backup,Broadband Internet Access,Satellite or Cable TV Ready,Business Center or Media Room in Building,Pet Policy
## 6 Flooring,Electricity Backup,Bedrooms: 7,Bathrooms: 7,Servant Quarters: 1,Kitchens: 3,Store Rooms: 1,Nearby Schools,Nearby Hospitals,Nearby Shopping Malls,Nearby Restaurants,Distance From Airport (kms),Nearby Public Transport Service
## url
## 1 https://www.zameen.com/Property/d_h_a_dha_phase_6_600_yard_bungalow_for_sale_in_dha_phase_6-43187753-1483-1.html
## 2 https://www.zameen.com/Property/bahria_town_precinct_12_bahria_town_ali_block_3_beds_luxury_125_sq_yards_villa_for_sale_located_in_precinct_12_ali_block__bahria_town_karachi-43194528-10023-1.html
## 3 https://www.zameen.com/Property/bahria_town_karachi_bahria_sports_city_4_beds_luxury_sports_city_villa_for_rent_bahria_town_karachi-43194532-10061-1.html
## 4 https://www.zameen.com/Property/bahria_town_karachi_bahria_town_precinct_31_3_beds_luxury_235_sq_yards_villa_for_sale_located_in_precinct_31__bahria_town_karachi-43194530-10045-1.html
## 5 https://www.zameen.com/Property/clifton_clifton_block_9_buying_a_flat_in_clifton_block_9_-42887085-1671-1.html
## 6 https://www.zameen.com/Property/d_h_a_dha_phase_7_500_yards_bungalow_available_for_sale_in_phase_7_chance_deal_three_unit-41145833-1484-1.html
#to extract numeric component
k$sales_price <- readr::parse_number(k$price)
library(stringr)
k[c('p', 'price_unit')] <- str_split_fixed(k$price, ' ', 2)
#get rid of price and p variables, url
k$price <- NULL
k$p <- NULL
k$url <- NULL
#repeat same for area variabel
k$area_size <- readr::parse_number(k$area)
k[c('a', 'area_unit')] <- str_split_fixed(k$area, ' ', 2)
#get rid of a and area vairables
k$area <- NULL
k$a <- NULL
head(k)
## X_id
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## title
## 1 600 Yard Bungalow For Sale In DHA Phase 6
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3 4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4 3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5 Buying A Flat In Clifton - Block 9?
## 6 500 Yards Bungalow Available For Sale In Phase 7 Chance Deal Three Unit
## date.added type bedrooms bathrooms location
## 1 14 hours ago House 5 6 DHA Defence, Karachi, Sindh
## 2 5 hours ago House 0 0 Bahria Town Karachi, Karachi, Sindh
## 3 5 hours ago House 0 0 Bahria Town Karachi, Karachi, Sindh
## 4 5 hours ago House 0 0 Bahria Town Karachi, Karachi, Sindh
## 5 6 hours ago Flat 3 3 Clifton, Karachi, Sindh
## 6 6 hours ago House 7 6 DHA Defence, Karachi, Sindh
## complete.location
## 1 DHA Phase 6, DHA Defence, Karachi, Sindh
## 2 Bahria Town - Ali Block, Bahria Town - Precinct 12, Bahria Town Karachi, Karachi, Sindh
## 3 Bahria Sports City, Bahria Town Karachi, Karachi, Sindh
## 4 Bahria Town - Precinct 31, Bahria Town Karachi, Karachi, Sindh
## 5 Clifton - Block 9, Clifton, Karachi, Sindh
## 6 DHA Phase 7, DHA Defence, Karachi, Sindh
## description
## 1 Chance Deal 600 Yard Bungalow For Sale
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3 4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4 3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5 Apartment for sale
## 6 500 Square Yards Bungalow Available For Sale In Phase 7 Dha 7 Bedrooms 3 Unit Bungalow
## keywords
## 1 Built in year: 1,Parking Spaces: 5,Flooring,Other Main Features,Bedrooms: 5,Bathrooms: 6,Servant Quarters: 1,Drawing Room,Dining Room,Kitchens: 2,Study Room,Prayer Room,Powder Room ,Store Rooms: 1,Lounge or Sitting Room
## 2 Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 3 Bedrooms: 4,Bathrooms: 4,Kitchens: 2
## 4 Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 5 Flooring,Electricity Backup,Broadband Internet Access,Satellite or Cable TV Ready,Business Center or Media Room in Building,Pet Policy
## 6 Flooring,Electricity Backup,Bedrooms: 7,Bathrooms: 7,Servant Quarters: 1,Kitchens: 3,Store Rooms: 1,Nearby Schools,Nearby Hospitals,Nearby Shopping Malls,Nearby Restaurants,Distance From Airport (kms),Nearby Public Transport Service
## sales_price price_unit area_size area_unit
## 1 11.50 Crore 600 Sq. Yd.
## 2 1.45 Crore NA
## 3 2.12 Crore NA
## 4 1.50 Crore NA
## 5 4.00 Crore 200 Sq. Yd.
## 6 9.00 Crore 500 Sq. Yd.
#make all prices in same unit
k$sales_price[k$price_unit == "Lakh"] <- k$sales_price[k$price_unit == "Lakh"] * 0.01
k$price_unit[k$price_unit == "Lakh"] <- k$price_unit[k$price_unit == "Crore"]
## Warning in k$price_unit[k$price_unit == "Lakh"] <- k$price_unit[k$price_unit ==
## : number of items to replace is not a multiple of replacement length
#neat the location names
k$location[k$location == "Bahria Town Karachi, Karachi, Sindh"] <- "Bahria Town"
k$location[k$location == "Clifton, Karachi, Sindh"] <- "Clifton"
k$location[k$location == "DHA Defence, Karachi, Sindh"] <- "DHA Defence"
table(k$price_unit)
##
## Crore
## 1 8413
unique(k$area_unit)
## [1] "Sq. Yd." ""
colSums(is.na(k))
## X_id title date.added type
## 0 0 0 0
## bedrooms bathrooms location complete.location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1092 0
# 1092 values missing in area size
range(k$sales_price)
## [1] NA NA
table(k$sales_price)
##
## 0.25 0.26 0.2682 0.2938 0.3 0.32 0.35 0.3574 0.36 0.3668 0.37
## 1 1 75 6 2 1 4 13 3 1 1
## 0.38 0.39 0.3943 0.3959 0.4 0.405 0.42 0.425 0.43 0.432 0.435
## 3 1 11 2 8 1 2 2 1 1 1
## 0.45 0.455 0.457 0.46 0.47 0.4712 0.475 0.48 0.4848 0.49 0.494
## 17 1 4 2 12 25 2 3 1 1 1
## 0.4998 0.5 0.51 0.52 0.525 0.53 0.5376 0.5387 0.54 0.545 0.55
## 47 51 4 6 1 1 1 4 2 1 28
## 0.56 0.567 0.57 0.5775 0.58 0.59 0.595 0.6 0.61 0.62 0.6205
## 3 1 4 4 8 24 1 17 1 10 1
## 0.6225 0.624 0.63 0.64 0.65 0.655 0.66 0.67 0.68 0.69 0.6935
## 7 1 2 4 82 6 6 2 12 5 1
## 0.698 0.7 0.715 0.72 0.73 0.74 0.75 0.76 0.765 0.77 0.78
## 1 40 1 12 1 2 66 6 3 23 19
## 0.79 0.795 0.8 0.804 0.8083 0.81 0.8156 0.82 0.83 0.84 0.85
## 14 3 81 1 1 6 2 15 5 4 66
## 0.86 0.8668 0.87 0.8732 0.875 0.8751 0.8788 0.88 0.89 0.9 0.91
## 7 1 6 46 2 1 1 7 13 60 6
## 0.918 0.92 0.925 0.9265 0.9288 0.93 0.94 0.946 0.95 0.96 0.9614
## 6 36 1 1 1 5 2 1 82 10 1
## 0.969 0.97 0.975 0.9792 0.98 0.9861 0.9886 0.99 0.998 1 1.01
## 25 3 1 1 13 42 1 11 11 56 1
## 1.02 1.03 1.04 1.05 1.06 1.07 1.08 1.09 1.1 1.11 1.12
## 6 48 1 75 2 5 50 1 66 5 10
## 1.13 1.14 1.15 1.16 1.17 1.18 1.19 1.2 1.21 1.22 1.25
## 4 1 40 27 10 3 3 52 3 5 50
## 1.26 1.28 1.3 1.32 1.34 1.35 1.36 1.38 1.39 1.4 1.41
## 13 24 55 1 4 36 2 20 3 46 11
## 1.42 1.43 1.44 1.45 1.46 1.47 1.48 1.49 1.5 1.51 1.52
## 11 23 12 54 3 24 32 3 149 2 11
## 1.53 1.54 1.55 1.56 1.57 1.58 1.59 1.6 1.61 1.62 1.63
## 4 5 87 7 3 12 5 77 47 8 4
## 1.64 1.65 1.66 1.67 1.68 1.69 1.7 1.71 1.72 1.73 1.74
## 3 51 1 4 6 8 72 1 9 3 2
## 1.75 1.76 1.78 1.79 1.8 1.82 1.85 1.86 1.88 1.89 1.9
## 149 5 15 2 44 1 48 10 9 12 104
## 1.92 1.94 1.95 1.96 1.97 1.98 1.99 2 2.02 2.03 2.04
## 6 1 77 5 3 65 8 122 3 7 1
## 2.05 2.06 2.08 2.09 2.1 2.12 2.14 2.15 2.16 2.17 2.18
## 41 18 8 2 116 6 4 61 1 7 8
## 2.19 2.2 2.22 2.23 2.24 2.25 2.29 2.3 2.31 2.32 2.33
## 1 53 3 10 2 85 3 49 1 2 2
## 2.34 2.35 2.36 2.37 2.38 2.39 2.4 2.41 2.42 2.43 2.44
## 1 27 1 2 5 1 45 1 6 2 4
## 2.45 2.46 2.47 2.48 2.49 2.5 2.52 2.53 2.55 2.58 2.59
## 38 4 2 4 2 76 2 10 29 1 1
## 2.6 2.62 2.65 2.67 2.68 2.7 2.74 2.75 2.8 2.84 2.85
## 57 1 12 1 6 34 1 24 37 1 7
## 2.86 2.9 2.95 2.99 3 3.1 3.12 3.15 3.18 3.19 3.2
## 1 37 75 4 54 10 1 10 1 1 26
## 3.22 3.23 3.24 3.25 3.3 3.35 3.39 3.4 3.45 3.5 3.55
## 1 1 1 36 16 10 2 18 7 69 7
## 3.57 3.59 3.6 3.65 3.67 3.7 3.74 3.75 3.8 3.85 3.89
## 6 1 25 9 1 20 2 29 31 19 2
## 3.9 3.95 3.96 3.98 3.99 4 4.09 4.1 4.15 4.19 4.2
## 19 9 9 1 1 80 2 27 14 2 15
## 4.25 4.3 4.35 4.36 4.37 4.38 4.4 4.45 4.48 4.5 4.51
## 51 19 8 1 1 1 12 4 1 107 1
## 4.53 4.54 4.55 4.57 4.58 4.6 4.61 4.63 4.65 4.68 4.7
## 1 1 11 1 1 17 1 1 12 1 11
## 4.71 4.75 4.76 4.8 4.84 4.85 4.87 4.89 4.9 4.91 4.92
## 1 29 1 16 1 6 1 1 22 1 1
## 4.95 4.99 5 5.05 5.1 5.13 5.15 5.2 5.25 5.3 5.35
## 12 1 52 1 16 1 5 9 47 9 7
## 5.4 5.45 5.5 5.55 5.6 5.65 5.66 5.69 5.7 5.74 5.75
## 8 1 58 1 12 3 1 1 12 2 22
## 5.76 5.77 5.8 5.85 5.9 5.95 5.98 5.99 6 6.01 6.05
## 1 1 13 5 15 5 2 1 62 1 6
## 6.09 6.1 6.15 6.2 6.22 6.25 6.29 6.3 6.32 6.35 6.38
## 1 9 5 10 1 33 1 15 1 6 1
## 6.4 6.45 6.49 6.5 6.55 6.6 6.65 6.7 6.75 6.76 6.8
## 11 2 5 93 6 14 8 21 36 1 9
## 6.84 6.85 6.9 6.92 6.94 6.95 7 7.1 7.15 7.2 7.25
## 2 6 13 2 1 10 55 5 6 5 23
## 7.26 7.28 7.3 7.35 7.37 7.4 7.42 7.45 7.49 7.5 7.51
## 2 1 12 6 1 13 1 2 1 71 2
## 7.52 7.6 7.65 7.7 7.75 7.8 7.85 7.9 7.95 7.99 8
## 1 9 4 6 28 15 3 11 4 1 43
## 8.03 8.05 8.1 8.14 8.15 8.2 8.25 8.3 8.34 8.35 8.4
## 1 1 5 2 1 2 26 3 1 5 1
## 8.45 8.48 8.5 8.6 8.65 8.7 8.74 8.75 8.8 8.85 8.9
## 3 1 55 4 1 3 1 18 3 1 9
## 8.97 8.98 8.99 9 9.1 9.15 9.2 9.25 9.29 9.3 9.31
## 2 1 2 60 1 1 4 29 6 3 1
## 9.32 9.35 9.4 9.45 9.49 9.5 9.6 9.7 9.75 9.8 9.85
## 1 2 6 3 1 47 5 2 19 8 2
## 9.9 9.99 10 10.05 10.1 10.2 10.24 10.25 10.3 10.4 10.49
## 10 1 53 1 1 2 2 14 2 2 1
## 10.5 10.6 10.74 10.75 10.8 10.83 10.85 10.9 10.95 10.99 11
## 58 2 1 14 1 1 1 2 1 9 77
## 11.01 11.05 11.15 11.25 11.3 11.4 11.49 11.5 11.55 11.75 11.8
## 1 2 1 16 5 2 1 56 1 14 2
## 11.9 11.95 12 12.15 12.25 12.39 12.4 12.45 12.5 12.65 12.7
## 2 4 70 1 5 2 3 1 53 1 2
## 12.73 12.75 12.8 12.85 12.9 12.95 13 13.03 13.15 13.25 13.4
## 1 9 1 2 2 1 44 1 1 8 1
## 13.5 13.6 13.65 13.75 13.8 13.85 13.9 13.99 14 14.2 14.25
## 40 1 2 9 1 2 2 3 46 2 4
## 14.3 14.4 14.5 14.7 14.9 15 15.04 15.2 15.25 15.4 15.45
## 1 2 23 1 2 36 2 2 2 2 1
## 15.5 15.6 15.75 15.8 15.89 15.9 15.95 16 16.2 16.25 16.4
## 24 1 1 7 1 1 2 63 1 4 1
## 16.5 16.64 16.7 16.75 16.99 17 17.25 17.3 17.5 17.65 17.75
## 61 2 1 5 1 55 4 1 28 16 11
## 17.9 17.95 18 18.25 18.5 18.7 18.75 18.9 18.95 19 19.25
## 2 4 42 5 15 4 3 1 1 15 2
## 19.4 19.5 19.75 20 20.5 21 21.4 21.5 21.7 21.9 21.99
## 1 10 3 34 4 18 1 2 1 2 1
## 22 22.5 22.6 22.9 23 23.1 23.4 23.5 23.55 23.75 24
## 39 12 2 4 21 7 2 5 1 1 11
## 24.2 24.5 24.75 25 25.5 26 26.5 26.75 26.99 27 27.5
## 2 2 1 16 2 11 2 1 1 5 3
## 27.75 27.98 28 28.05 28.5 29 29.4 29.8 30 31 32
## 1 1 42 1 1 3 1 3 8 4 9
## 33 34 35 36 36.16 36.5 37 38 40 42 43
## 5 4 12 3 1 1 2 9 5 3 1
## 45 47 47.5 48 49 50 70 75 80 85 95
## 4 1 1 5 1 1 4 1 3 1 1
## 99
## 2
colSums(is.na(k))
## X_id title date.added type
## 0 0 0 0
## bedrooms bathrooms location complete.location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1092 0
#to fill the missing area size dig into it
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
k <- clean_names(k)
colSums(is.na(k))
## x_id title date_added type
## 0 0 0 0
## bedrooms bathrooms location complete_location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1092 0
#area size and price location
ddply(k, .(location), summarize, size = mean(!is.na(area_size)))
## location size
## 1 0.9809264
## 2 Bahria Town 0.7804667
## 3 Clifton 0.9589258
## 4 DHA Defence 0.9246766
ddply(k, .(location), summarize, price = mean((sales_price)))
## location price
## 1 NA
## 2 Bahria Town 1.814954
## 3 Clifton 4.865529
## 4 DHA Defence 9.235381
#area size and price by house type
ddply(k, .(type), summarize, size = mean(!is.na(area_size)))
## type size
## 1 0.0000000
## 2 Farm House 0.5000000
## 3 Flat 0.9124204
## 4 House 0.8360584
## 5 Lower Portion 1.0000000
## 6 Penthouse 0.8541667
## 7 Upper Portion 1.0000000
ddply(k, .(type), summarize, price = mean((sales_price)))
## type price
## 1 NA
## 2 Farm House 12.500000
## 3 Flat 2.607330
## 4 House 7.886607
## 5 Lower Portion 3.400000
## 6 Penthouse 13.938958
## 7 Upper Portion 3.483333
#for imputation
#X['loading'].fillna(value=127.74233678955453, inplace=True)
colnames(k)
## [1] "x_id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "sales_price" "price_unit"
## [13] "area_size" "area_unit"
colSums(is.na(k))
## x_id title date_added type
## 0 0 0 0
## bedrooms bathrooms location complete_location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1092 0
#separate data with missing area size
have_area_size <- k %>% filter(!is.na(area_size))
table1 <- have_area_size %>% select(area_size, bedrooms, location, type) %>%
group_by(type,location) %>%
summarise(mean = mean(area_size))
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
t <- as.data.frame(table1)
table(k$type)
##
## Farm House Flat House Lower Portion
## 1 4 3768 4587 3
## Penthouse Upper Portion
## 48 3
no_area_size <- k %>% filter(is.na(area_size))
colSums(is.na(no_area_size))
## x_id title date_added type
## 0 0 0 0
## bedrooms bathrooms location complete_location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1092 0
dim(no_area_size)
## [1] 1092 14
#extract ids by locaiton in no area size dataset
b <- no_area_size %>% select(x_id,location, type) %>%
group_by(type, location,x_id) %>% filter(location == "Bahria Town")
c <- no_area_size %>% select(x_id,location, type) %>%
group_by(type, location,x_id) %>% filter(location == "Clifton")
d <- no_area_size %>% select(x_id,location, type) %>%
group_by(type, location,x_id) %>% filter(location == "DHA Defence")
# This gives laverage area size for each type of house as per location
# The house types are:
# 1- empty,
# farmhouse (4), flat(3768), house(4587), lower portion(3), penthouse(48), upper portion(3)
Now impute values
no_area_size %>% select(x_id,location, type, area_size) %>%
group_by(type, location,x_id, area_size) %>% filter(is.na(area_size))
## # A tibble: 1,092 × 4
## # Groups: type, location, x_id, area_size [1,092]
## x_id location type area_size
## <int> <chr> <chr> <dbl>
## 1 2 Bahria Town House NA
## 2 3 Bahria Town House NA
## 3 4 Bahria Town House NA
## 4 7 Bahria Town House NA
## 5 8 Bahria Town House NA
## 6 11 DHA Defence House NA
## 7 14 Bahria Town Flat NA
## 8 38 Bahria Town House NA
## 9 50 DHA Defence Flat NA
## 10 58 Bahria Town House NA
## # ℹ 1,082 more rows
no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "Bahria Town" ] <- 123.1602
no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "Bahria Town" ] <- 256.9191
no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "DHA Defence" ] <- 550.3603
no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "DHA Defence" ] <- 194.2698
no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "Clifton" ] <- 216.8858
no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "Clifton" ] <- 418.7263
no_area_size$area_size[no_area_size$type == "Farm House" & no_area_size$location == "Bahria Town" ] <- 4400.00
no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "Clifton" ] <- 281.25
no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "Bahria Town" ] <- 607.00
no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "DHA Defence" ] <- 123.1602
sum(is.na(no_area_size$location))
## [1] 0
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 115)
## x_id location type area_size complete_location
## 1 115 NA
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 170)
## x_id location type area_size
## 1 170 Flat NA
## complete_location
## 1 Bahria Apartments, Bahria Town Karachi, Karachi, Sindh
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 1880)
## x_id location type area_size
## 1 1880 House NA
## complete_location
## 1 Bahria Town - Precinct 10-B, Bahria Town Karachi, Karachi, Sindh
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 2525)
## x_id location type area_size
## 1 2525 Flat NA
## complete_location
## 1 Emaar The Views, DHA Phase 8, DHA Defence, Karachi, Sindh
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 5698)
## x_id location type area_size
## 1 5698 Flat NA
## complete_location
## 1 Bahria Town - Precinct 1, Bahria Town Karachi, Karachi, Sindh
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 7478)
## x_id location type area_size complete_location
## 1 7478 Flat NA Bahria Town Karachi, Karachi, Sindh
k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 7838)
## x_id location type area_size
## 1 7838 Flat NA
## complete_location
## 1 Bahria Town - Jinnah Avenue, Bahria Town Karachi, Karachi, Sindh
For missing ids data check one by one to retrieve information 115 has no information given 170 is flat in Bahria Town 1880 is house in Bahria town 2525 is flat in DHA 5698 is flat in Bahria Town 7478 is flat in Bahria Town 7838 is flat in Bahria Town
Now insert this information
no_area_size %>% select(x_id,location, type, area_size, keywords, description) %>%
group_by(type, location,x_id, area_size) %>% filter(is.na(area_size))
## # A tibble: 7 × 6
## # Groups: type, location, x_id, area_size [7]
## x_id location type area_size keywords description
## <int> <chr> <chr> <dbl> <chr> <chr>
## 1 115 "" "" NA "" ""
## 2 170 "" "Flat" NA "Parking Spaces,Double Glazed Wi… "Fully fur…
## 3 1880 "" "House" NA "Built in year: 2025,Parking Spa… "AQ Execut…
## 4 2525 "" "Flat" NA "Parking Spaces: 2,Flooring,Floo… "Emaar The…
## 5 5698 "" "Flat" NA "Built in year: 2025,Parking Spa… "JASMINE M…
## 6 7478 "" "Flat" NA "" "Bahria To…
## 7 7838 "" "Flat" NA "" "Grab This…
no_area_size %>% select(x_id,location, type, area_size) %>%
group_by(type, location,x_id, area_size) %>% filter(x_id == 5698)
## # A tibble: 1 × 4
## # Groups: type, location, x_id, area_size [1]
## x_id location type area_size
## <int> <chr> <chr> <dbl>
## 1 5698 "" Flat NA
no_area_size$area_size[no_area_size$x_id == 170 & no_area_size$type == "Flat" ] <- 123.1602
no_area_size$location[no_area_size$x_id == 170] <- "Bahria Town"
no_area_size$area_size[no_area_size$x_id == 1880 & no_area_size$type == "House" ] <- 256.9191
no_area_size$location[no_area_size$x_id == 1880] <- "Bahria Town"
no_area_size$area_size[no_area_size$x_id == 2525 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 2525] <- "DHA Defence"
no_area_size$area_size[no_area_size$x_id == 5698 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 5698] <- "Bahria Town"
no_area_size$area_size[no_area_size$x_id == 7478 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 7478] <- "Bahria Town"
no_area_size$area_size[no_area_size$x_id == 7838 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 7838] <- "Bahria Town"
#now remove id 115 with no other information provided
#no_area_size$x_id[no_area_size$x_id == 115] <- NULL
colSums(is.na(no_area_size))
## x_id title date_added type
## 0 0 0 0
## bedrooms bathrooms location complete_location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 1 0
## area_size area_unit
## 1 0
We have two data sets k with missing area size values those are handled separately
ggplot(data = k[!is.na(k$sales_price), ], aes(x = sales_price)) +
geom_histogram(fill = "blue", binwidth = 10)
summary(k$sales_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.250 1.430 2.500 5.556 7.280 99.000 1
#check variables with high correlation with response variable sales price
numeric_var <- which(sapply(k, is.numeric)) #index numeric vectors
numeric_var_names <- names(numeric_var) #saving names vector for later use
cat("there are", length(numeric_var), "numeric variables")
## there are 5 numeric variables
k_numvar <- k[, numeric_var]
cor_numvar <- cor(k_numvar, use = "pairwise.complete.obs") #correlation of all numeric variables
#sort on decreasing correlation with sales price
cor_sorted <- as.matrix(sort(cor_numvar[,'sales_price'], decreasing = TRUE))
#select only high correlations
corHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.5)))
cor_numVar <- cor_numvar[corHigh, corHigh]
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")
Now visualize the relation between sales price area area size, bedrooms and bathrooms
#area_size
ggplot(data = k[!is.na(k$sales_price), ], aes (x = factor(bedrooms), y = sales_price)) +
geom_boxplot(col = "blue") + labs(x = "Bedrooms Number") +
scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
labs(
title = "The sales price with Number of bedrooms",
subtitle = "The number of bedrooms effecting cost",
caption = "Karachi property dataset",
x = "Number of Bedrooms", y="Sales price"
)
The positive correlation is quite evident with increasing number of
bedrooms check for other two numeric variables
#Bathrooms
ggplot(data = k[!is.na(k$sales_price), ], aes (x = factor(bathrooms), y = sales_price)) +
geom_boxplot(col = "green") + labs(x = "Bedrooms Number") +
scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
labs(
title = "The sales price with Number of bathrooms",
subtitle = "The number of bathrooms effecting cost",
caption = "Karachi property dataset",
x = "Number of Bathrooms", y="Sales price"
)
This also shows positve coorelation of increasing number of bathrooms
with sales price
#area size
ggplot(data = k[!is.na(k$sales_price), ], aes (x = area_size, y = sales_price)) +
geom_point(col = "brown") +
geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
labs(
title = "The sales price with Number of Area",
subtitle = "The number of bathrooms effecting cost",
caption = "Karachi property dataset",
x = "Area size", y="Sales price"
) +
geom_text_repel(aes(label = ifelse(k$area_size[!is.na(k$sales_price)]>2500, rownames(k), '')))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1091 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1091 rows containing missing values (`geom_point()`).
## Warning: Removed 1091 rows containing missing values (`geom_text_repel()`).
#see variables wiht missing values
na_col <- which(colSums(is.na(k)) > 0)
sort(colSums(sapply(k[na_col], is.na)), decreasing = TRUE)
## area_size sales_price
## 1092 1
cat("there are ", length(na_col), "columns with missing values")
## there are 2 columns with missing values
fix 1092 NA in area size variables
Besides making sure that the NAs are taken care off, I have also converted character variables into ordinal integers if there is clear ordinality, or into factors if levels are categories without ordinality. I will convert these factors into numeric later on by using one-hot encoding (using the model.matrix function).
Label encoding/factorizing the remaining character variables
charcol <- names(k[,sapply(k, is.character)])
charcol
## [1] "title" "date_added" "type"
## [4] "location" "complete_location" "description"
## [7] "keywords" "price_unit" "area_unit"
cat("There are", length(charcol), "remaining columns with character values")
## There are 9 remaining columns with character values
#for no ordinality so convert into factors
k$location <- as.factor(k$location)
table(k$location)
##
## Bahria Town Clifton DHA Defence
## 367 3471 633 3943
sum(table(k$location))
## [1] 8414
k$type <- as.factor(k$type)
table(k$type)
##
## Farm House Flat House Lower Portion
## 1 4 3768 4587 3
## Penthouse Upper Portion
## 48 3
sum(table(k$type))
## [1] 8414
# k$keywords <- as.factor(k$keywords)
# table(k$keywords)
# sum(table(k$keywords))
names(k)
## [1] "x_id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "sales_price" "price_unit"
## [13] "area_size" "area_unit"
#Finding variable importance with a quick Random Forest Although the correlations are giving a good overview of the most important numeric variables and multicolinerity among those variables, I wanted to get an overview of the most important variables including the categorical variables before moving on to visualization.
I tried to get the relative importance of variables with a quick linear regression model with the calc.relimp function of package , and also tried the boruta function of package boruta which separates the variables into groups that are important or not. However, these method took a long time. As I only want to get an indication of the variable importance, I eventually decided to keep it simple and just use a quick and dirty Random Forest model with only 100 trees. This also does the job for me, and does not take very long as I can specify a (relatively) small number of trees.
k2 <- no_area_size
#to remove rows with na in it
k2 <- no_area_size
k2 <- na.omit(k2)
k2 %>% select(x_id, sales_price) %>% group_by(x_id, sales_price) %>% filter(is.na(sales_price))
## # A tibble: 0 × 2
## # Groups: x_id, sales_price [0]
## # ℹ 2 variables: x_id <int>, sales_price <dbl>
colSums(is.na(k2))
## x_id title date_added type
## 0 0 0 0
## bedrooms bathrooms location complete_location
## 0 0 0 0
## description keywords sales_price price_unit
## 0 0 0 0
## area_size area_unit
## 0 0
dim(k2)
## [1] 1091 14
set.seed(2024)
quick_RF <- randomForest(x = k2[1:1091,-11], y = k2$sales_price[1:1091], ntree = 10, importance = TRUE)
imp_RF <- importance(quick_RF)
imp_DF <- data.frame(Variables = row.names(imp_RF), MSE = imp_RF[,1])
imp_DF <- imp_DF[order(imp_DF$MSE, decreasing = TRUE), ]
ggplot(imp_DF[1:20,], aes(x = reorder(Variables, MSE), y = MSE, fill = MSE)) +
geom_bar(stat = "identity")+
labs(x = "Variables", y = "% increase MSE if variable is randomly permuted") + coord_flip() +
theme(legend.position = "none")
## Warning: Removed 7 rows containing missing values (`position_stack()`).
colnames(k2)
## [1] "x_id" "title" "date_added"
## [4] "type" "bedrooms" "bathrooms"
## [7] "location" "complete_location" "description"
## [10] "keywords" "sales_price" "price_unit"
## [13] "area_size" "area_unit"
as <- ggplot(data = k2, aes(area_size))+geom_density()
bed <- ggplot(data = k2, aes(as.factor(bedrooms)))+geom_histogram(stat= "count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
bath <- ggplot(data = k2, aes(as.factor(bathrooms)))+geom_histogram(stat= "Count")
## Warning in geom_histogram(stat = "Count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
loc <- ggplot(data = k2, aes(as.factor(location)))+geom_histogram(stat= "Count")
## Warning in geom_histogram(stat = "Count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
layout <- matrix (c(1,2,3,4), byrow = TRUE)
multiplot(as, bed, bath, loc, layout = layout)
cor(k2$area_size, k2$sales_price)
## [1] 0.4500828
cor(k2$bathrooms, k2$sales_price)
## [1] -0.0407995
cor(k2$bedrooms, k2$sales_price)
## [1] 0.2489214
The most important categorical variable Create median sales price by location, type.
n1 <- ggplot(k2[!is.na(k2$sales_price), ], aes(x = type, y = sales_price)) +
geom_bar(stat = "summary", fun.y = "median", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(breaks = seq(0,80, by = 4), labels = comma)+
geom_hline(yintercept = 40, linetype = "dashed", color = "red")+
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3)
## Warning in geom_bar(stat = "summary", fun.y = "median", fill = "blue"):
## Ignoring unknown parameters: `fun.y`
#dashed line is median sales price
n2 <- ggplot(k2, aes(x = type)) + geom_histogram(stat = "count") +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
grid.arrange(n1, n2)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## No summary function supplied, defaulting to `mean_se()`
n1 <- ggplot(k2[!is.na(k2$sales_price), ], aes(x = location, y = sales_price)) +
geom_bar(stat = "summary", fun.y = "median", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
scale_y_continuous(breaks = seq(0, 15, by = 5), labels = comma)+
geom_hline(yintercept = 5, linetype = "dashed", color = "red")+
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3)
## Warning in geom_bar(stat = "summary", fun.y = "median", fill = "blue"):
## Ignoring unknown parameters: `fun.y`
#dashed line is median sales price
n2 <- ggplot(k2, aes(x = location)) + geom_histogram(stat = "count") +
geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
grid.arrange(n1, n2)
## No summary function supplied, defaulting to `mean_se()`
Total number of bathrooms and bedrooms
bt1 <- ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = as.factor(bathrooms), y = sales_price))+
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "black", aes(group = 1)) +
scale_y_continuous(breaks = seq(0, 100, by = 20), labels = comma)
bt2 <- ggplot(data = k2, aes(x = as.factor(bathrooms))) + geom_histogram(stat = "count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
grid.arrange(bt1,bt2)
## `geom_smooth()` using formula = 'y ~ x'
bd1 <- ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = as.factor(bedrooms), y = sales_price))+
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "black", aes(group = 1)) +
scale_y_continuous(breaks = seq(0, 100, by = 20), labels = comma)
bd2 <- ggplot(data = k2, aes(x = as.factor(bedrooms))) + geom_histogram(stat = "count")
## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
grid.arrange(bd1,bd2)
## `geom_smooth()` using formula = 'y ~ x'
#Area size
ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = area_size, y = sales_price)) +
geom_point(col = "blue")+
geom_smooth(method = "lm", se= FALSE, color = "black", aes(group = 1)) +
scale_y_continuous(breaks = seq(0,100,15), labels = comma) +
geom_text_repel(aes(label = ifelse(k2$area_size[!is.na(k2$sales_price)] > 50, rownames(k2), "")))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
check correlation
cor(k2$sales_price, k2$area_size)
## [1] 0.4500828
#Preparing data for modeling
#to drop high correlated variables
#dropvars <- c("", "","")
#k2 <- k2[, !names(k2) %in% dropvars]
#normalize the data
prenum <- preProcess(k2, method = c("center", "scale"))
print(prenum)
## Created from 1091 samples and 14 variables
##
## Pre-processing:
## - centered (5)
## - ignored (9)
## - scaled (5)
DFnorm <- predict(prenum, k2)
dim(DFnorm)
## [1] 1091 14
#One hot encoding the categorical variables The last step needed to ensure that all predictors are converted into numeric columns (which is required by most Machine Learning algorithms) is to ‘one-hot encode’ the categorical variables. This basically means that all (not ordinal) factor values are getting a seperate colums with 1s and 0s (1 basically means Yes/Present). To do this one-hot encoding, I am using the model.matrix() function.
#dealing with skeness of response variable
skew(k2$sales_price)
## [1] 4.709454
qqnorm(k2$sales_price)
qqline(k2$sales_price)
The skew of 1.87 indicates a right skew that is too high, and the Q-Q
plot shows that sale prices are also not normally distributed. To fix
this I am taking the log of SalePrice.
k2$sales_price <- log(k2$sales_price)
skew(k2$sales_price)
## [1] 0.7246697
qqnorm(k2$sales_price)
qqline(k2$sales_price)
#Compose train and test datasets
train1 <- k2[!is.na(k2$sales_price), ]
test1 <- k2[is.na(k2$sales_price), ]
#Modeling #Lasso regression model I have also tried Ridge and Elastic Net models, but since lasso gives the best results of those 3 models I am only keeping the lasso model in the document.
The elastic-net penalty is controlled by alpha, and bridges the gap between lasso (alpha=1) and ridge (alpha=0). The tuning parameter lambda controls the overall strength of the penalty. It is known that the ridge penalty shrinks the coefficients of correlated predictors towards each other while the lasso tends to pick one of them and discard the others.
Below, I am using caret cross validation to find the best value for lambda, which is the only hyperparameter that needs to be tuned for the lasso model.
set.seed(1234)
mycontrol <- trainControl(method = "cv", number = 5)
lassogrid <- expand.grid(alpha =1, lambda = seq(0.001, 0.1, by = 0.0005))
lassomod <- train(x = train1, y = k2$sales_price[!is.na(k2$sales_price)], method = "glmnet", trControl = mycontrol, tuneGrid = lassogrid)
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion
## Warning in storage.mode(xd) <- "double": NAs introduced by coercion
lassomod$bestTune
## alpha lambda
## 59 1 0.03
min(lassomod$results$RMSE)
## [1] 0.03061157
lassovarimp <- varImp(lassomod, scale = FALSE)
lassoImportance <- lassovarimp$importance
varsSelected <- length(which(lassoImportance$Overall != 0))
varsNorSelected <- length(which(lassoImportance$Overall == 0))
cat("Lasso uses", varsSelected, "vriables in its model, and did not select ", varsNorSelected, "variables")
## Lasso uses 1 vriables in its model, and did not select 13 variables
lassopred <- predict(lassomod, test1)
## Warning in cbind2(1, newx): number of rows of result is not a multiple of
## vector length (arg 1)
predictions_lasso <- exp(lassopred)
head(predictions_lasso)
## numeric(0)