price-prediction

#load the libraries

library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
library(Rmisc)

#load dataset

k <- read.csv("karachi_property.csv", stringsAsFactors = F)

#data size and structure
dim(k)

## [1] 8414   13

head(k)

##   X_id
## 1    1
## 2    2
## 3    3
## 4    4
## 5    5
## 6    6
##                                                                                             title
## 1                                                       600 Yard Bungalow For Sale In DHA Phase 6
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3                                    4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4           3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5                                                             Buying A Flat In Clifton - Block 9?
## 6                         500 Yards Bungalow Available For Sale In Phase 7 Chance Deal Three Unit
##        price   date.added  type bedrooms bathrooms        area
## 1 11.5 Crore 14 hours ago House        5         6 600 Sq. Yd.
## 2 1.45 Crore  5 hours ago House        0         0            
## 3 2.12 Crore  5 hours ago House        0         0            
## 4  1.5 Crore  5 hours ago House        0         0            
## 5    4 Crore  6 hours ago  Flat        3         3 200 Sq. Yd.
## 6    9 Crore  6 hours ago House        7         6 500 Sq. Yd.
##                              location
## 1         DHA Defence, Karachi, Sindh
## 2 Bahria Town Karachi, Karachi, Sindh
## 3 Bahria Town Karachi, Karachi, Sindh
## 4 Bahria Town Karachi, Karachi, Sindh
## 5             Clifton, Karachi, Sindh
## 6         DHA Defence, Karachi, Sindh
##                                                                         complete.location
## 1                                                DHA Phase 6, DHA Defence, Karachi, Sindh
## 2 Bahria Town - Ali Block, Bahria Town - Precinct 12, Bahria Town Karachi, Karachi, Sindh
## 3                                 Bahria Sports City, Bahria Town Karachi, Karachi, Sindh
## 4                          Bahria Town - Precinct 31, Bahria Town Karachi, Karachi, Sindh
## 5                                              Clifton - Block 9, Clifton, Karachi, Sindh
## 6                                                DHA Phase 7, DHA Defence, Karachi, Sindh
##                                                                                       description
## 1                                                          Chance Deal 600 Yard Bungalow For Sale
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3                                    4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4           3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5                                                                              Apartment for sale
## 6          500 Square Yards Bungalow Available For Sale In Phase 7 Dha 7 Bedrooms 3 Unit Bungalow
##                                                                                                                                                                                                                                   keywords
## 1              Built in year: 1,Parking Spaces: 5,Flooring,Other Main Features,Bedrooms: 5,Bathrooms: 6,Servant Quarters: 1,Drawing Room,Dining Room,Kitchens: 2,Study Room,Prayer Room,Powder Room ,Store Rooms: 1,Lounge or Sitting Room
## 2                                                                                                                                                                                                     Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 3                                                                                                                                                                                                     Bedrooms: 4,Bathrooms: 4,Kitchens: 2
## 4                                                                                                                                                                                                     Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 5                                                                                                   Flooring,Electricity Backup,Broadband Internet Access,Satellite or Cable TV Ready,Business Center or Media Room in Building,Pet Policy
## 6 Flooring,Electricity Backup,Bedrooms: 7,Bathrooms: 7,Servant Quarters: 1,Kitchens: 3,Store Rooms: 1,Nearby Schools,Nearby Hospitals,Nearby Shopping Malls,Nearby Restaurants,Distance From Airport (kms),Nearby Public Transport Service
##                                                                                                                                                                                                   url
## 1                                                                                    https://www.zameen.com/Property/d_h_a_dha_phase_6_600_yard_bungalow_for_sale_in_dha_phase_6-43187753-1483-1.html
## 2 https://www.zameen.com/Property/bahria_town_precinct_12_bahria_town_ali_block_3_beds_luxury_125_sq_yards_villa_for_sale_located_in_precinct_12_ali_block__bahria_town_karachi-43194528-10023-1.html
## 3                                           https://www.zameen.com/Property/bahria_town_karachi_bahria_sports_city_4_beds_luxury_sports_city_villa_for_rent_bahria_town_karachi-43194532-10061-1.html
## 4             https://www.zameen.com/Property/bahria_town_karachi_bahria_town_precinct_31_3_beds_luxury_235_sq_yards_villa_for_sale_located_in_precinct_31__bahria_town_karachi-43194530-10045-1.html
## 5                                                                                      https://www.zameen.com/Property/clifton_clifton_block_9_buying_a_flat_in_clifton_block_9_-42887085-1671-1.html
## 6                                                      https://www.zameen.com/Property/d_h_a_dha_phase_7_500_yards_bungalow_available_for_sale_in_phase_7_chance_deal_three_unit-41145833-1484-1.html

#to extract numeric component
k$sales_price <- readr::parse_number(k$price)

library(stringr)
k[c('p', 'price_unit')] <- str_split_fixed(k$price, ' ', 2)

#get rid of price and p variables, url
k$price <- NULL
k$p <- NULL
k$url <- NULL

#repeat same for area variabel 

k$area_size <- readr::parse_number(k$area)
k[c('a', 'area_unit')] <- str_split_fixed(k$area, ' ', 2)


#get rid of a and area vairables
k$area <- NULL
k$a <- NULL

head(k)

##   X_id
## 1    1
## 2    2
## 3    3
## 4    4
## 5    5
## 6    6
##                                                                                             title
## 1                                                       600 Yard Bungalow For Sale In DHA Phase 6
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3                                    4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4           3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5                                                             Buying A Flat In Clifton - Block 9?
## 6                         500 Yards Bungalow Available For Sale In Phase 7 Chance Deal Three Unit
##     date.added  type bedrooms bathrooms                            location
## 1 14 hours ago House        5         6         DHA Defence, Karachi, Sindh
## 2  5 hours ago House        0         0 Bahria Town Karachi, Karachi, Sindh
## 3  5 hours ago House        0         0 Bahria Town Karachi, Karachi, Sindh
## 4  5 hours ago House        0         0 Bahria Town Karachi, Karachi, Sindh
## 5  6 hours ago  Flat        3         3             Clifton, Karachi, Sindh
## 6  6 hours ago House        7         6         DHA Defence, Karachi, Sindh
##                                                                         complete.location
## 1                                                DHA Phase 6, DHA Defence, Karachi, Sindh
## 2 Bahria Town - Ali Block, Bahria Town - Precinct 12, Bahria Town Karachi, Karachi, Sindh
## 3                                 Bahria Sports City, Bahria Town Karachi, Karachi, Sindh
## 4                          Bahria Town - Precinct 31, Bahria Town Karachi, Karachi, Sindh
## 5                                              Clifton - Block 9, Clifton, Karachi, Sindh
## 6                                                DHA Phase 7, DHA Defence, Karachi, Sindh
##                                                                                       description
## 1                                                          Chance Deal 600 Yard Bungalow For Sale
## 2 3 BEDS LUXURY 125 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 12 ALI BLOCK, Bahria Town Karachi
## 3                                    4 BEDS LUXURY SPORTS CITY VILLA FOR RENT BAHRIA TOWN KARACHI
## 4           3 BEDS LUXURY 235 SQ YARDS VILLA FOR SALE LOCATED IN Precinct 31, Bahria Town Karachi
## 5                                                                              Apartment for sale
## 6          500 Square Yards Bungalow Available For Sale In Phase 7 Dha 7 Bedrooms 3 Unit Bungalow
##                                                                                                                                                                                                                                   keywords
## 1              Built in year: 1,Parking Spaces: 5,Flooring,Other Main Features,Bedrooms: 5,Bathrooms: 6,Servant Quarters: 1,Drawing Room,Dining Room,Kitchens: 2,Study Room,Prayer Room,Powder Room ,Store Rooms: 1,Lounge or Sitting Room
## 2                                                                                                                                                                                                     Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 3                                                                                                                                                                                                     Bedrooms: 4,Bathrooms: 4,Kitchens: 2
## 4                                                                                                                                                                                                     Bedrooms: 3,Bathrooms: 3,Kitchens: 2
## 5                                                                                                   Flooring,Electricity Backup,Broadband Internet Access,Satellite or Cable TV Ready,Business Center or Media Room in Building,Pet Policy
## 6 Flooring,Electricity Backup,Bedrooms: 7,Bathrooms: 7,Servant Quarters: 1,Kitchens: 3,Store Rooms: 1,Nearby Schools,Nearby Hospitals,Nearby Shopping Malls,Nearby Restaurants,Distance From Airport (kms),Nearby Public Transport Service
##   sales_price price_unit area_size area_unit
## 1       11.50      Crore       600   Sq. Yd.
## 2        1.45      Crore        NA          
## 3        2.12      Crore        NA          
## 4        1.50      Crore        NA          
## 5        4.00      Crore       200   Sq. Yd.
## 6        9.00      Crore       500   Sq. Yd.

#make all prices in same unit
k$sales_price[k$price_unit == "Lakh"] <- k$sales_price[k$price_unit == "Lakh"] * 0.01

k$price_unit[k$price_unit == "Lakh"] <- k$price_unit[k$price_unit == "Crore"]

## Warning in k$price_unit[k$price_unit == "Lakh"] <- k$price_unit[k$price_unit ==
## : number of items to replace is not a multiple of replacement length

#neat the location names
k$location[k$location == "Bahria Town Karachi, Karachi, Sindh"] <-  "Bahria Town"
k$location[k$location == "Clifton, Karachi, Sindh"] <-  "Clifton"
k$location[k$location == "DHA Defence, Karachi, Sindh"] <- "DHA Defence"


table(k$price_unit)

## 
##       Crore 
##     1  8413

unique(k$area_unit)

## [1] "Sq. Yd." ""

colSums(is.na(k))

##              X_id             title        date.added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete.location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##              1092                 0

# 1092 values missing in area size

range(k$sales_price)

## [1] NA NA

table(k$sales_price)

## 
##   0.25   0.26 0.2682 0.2938    0.3   0.32   0.35 0.3574   0.36 0.3668   0.37 
##      1      1     75      6      2      1      4     13      3      1      1 
##   0.38   0.39 0.3943 0.3959    0.4  0.405   0.42  0.425   0.43  0.432  0.435 
##      3      1     11      2      8      1      2      2      1      1      1 
##   0.45  0.455  0.457   0.46   0.47 0.4712  0.475   0.48 0.4848   0.49  0.494 
##     17      1      4      2     12     25      2      3      1      1      1 
## 0.4998    0.5   0.51   0.52  0.525   0.53 0.5376 0.5387   0.54  0.545   0.55 
##     47     51      4      6      1      1      1      4      2      1     28 
##   0.56  0.567   0.57 0.5775   0.58   0.59  0.595    0.6   0.61   0.62 0.6205 
##      3      1      4      4      8     24      1     17      1     10      1 
## 0.6225  0.624   0.63   0.64   0.65  0.655   0.66   0.67   0.68   0.69 0.6935 
##      7      1      2      4     82      6      6      2     12      5      1 
##  0.698    0.7  0.715   0.72   0.73   0.74   0.75   0.76  0.765   0.77   0.78 
##      1     40      1     12      1      2     66      6      3     23     19 
##   0.79  0.795    0.8  0.804 0.8083   0.81 0.8156   0.82   0.83   0.84   0.85 
##     14      3     81      1      1      6      2     15      5      4     66 
##   0.86 0.8668   0.87 0.8732  0.875 0.8751 0.8788   0.88   0.89    0.9   0.91 
##      7      1      6     46      2      1      1      7     13     60      6 
##  0.918   0.92  0.925 0.9265 0.9288   0.93   0.94  0.946   0.95   0.96 0.9614 
##      6     36      1      1      1      5      2      1     82     10      1 
##  0.969   0.97  0.975 0.9792   0.98 0.9861 0.9886   0.99  0.998      1   1.01 
##     25      3      1      1     13     42      1     11     11     56      1 
##   1.02   1.03   1.04   1.05   1.06   1.07   1.08   1.09    1.1   1.11   1.12 
##      6     48      1     75      2      5     50      1     66      5     10 
##   1.13   1.14   1.15   1.16   1.17   1.18   1.19    1.2   1.21   1.22   1.25 
##      4      1     40     27     10      3      3     52      3      5     50 
##   1.26   1.28    1.3   1.32   1.34   1.35   1.36   1.38   1.39    1.4   1.41 
##     13     24     55      1      4     36      2     20      3     46     11 
##   1.42   1.43   1.44   1.45   1.46   1.47   1.48   1.49    1.5   1.51   1.52 
##     11     23     12     54      3     24     32      3    149      2     11 
##   1.53   1.54   1.55   1.56   1.57   1.58   1.59    1.6   1.61   1.62   1.63 
##      4      5     87      7      3     12      5     77     47      8      4 
##   1.64   1.65   1.66   1.67   1.68   1.69    1.7   1.71   1.72   1.73   1.74 
##      3     51      1      4      6      8     72      1      9      3      2 
##   1.75   1.76   1.78   1.79    1.8   1.82   1.85   1.86   1.88   1.89    1.9 
##    149      5     15      2     44      1     48     10      9     12    104 
##   1.92   1.94   1.95   1.96   1.97   1.98   1.99      2   2.02   2.03   2.04 
##      6      1     77      5      3     65      8    122      3      7      1 
##   2.05   2.06   2.08   2.09    2.1   2.12   2.14   2.15   2.16   2.17   2.18 
##     41     18      8      2    116      6      4     61      1      7      8 
##   2.19    2.2   2.22   2.23   2.24   2.25   2.29    2.3   2.31   2.32   2.33 
##      1     53      3     10      2     85      3     49      1      2      2 
##   2.34   2.35   2.36   2.37   2.38   2.39    2.4   2.41   2.42   2.43   2.44 
##      1     27      1      2      5      1     45      1      6      2      4 
##   2.45   2.46   2.47   2.48   2.49    2.5   2.52   2.53   2.55   2.58   2.59 
##     38      4      2      4      2     76      2     10     29      1      1 
##    2.6   2.62   2.65   2.67   2.68    2.7   2.74   2.75    2.8   2.84   2.85 
##     57      1     12      1      6     34      1     24     37      1      7 
##   2.86    2.9   2.95   2.99      3    3.1   3.12   3.15   3.18   3.19    3.2 
##      1     37     75      4     54     10      1     10      1      1     26 
##   3.22   3.23   3.24   3.25    3.3   3.35   3.39    3.4   3.45    3.5   3.55 
##      1      1      1     36     16     10      2     18      7     69      7 
##   3.57   3.59    3.6   3.65   3.67    3.7   3.74   3.75    3.8   3.85   3.89 
##      6      1     25      9      1     20      2     29     31     19      2 
##    3.9   3.95   3.96   3.98   3.99      4   4.09    4.1   4.15   4.19    4.2 
##     19      9      9      1      1     80      2     27     14      2     15 
##   4.25    4.3   4.35   4.36   4.37   4.38    4.4   4.45   4.48    4.5   4.51 
##     51     19      8      1      1      1     12      4      1    107      1 
##   4.53   4.54   4.55   4.57   4.58    4.6   4.61   4.63   4.65   4.68    4.7 
##      1      1     11      1      1     17      1      1     12      1     11 
##   4.71   4.75   4.76    4.8   4.84   4.85   4.87   4.89    4.9   4.91   4.92 
##      1     29      1     16      1      6      1      1     22      1      1 
##   4.95   4.99      5   5.05    5.1   5.13   5.15    5.2   5.25    5.3   5.35 
##     12      1     52      1     16      1      5      9     47      9      7 
##    5.4   5.45    5.5   5.55    5.6   5.65   5.66   5.69    5.7   5.74   5.75 
##      8      1     58      1     12      3      1      1     12      2     22 
##   5.76   5.77    5.8   5.85    5.9   5.95   5.98   5.99      6   6.01   6.05 
##      1      1     13      5     15      5      2      1     62      1      6 
##   6.09    6.1   6.15    6.2   6.22   6.25   6.29    6.3   6.32   6.35   6.38 
##      1      9      5     10      1     33      1     15      1      6      1 
##    6.4   6.45   6.49    6.5   6.55    6.6   6.65    6.7   6.75   6.76    6.8 
##     11      2      5     93      6     14      8     21     36      1      9 
##   6.84   6.85    6.9   6.92   6.94   6.95      7    7.1   7.15    7.2   7.25 
##      2      6     13      2      1     10     55      5      6      5     23 
##   7.26   7.28    7.3   7.35   7.37    7.4   7.42   7.45   7.49    7.5   7.51 
##      2      1     12      6      1     13      1      2      1     71      2 
##   7.52    7.6   7.65    7.7   7.75    7.8   7.85    7.9   7.95   7.99      8 
##      1      9      4      6     28     15      3     11      4      1     43 
##   8.03   8.05    8.1   8.14   8.15    8.2   8.25    8.3   8.34   8.35    8.4 
##      1      1      5      2      1      2     26      3      1      5      1 
##   8.45   8.48    8.5    8.6   8.65    8.7   8.74   8.75    8.8   8.85    8.9 
##      3      1     55      4      1      3      1     18      3      1      9 
##   8.97   8.98   8.99      9    9.1   9.15    9.2   9.25   9.29    9.3   9.31 
##      2      1      2     60      1      1      4     29      6      3      1 
##   9.32   9.35    9.4   9.45   9.49    9.5    9.6    9.7   9.75    9.8   9.85 
##      1      2      6      3      1     47      5      2     19      8      2 
##    9.9   9.99     10  10.05   10.1   10.2  10.24  10.25   10.3   10.4  10.49 
##     10      1     53      1      1      2      2     14      2      2      1 
##   10.5   10.6  10.74  10.75   10.8  10.83  10.85   10.9  10.95  10.99     11 
##     58      2      1     14      1      1      1      2      1      9     77 
##  11.01  11.05  11.15  11.25   11.3   11.4  11.49   11.5  11.55  11.75   11.8 
##      1      2      1     16      5      2      1     56      1     14      2 
##   11.9  11.95     12  12.15  12.25  12.39   12.4  12.45   12.5  12.65   12.7 
##      2      4     70      1      5      2      3      1     53      1      2 
##  12.73  12.75   12.8  12.85   12.9  12.95     13  13.03  13.15  13.25   13.4 
##      1      9      1      2      2      1     44      1      1      8      1 
##   13.5   13.6  13.65  13.75   13.8  13.85   13.9  13.99     14   14.2  14.25 
##     40      1      2      9      1      2      2      3     46      2      4 
##   14.3   14.4   14.5   14.7   14.9     15  15.04   15.2  15.25   15.4  15.45 
##      1      2     23      1      2     36      2      2      2      2      1 
##   15.5   15.6  15.75   15.8  15.89   15.9  15.95     16   16.2  16.25   16.4 
##     24      1      1      7      1      1      2     63      1      4      1 
##   16.5  16.64   16.7  16.75  16.99     17  17.25   17.3   17.5  17.65  17.75 
##     61      2      1      5      1     55      4      1     28     16     11 
##   17.9  17.95     18  18.25   18.5   18.7  18.75   18.9  18.95     19  19.25 
##      2      4     42      5     15      4      3      1      1     15      2 
##   19.4   19.5  19.75     20   20.5     21   21.4   21.5   21.7   21.9  21.99 
##      1     10      3     34      4     18      1      2      1      2      1 
##     22   22.5   22.6   22.9     23   23.1   23.4   23.5  23.55  23.75     24 
##     39     12      2      4     21      7      2      5      1      1     11 
##   24.2   24.5  24.75     25   25.5     26   26.5  26.75  26.99     27   27.5 
##      2      2      1     16      2     11      2      1      1      5      3 
##  27.75  27.98     28  28.05   28.5     29   29.4   29.8     30     31     32 
##      1      1     42      1      1      3      1      3      8      4      9 
##     33     34     35     36  36.16   36.5     37     38     40     42     43 
##      5      4     12      3      1      1      2      9      5      3      1 
##     45     47   47.5     48     49     50     70     75     80     85     95 
##      4      1      1      5      1      1      4      1      3      1      1 
##     99 
##      2

colSums(is.na(k))

##              X_id             title        date.added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete.location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##              1092                 0

#to fill the missing area size dig into it

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

k <- clean_names(k)
colSums(is.na(k))

##              x_id             title        date_added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##              1092                 0

#area size and price location 
ddply(k, .(location), summarize, size = mean(!is.na(area_size)))

##      location      size
## 1             0.9809264
## 2 Bahria Town 0.7804667
## 3     Clifton 0.9589258
## 4 DHA Defence 0.9246766

ddply(k, .(location), summarize, price = mean((sales_price)))

##      location    price
## 1                   NA
## 2 Bahria Town 1.814954
## 3     Clifton 4.865529
## 4 DHA Defence 9.235381

#area size and price by house type
ddply(k, .(type), summarize, size = mean(!is.na(area_size)))

##            type      size
## 1               0.0000000
## 2    Farm House 0.5000000
## 3          Flat 0.9124204
## 4         House 0.8360584
## 5 Lower Portion 1.0000000
## 6     Penthouse 0.8541667
## 7 Upper Portion 1.0000000

ddply(k, .(type), summarize, price = mean((sales_price)))

##            type     price
## 1                      NA
## 2    Farm House 12.500000
## 3          Flat  2.607330
## 4         House  7.886607
## 5 Lower Portion  3.400000
## 6     Penthouse 13.938958
## 7 Upper Portion  3.483333

#for imputation
#X['loading'].fillna(value=127.74233678955453, inplace=True)

colnames(k)

##  [1] "x_id"              "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "sales_price"       "price_unit"       
## [13] "area_size"         "area_unit"

colSums(is.na(k))

##              x_id             title        date_added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##              1092                 0

#separate data with missing area size

have_area_size <- k %>% filter(!is.na(area_size))

table1 <- have_area_size %>% select(area_size, bedrooms, location, type) %>% 
  group_by(type,location) %>% 
  summarise(mean = mean(area_size))

## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.

t <- as.data.frame(table1)

table(k$type)

## 
##                  Farm House          Flat         House Lower Portion 
##             1             4          3768          4587             3 
##     Penthouse Upper Portion 
##            48             3

no_area_size <- k %>% filter(is.na(area_size))
colSums(is.na(no_area_size))

##              x_id             title        date_added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##              1092                 0

dim(no_area_size)

## [1] 1092   14

#extract ids by locaiton in no area size dataset
b <- no_area_size %>% select(x_id,location, type) %>% 
  group_by(type, location,x_id) %>% filter(location == "Bahria Town")

c <- no_area_size %>% select(x_id,location, type) %>% 
  group_by(type, location,x_id) %>% filter(location == "Clifton")

d <- no_area_size %>% select(x_id,location, type) %>% 
  group_by(type, location,x_id) %>% filter(location == "DHA Defence")


# This gives laverage area size for each type  of house as per location 
# The house types are:
# 1- empty, 
# farmhouse (4), flat(3768), house(4587), lower portion(3), penthouse(48), upper portion(3)

Now impute values

no_area_size %>% select(x_id,location, type, area_size) %>% 
  group_by(type, location,x_id, area_size) %>% filter(is.na(area_size))

## # A tibble: 1,092 × 4
## # Groups:   type, location, x_id, area_size [1,092]
##     x_id location    type  area_size
##    <int> <chr>       <chr>     <dbl>
##  1     2 Bahria Town House        NA
##  2     3 Bahria Town House        NA
##  3     4 Bahria Town House        NA
##  4     7 Bahria Town House        NA
##  5     8 Bahria Town House        NA
##  6    11 DHA Defence House        NA
##  7    14 Bahria Town Flat         NA
##  8    38 Bahria Town House        NA
##  9    50 DHA Defence Flat         NA
## 10    58 Bahria Town House        NA
## # ℹ 1,082 more rows

no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "Bahria Town" ] <- 123.1602

no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "Bahria Town" ] <- 256.9191

no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "DHA Defence" ] <- 550.3603

no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "DHA Defence" ] <- 194.2698 

no_area_size$area_size[no_area_size$type == "Flat" & no_area_size$location == "Clifton" ] <- 216.8858

no_area_size$area_size[no_area_size$type == "House" & no_area_size$location == "Clifton" ] <- 418.7263

no_area_size$area_size[no_area_size$type == "Farm House" & no_area_size$location == "Bahria Town" ] <- 4400.00
  
no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "Clifton" ] <- 281.25
  
no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "Bahria Town" ] <- 607.00

no_area_size$area_size[no_area_size$type == "Penthouse" & no_area_size$location == "DHA Defence" ] <- 123.1602
  
  
sum(is.na(no_area_size$location))

## [1] 0

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 115)

##   x_id location type area_size complete_location
## 1  115                      NA

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 170)

##   x_id location type area_size
## 1  170          Flat        NA
##                                        complete_location
## 1 Bahria Apartments, Bahria Town Karachi, Karachi, Sindh

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 1880)

##   x_id location  type area_size
## 1 1880          House        NA
##                                                  complete_location
## 1 Bahria Town - Precinct 10-B, Bahria Town Karachi, Karachi, Sindh

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 2525)

##   x_id location type area_size
## 1 2525          Flat        NA
##                                           complete_location
## 1 Emaar The Views, DHA Phase 8, DHA Defence, Karachi, Sindh

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 5698)

##   x_id location type area_size
## 1 5698          Flat        NA
##                                               complete_location
## 1 Bahria Town - Precinct 1, Bahria Town Karachi, Karachi, Sindh

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 7478)

##   x_id location type area_size                   complete_location
## 1 7478          Flat        NA Bahria Town Karachi, Karachi, Sindh

k %>% select(x_id, location, type, area_size, complete_location) %>% filter(x_id == 7838)

##   x_id location type area_size
## 1 7838          Flat        NA
##                                                  complete_location
## 1 Bahria Town - Jinnah Avenue, Bahria Town Karachi, Karachi, Sindh

For missing ids data check one by one to retrieve information 115 has no information given 170 is flat in Bahria Town 1880 is house in Bahria town 2525 is flat in DHA 5698 is flat in Bahria Town 7478 is flat in Bahria Town 7838 is flat in Bahria Town

Now insert this information

no_area_size %>% select(x_id,location, type, area_size, keywords, description) %>% 
  group_by(type, location,x_id, area_size) %>% filter(is.na(area_size))

## # A tibble: 7 × 6
## # Groups:   type, location, x_id, area_size [7]
##    x_id location type    area_size keywords                          description
##   <int> <chr>    <chr>       <dbl> <chr>                             <chr>      
## 1   115 ""       ""             NA ""                                ""         
## 2   170 ""       "Flat"         NA "Parking Spaces,Double Glazed Wi… "Fully fur…
## 3  1880 ""       "House"        NA "Built in year: 2025,Parking Spa… "AQ Execut…
## 4  2525 ""       "Flat"         NA "Parking Spaces: 2,Flooring,Floo… "Emaar The…
## 5  5698 ""       "Flat"         NA "Built in year: 2025,Parking Spa… "JASMINE M…
## 6  7478 ""       "Flat"         NA ""                                "Bahria To…
## 7  7838 ""       "Flat"         NA ""                                "Grab This…

no_area_size %>% select(x_id,location, type, area_size) %>% 
  group_by(type, location,x_id, area_size) %>% filter(x_id == 5698)

## # A tibble: 1 × 4
## # Groups:   type, location, x_id, area_size [1]
##    x_id location type  area_size
##   <int> <chr>    <chr>     <dbl>
## 1  5698 ""       Flat         NA

no_area_size$area_size[no_area_size$x_id == 170 & no_area_size$type == "Flat" ] <- 123.1602
no_area_size$location[no_area_size$x_id == 170] <- "Bahria Town"

no_area_size$area_size[no_area_size$x_id == 1880 & no_area_size$type == "House" ] <- 256.9191
no_area_size$location[no_area_size$x_id == 1880] <- "Bahria Town"

no_area_size$area_size[no_area_size$x_id == 2525 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 2525] <- "DHA Defence"

no_area_size$area_size[no_area_size$x_id == 5698 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 5698] <- "Bahria Town"

no_area_size$area_size[no_area_size$x_id == 7478 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 7478] <- "Bahria Town"

no_area_size$area_size[no_area_size$x_id == 7838 & no_area_size$type == "Flat" ] <- 194.2698
no_area_size$location[no_area_size$x_id == 7838] <- "Bahria Town"

#now remove id 115 with no other information provided
#no_area_size$x_id[no_area_size$x_id == 115] <- NULL

colSums(is.na(no_area_size))

##              x_id             title        date_added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 1                 0 
##         area_size         area_unit 
##                 1                 0

We have two data sets k with missing area size values those are handled separately

ggplot(data = k[!is.na(k$sales_price), ], aes(x = sales_price)) +
  geom_histogram(fill = "blue", binwidth = 10)

summary(k$sales_price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.250   1.430   2.500   5.556   7.280  99.000       1

#check variables with high correlation with response variable sales price
numeric_var <- which(sapply(k, is.numeric)) #index numeric vectors
numeric_var_names <- names(numeric_var) #saving names vector for later use
cat("there are", length(numeric_var), "numeric variables")

## there are 5 numeric variables

k_numvar <- k[, numeric_var]
cor_numvar <- cor(k_numvar, use = "pairwise.complete.obs") #correlation of all numeric variables

#sort on decreasing correlation with sales price
cor_sorted <- as.matrix(sort(cor_numvar[,'sales_price'], decreasing = TRUE))

#select only high correlations
corHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.5)))
cor_numVar <- cor_numvar[corHigh, corHigh]

corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")

Now visualize the relation between sales price area area size, bedrooms and bathrooms

#area_size

ggplot(data = k[!is.na(k$sales_price), ], aes (x = factor(bedrooms), y = sales_price)) +
  geom_boxplot(col = "blue") + labs(x = "Bedrooms Number") +
  scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
  labs(
    title = "The sales price with Number of bedrooms",
    subtitle = "The number of bedrooms effecting cost",
    caption = "Karachi property dataset",
    x = "Number of Bedrooms", y="Sales price"
  )

The positive correlation is quite evident with increasing number of bedrooms check for other two numeric variables

#Bathrooms

ggplot(data = k[!is.na(k$sales_price), ], aes (x = factor(bathrooms), y = sales_price)) +
  geom_boxplot(col = "green") + labs(x = "Bedrooms Number") +
  scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
  labs(
    title = "The sales price with Number of bathrooms",
    subtitle = "The number of bathrooms effecting cost",
    caption = "Karachi property dataset",
    x = "Number of Bathrooms", y="Sales price"
  )

This also shows positve coorelation of increasing number of bathrooms with sales price

#area size

ggplot(data = k[!is.na(k$sales_price), ], aes (x = area_size, y = sales_price)) +
  geom_point(col = "brown") + 
  geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
  scale_y_continuous(breaks = seq(0,800000, by = 100000), labels = comma)+
  labs(
    title = "The sales price with Number of Area",
    subtitle = "The number of bathrooms effecting cost",
    caption = "Karachi property dataset",
    x = "Area size", y="Sales price"
  ) +
  geom_text_repel(aes(label = ifelse(k$area_size[!is.na(k$sales_price)]>2500, rownames(k), '')))

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1091 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1091 rows containing missing values (`geom_point()`).

## Warning: Removed 1091 rows containing missing values (`geom_text_repel()`).

#see variables wiht missing values

na_col <- which(colSums(is.na(k)) > 0)
sort(colSums(sapply(k[na_col], is.na)), decreasing = TRUE)

##   area_size sales_price 
##        1092           1

cat("there are ", length(na_col), "columns with missing values")

## there are  2 columns with missing values

fix 1092 NA in area size variables

Besides making sure that the NAs are taken care off, I have also converted character variables into ordinal integers if there is clear ordinality, or into factors if levels are categories without ordinality. I will convert these factors into numeric later on by using one-hot encoding (using the model.matrix function).

Label encoding/factorizing the remaining character variables

charcol <- names(k[,sapply(k, is.character)])
charcol

## [1] "title"             "date_added"        "type"             
## [4] "location"          "complete_location" "description"      
## [7] "keywords"          "price_unit"        "area_unit"

cat("There are", length(charcol), "remaining columns with character values")

## There are 9 remaining columns with character values

#for no ordinality so convert into factors
k$location <- as.factor(k$location)
table(k$location)

## 
##             Bahria Town     Clifton DHA Defence 
##         367        3471         633        3943

sum(table(k$location))

## [1] 8414

k$type <- as.factor(k$type)
table(k$type)

## 
##                  Farm House          Flat         House Lower Portion 
##             1             4          3768          4587             3 
##     Penthouse Upper Portion 
##            48             3

sum(table(k$type))

## [1] 8414

# k$keywords <- as.factor(k$keywords)
# table(k$keywords)
# sum(table(k$keywords))

names(k)

##  [1] "x_id"              "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "sales_price"       "price_unit"       
## [13] "area_size"         "area_unit"

#Finding variable importance with a quick Random Forest Although the correlations are giving a good overview of the most important numeric variables and multicolinerity among those variables, I wanted to get an overview of the most important variables including the categorical variables before moving on to visualization.

I tried to get the relative importance of variables with a quick linear regression model with the calc.relimp function of package , and also tried the boruta function of package boruta which separates the variables into groups that are important or not. However, these method took a long time. As I only want to get an indication of the variable importance, I eventually decided to keep it simple and just use a quick and dirty Random Forest model with only 100 trees. This also does the job for me, and does not take very long as I can specify a (relatively) small number of trees.

k2 <- no_area_size

#to remove rows with na in it
k2 <- no_area_size
k2 <- na.omit(k2)


k2 %>% select(x_id, sales_price) %>% group_by(x_id, sales_price) %>% filter(is.na(sales_price))

## # A tibble: 0 × 2
## # Groups:   x_id, sales_price [0]
## # ℹ 2 variables: x_id <int>, sales_price <dbl>

colSums(is.na(k2))

##              x_id             title        date_added              type 
##                 0                 0                 0                 0 
##          bedrooms         bathrooms          location complete_location 
##                 0                 0                 0                 0 
##       description          keywords       sales_price        price_unit 
##                 0                 0                 0                 0 
##         area_size         area_unit 
##                 0                 0

dim(k2)

## [1] 1091   14

set.seed(2024)
quick_RF <- randomForest(x = k2[1:1091,-11], y = k2$sales_price[1:1091], ntree = 10, importance = TRUE)
imp_RF <- importance(quick_RF)
imp_DF <- data.frame(Variables = row.names(imp_RF), MSE = imp_RF[,1])
imp_DF <- imp_DF[order(imp_DF$MSE, decreasing = TRUE), ]

ggplot(imp_DF[1:20,], aes(x = reorder(Variables, MSE), y = MSE, fill = MSE)) +
  geom_bar(stat = "identity")+
  labs(x = "Variables", y = "% increase MSE if variable is randomly permuted") + coord_flip() + 
  theme(legend.position = "none")

## Warning: Removed 7 rows containing missing values (`position_stack()`).

colnames(k2)

##  [1] "x_id"              "title"             "date_added"       
##  [4] "type"              "bedrooms"          "bathrooms"        
##  [7] "location"          "complete_location" "description"      
## [10] "keywords"          "sales_price"       "price_unit"       
## [13] "area_size"         "area_unit"

as <- ggplot(data = k2, aes(area_size))+geom_density()
bed <- ggplot(data = k2, aes(as.factor(bedrooms)))+geom_histogram(stat= "count")

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

bath <- ggplot(data = k2, aes(as.factor(bathrooms)))+geom_histogram(stat= "Count")

## Warning in geom_histogram(stat = "Count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

loc <- ggplot(data = k2, aes(as.factor(location)))+geom_histogram(stat= "Count")

## Warning in geom_histogram(stat = "Count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

layout <- matrix (c(1,2,3,4), byrow = TRUE)
multiplot(as, bed, bath, loc, layout = layout)

cor(k2$area_size, k2$sales_price)

## [1] 0.4500828

cor(k2$bathrooms, k2$sales_price)

## [1] -0.0407995

cor(k2$bedrooms, k2$sales_price)

## [1] 0.2489214

The most important categorical variable Create median sales price by location, type.

n1 <- ggplot(k2[!is.na(k2$sales_price), ], aes(x = type, y = sales_price)) +
  geom_bar(stat = "summary", fun.y = "median", fill = "blue")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_y_continuous(breaks = seq(0,80, by = 4), labels = comma)+
  geom_hline(yintercept = 40, linetype = "dashed", color = "red")+
  geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3)

## Warning in geom_bar(stat = "summary", fun.y = "median", fill = "blue"):
## Ignoring unknown parameters: `fun.y`

#dashed line is median sales price

n2 <- ggplot(k2, aes(x = type)) + geom_histogram(stat = "count") +
  geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

grid.arrange(n1, n2)

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## No summary function supplied, defaulting to `mean_se()`

n1 <- ggplot(k2[!is.na(k2$sales_price), ], aes(x = location, y = sales_price)) +
  geom_bar(stat = "summary", fun.y = "median", fill = "blue")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_y_continuous(breaks = seq(0, 15, by = 5), labels = comma)+
  geom_hline(yintercept = 5, linetype = "dashed", color = "red")+
  geom_label(stat = "count", aes(label =  ..count.., y =  ..count..), size = 3)

## Warning in geom_bar(stat = "summary", fun.y = "median", fill = "blue"):
## Ignoring unknown parameters: `fun.y`

#dashed line is median sales price

n2 <- ggplot(k2, aes(x = location)) + geom_histogram(stat = "count") +
  geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

grid.arrange(n1, n2)

## No summary function supplied, defaulting to `mean_se()`

Total number of bathrooms and bedrooms

bt1 <- ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = as.factor(bathrooms), y = sales_price))+
  geom_point(col = "blue") + 
  geom_smooth(method = "lm", se = FALSE, color = "black", aes(group = 1)) + 
  scale_y_continuous(breaks = seq(0, 100, by = 20), labels = comma)
  
bt2 <- ggplot(data = k2, aes(x = as.factor(bathrooms))) + geom_histogram(stat = "count")

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

grid.arrange(bt1,bt2)

## `geom_smooth()` using formula = 'y ~ x'

bd1 <- ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = as.factor(bedrooms), y = sales_price))+
  geom_point(col = "blue") + 
  geom_smooth(method = "lm", se = FALSE, color = "black", aes(group = 1)) + 
  scale_y_continuous(breaks = seq(0, 100, by = 20), labels = comma)
  
bd2 <- ggplot(data = k2, aes(x = as.factor(bedrooms))) + geom_histogram(stat = "count")

## Warning in geom_histogram(stat = "count"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

grid.arrange(bd1,bd2)

## `geom_smooth()` using formula = 'y ~ x'

#Area size

ggplot(data = k2[!is.na(k2$sales_price), ], aes(x = area_size, y = sales_price)) +
  geom_point(col = "blue")+
  geom_smooth(method = "lm", se= FALSE, color = "black", aes(group = 1)) +
  scale_y_continuous(breaks = seq(0,100,15), labels = comma) + 
  geom_text_repel(aes(label = ifelse(k2$area_size[!is.na(k2$sales_price)] > 50, rownames(k2), "")))

## `geom_smooth()` using formula = 'y ~ x'

## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

check correlation

cor(k2$sales_price, k2$area_size)

## [1] 0.4500828

#Preparing data for modeling

#to drop high correlated variables
#dropvars <- c("", "","")
#k2 <- k2[, !names(k2) %in% dropvars]

#normalize the data
prenum <- preProcess(k2, method = c("center", "scale"))
print(prenum)

## Created from 1091 samples and 14 variables
## 
## Pre-processing:
##   - centered (5)
##   - ignored (9)
##   - scaled (5)

DFnorm <- predict(prenum, k2)
dim(DFnorm)

## [1] 1091   14

#One hot encoding the categorical variables The last step needed to ensure that all predictors are converted into numeric columns (which is required by most Machine Learning algorithms) is to ‘one-hot encode’ the categorical variables. This basically means that all (not ordinal) factor values are getting a seperate colums with 1s and 0s (1 basically means Yes/Present). To do this one-hot encoding, I am using the model.matrix() function.

#dealing with skeness of response variable
skew(k2$sales_price)

## [1] 4.709454

qqnorm(k2$sales_price)
qqline(k2$sales_price)

The skew of 1.87 indicates a right skew that is too high, and the Q-Q plot shows that sale prices are also not normally distributed. To fix this I am taking the log of SalePrice.

k2$sales_price <- log(k2$sales_price)
skew(k2$sales_price)

## [1] 0.7246697

qqnorm(k2$sales_price)
qqline(k2$sales_price)

#Compose train and test datasets

train1 <- k2[!is.na(k2$sales_price), ]
test1 <- k2[is.na(k2$sales_price), ]

#Modeling #Lasso regression model I have also tried Ridge and Elastic Net models, but since lasso gives the best results of those 3 models I am only keeping the lasso model in the document.

The elastic-net penalty is controlled by alpha, and bridges the gap between lasso (alpha=1) and ridge (alpha=0). The tuning parameter lambda controls the overall strength of the penalty. It is known that the ridge penalty shrinks the coefficients of correlated predictors towards each other while the lasso tends to pick one of them and discard the others.

Below, I am using caret cross validation to find the best value for lambda, which is the only hyperparameter that needs to be tuned for the lasso model.

set.seed(1234)
mycontrol <- trainControl(method = "cv", number = 5)
lassogrid <- expand.grid(alpha =1, lambda = seq(0.001, 0.1, by = 0.0005))

lassomod <- train(x = train1, y = k2$sales_price[!is.na(k2$sales_price)], method = "glmnet", trControl = mycontrol, tuneGrid = lassogrid)

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in cbind2(1, newx) %*% nbeta: NAs introduced by coercion

## Warning in storage.mode(xd) <- "double": NAs introduced by coercion

lassomod$bestTune

##    alpha lambda
## 59     1   0.03

min(lassomod$results$RMSE)

## [1] 0.03061157

lassovarimp <- varImp(lassomod, scale = FALSE)
lassoImportance <- lassovarimp$importance

varsSelected <- length(which(lassoImportance$Overall != 0))
varsNorSelected <- length(which(lassoImportance$Overall == 0))

cat("Lasso uses", varsSelected, "vriables in its model, and did not select ",  varsNorSelected, "variables")

## Lasso uses 1 vriables in its model, and did not select  13 variables

lassopred <- predict(lassomod, test1)

## Warning in cbind2(1, newx): number of rows of result is not a multiple of
## vector length (arg 1)

predictions_lasso <- exp(lassopred)
head(predictions_lasso)

## numeric(0)

price-prediction

2024-04-28