sample1

#Description report provide house price using regression algorithms. the dataset is used in house data in australia

dataset link: here #1 Data extraction import libraries

read house dataset and colums

house <- read.csv("house.csv")
str(house)

## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

This dataset contains 4600 rows and 18 columns. the target variable is price

#2 explonary data analysis ## plot distribution of Price (boxplot)

ggplot(house, aes(y=price))+ geom_boxplot()+
  scale_y_continuous(limits = c(0,2000000))

house$bedrooms2 <- factor(house$bedrooms)
house$city2 <- factor(house$city)
house$statezip2 <- factor(house$statezip)
house$street2 <- factor(house$street)
house$country2 <- factor(house$country)

ggplot(house, aes(x= bedrooms2, y = price))+ geom_boxplot()+
  scale_y_continuous(limits = c(0,2000000))

library(corrgram)
house_num <- house[,2:12]

corrgram(house_num, order= TRUE, upper.panel = panel.pie)

#3 data preprocessing ##3.1 data cleansing remove incorrect prices

idx <- which(house_num$price %in% c(0))
house_num <- house_num[-idx,]
summary(house_num)

##      price             bedrooms       bathrooms      sqft_living   
##  Min.   :    7800   Min.   :0.000   Min.   :0.000   Min.   :  370  
##  1st Qu.:  326264   1st Qu.:3.000   1st Qu.:1.750   1st Qu.: 1460  
##  Median :  465000   Median :3.000   Median :2.250   Median : 1970  
##  Mean   :  557906   Mean   :3.395   Mean   :2.155   Mean   : 2132  
##  3rd Qu.:  657500   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.: 2610  
##  Max.   :26590000   Max.   :9.000   Max.   :8.000   Max.   :13540  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    638   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5000   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7680   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  14835   Mean   :1.512   Mean   :0.006592   Mean   :0.2347  
##  3rd Qu.:  10978   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1074218   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition       sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :3.000   Median :1590   Median :   0.0  
##  Mean   :3.449   Mean   :1822   Mean   : 310.2  
##  3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 600.0  
##  Max.   :5.000   Max.   :9410   Max.   :4820.0

remove rows with outliers

out_price <- boxplot.stats(house_num$price)$out
out_price

##   [1]  2384000  1400000  1200000  1395000  2280000  1425000  1225000  1580000
##   [9]  1870000  1800000  3200000  1500000  1340000  1532500  1750000  1500000
##  [17]  1307000  1250000  2400000  1370000  1157200  1702500  2238888  1225000
##  [25]  1200000  1575000  1315000  1300000  1346400  1225000  1264000  1895000
##  [33]  2555000  1550000  1400000  1680000  1400000  1222500  1255000  2100000
##  [41]  2000000  1270000  1216000  1405000  1190000  1300000  1212500  1275000
##  [49]  1800000  1690000  1506000  1580000  1220000  1185000  1550000  2700000
##  [57]  1600000  1170000  1180500  2150000  1325000  1610000  1200000  2000000
##  [65]  1680000  1570000  1795000  1655000  1381000  1600000  1710000  1285000
##  [73]  1175000  1400000  1210000  1970000  1400000  2453500  3100000  2750000
##  [81]  1425000  2400000  1385000  1365000  1886700  1256500  3710000  1505000
##  [89]  1595000  1250000  1965221  1298000  1270000  1901000  2400000  2005000
##  [97]  1300000  1800000  1415000  1300000  1370000  1280000  2700000  1635000
## [105]  1339000  1350000  1900000  1225000  1230000  1240000  1795000  1180000
## [113]  7062500  2888000  1365000  1325000  1485000  2250000  1184000  1228000
## [121]  1688000  1240000  1335000  1735000  1695000  1735000  1712500  1920000
## [129]  4668000  1170000  1200000  1538000  1185001  2027000  2475000  1728000
## [137]  1555000  1325000  1384000  1600000  4489000  1400000  3000000  1160000
## [145]  1399950  1525000  1325000  1309500  2200000  1440000  1240000  1815000
## [153]  1950000  1300000  1462497  2110000  2400000  1320000  1240000  2075000
## [161]  1580000  1220000  1700000  1215000  1157400  2466350  1295648  1250000
## [169]  1200000  2300000  1229000  1355000  1700000  1330000  1280000  1250000
## [177]  1340000  1388000  1990000  1387800  1410000  1619999  1820000  1195000
## [185]  1600000  1200000  1165000  1230000  1925000  1730000  1309500  1510000
## [193]  2065000  1198000  1297000  3800000  1695000  1640000  2300000  1565000
## [201]  2367000  1356925  1300000  2271150  2147500  1411600  1195000  1875000
## [209]  1755000  1625000  2351956  1200000  1738000  1280000  1275000  1636000
## [217]  1450000  1360000  1300000  1195000  2458000  1205000  1675000  2180000
## [225]  1355000  1465000  1250000  1681000  2680000  2321000  1240000  1550000
## [233]  1234582 12899000  2110000  2199900 26590000  2560498  1337044  1288333

out_idx <- which(house_num$price %in% c(out_price))
house_num <- house_num[-out_idx,]
summary(house_num$price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7800  320000  450000  487457  615000 1150000

##3.2 feature extraction

one hot encoding

house <- house[rownames(house_num),]
#statezip data frame
statezip <- house$statezip
statezip_df <- data.frame(statezip)
colnames(statezip_df) <- c("loc.")

#2 one shot encoding stezip dataframe
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:corrgram':
## 
##     panel.fill

df1 <- dummyVars("~.", data = statezip_df )
df2 <- data.frame(predict(df1, newdata = statezip_df))
house_num <- cbind(house_num, df2)

#3 training and testing
d <- dim(house_num)
m <- d[1]
set.seed(2022)
train.idx <- sample(m, 0.85 * m)
train.idx[1:5]

## [1] 1459 2871 3915  708 2751

train_data <- house_num[train.idx,]
test_data <- house_num[-train.idx,]

#4 modelling

mymodel <- lm(formula = price~. + I(sqft_living^2),
              data =  train_data)

#5 evaluation predict price

actual <- test_data$price
pred.mymodel <- predict(mymodel, test_data)

## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading

#plot answer predict price

price_df <- data.frame(actual, pred.mymodel)
ggplot(price_df, aes(x= actual, y = pred.mymodel))+ geom_point()+
  scale_x_continuous(limits = c(0,2000000))+
  scale_y_continuous(limits = c(0,2000000))

#measure rmse and pearson correlation

performance <- function(pred, act, method){
  e <- pred - act
  se <- e^2
  mse <- mean(se)
  rmse <- sqrt(mse)
  
  r <- cor(pred, act)
  result <- paste ("method :", method,
                   "\n rmse:", round(rmse,3),
                   "\n R:", round(r,3),
                   "\n")
  cat(result)
}


performance(pred.mymodel,actual, "polynomial regression")

## method : polynomial regression 
##  rmse: 105510.188 
##  R: 0.886

sample1

Muhammad Rifki Hartanto

2022-05-15