#Description report provide house price using regression algorithms. the dataset is used in house data in australia
dataset link: here #1 Data extraction import libraries
read house dataset and colums
house <- read.csv("house.csv")
str(house)
## 'data.frame': 4600 obs. of 18 variables:
## $ date : chr "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
## $ price : num 313000 2384000 342000 420000 550000 ...
## $ bedrooms : num 3 5 3 3 4 2 2 4 3 4 ...
## $ bathrooms : num 1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
## $ sqft_living : int 1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
## $ sqft_lot : int 7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
## $ floors : num 1.5 2 1 1 1 1 1 2 1 1.5 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 4 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 5 4 4 4 3 3 3 4 3 ...
## $ sqft_above : int 1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
## $ sqft_basement: int 0 280 0 1000 800 0 0 0 860 0 ...
## $ yr_built : int 1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
## $ yr_renovated : int 2005 0 0 0 1992 1994 0 0 0 2010 ...
## $ street : chr "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
## $ city : chr "Shoreline" "Seattle" "Kent" "Bellevue" ...
## $ statezip : chr "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
This dataset contains 4600 rows and 18 columns. the target variable is price
#2 explonary data analysis ## plot distribution of Price (boxplot)
ggplot(house, aes(y=price))+ geom_boxplot()+
scale_y_continuous(limits = c(0,2000000))
house$bedrooms2 <- factor(house$bedrooms)
house$city2 <- factor(house$city)
house$statezip2 <- factor(house$statezip)
house$street2 <- factor(house$street)
house$country2 <- factor(house$country)
ggplot(house, aes(x= bedrooms2, y = price))+ geom_boxplot()+
scale_y_continuous(limits = c(0,2000000))
library(corrgram)
house_num <- house[,2:12]
corrgram(house_num, order= TRUE, upper.panel = panel.pie)
#3 data preprocessing ##3.1 data cleansing remove incorrect prices
idx <- which(house_num$price %in% c(0))
house_num <- house_num[-idx,]
summary(house_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 326264 1st Qu.:3.000 1st Qu.:1.750 1st Qu.: 1460
## Median : 465000 Median :3.000 Median :2.250 Median : 1970
## Mean : 557906 Mean :3.395 Mean :2.155 Mean : 2132
## 3rd Qu.: 657500 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2610
## Max. :26590000 Max. :9.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7680 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14835 Mean :1.512 Mean :0.006592 Mean :0.2347
## 3rd Qu.: 10978 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median :1590 Median : 0.0
## Mean :3.449 Mean :1822 Mean : 310.2
## 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 600.0
## Max. :5.000 Max. :9410 Max. :4820.0
remove rows with outliers
out_price <- boxplot.stats(house_num$price)$out
out_price
## [1] 2384000 1400000 1200000 1395000 2280000 1425000 1225000 1580000
## [9] 1870000 1800000 3200000 1500000 1340000 1532500 1750000 1500000
## [17] 1307000 1250000 2400000 1370000 1157200 1702500 2238888 1225000
## [25] 1200000 1575000 1315000 1300000 1346400 1225000 1264000 1895000
## [33] 2555000 1550000 1400000 1680000 1400000 1222500 1255000 2100000
## [41] 2000000 1270000 1216000 1405000 1190000 1300000 1212500 1275000
## [49] 1800000 1690000 1506000 1580000 1220000 1185000 1550000 2700000
## [57] 1600000 1170000 1180500 2150000 1325000 1610000 1200000 2000000
## [65] 1680000 1570000 1795000 1655000 1381000 1600000 1710000 1285000
## [73] 1175000 1400000 1210000 1970000 1400000 2453500 3100000 2750000
## [81] 1425000 2400000 1385000 1365000 1886700 1256500 3710000 1505000
## [89] 1595000 1250000 1965221 1298000 1270000 1901000 2400000 2005000
## [97] 1300000 1800000 1415000 1300000 1370000 1280000 2700000 1635000
## [105] 1339000 1350000 1900000 1225000 1230000 1240000 1795000 1180000
## [113] 7062500 2888000 1365000 1325000 1485000 2250000 1184000 1228000
## [121] 1688000 1240000 1335000 1735000 1695000 1735000 1712500 1920000
## [129] 4668000 1170000 1200000 1538000 1185001 2027000 2475000 1728000
## [137] 1555000 1325000 1384000 1600000 4489000 1400000 3000000 1160000
## [145] 1399950 1525000 1325000 1309500 2200000 1440000 1240000 1815000
## [153] 1950000 1300000 1462497 2110000 2400000 1320000 1240000 2075000
## [161] 1580000 1220000 1700000 1215000 1157400 2466350 1295648 1250000
## [169] 1200000 2300000 1229000 1355000 1700000 1330000 1280000 1250000
## [177] 1340000 1388000 1990000 1387800 1410000 1619999 1820000 1195000
## [185] 1600000 1200000 1165000 1230000 1925000 1730000 1309500 1510000
## [193] 2065000 1198000 1297000 3800000 1695000 1640000 2300000 1565000
## [201] 2367000 1356925 1300000 2271150 2147500 1411600 1195000 1875000
## [209] 1755000 1625000 2351956 1200000 1738000 1280000 1275000 1636000
## [217] 1450000 1360000 1300000 1195000 2458000 1205000 1675000 2180000
## [225] 1355000 1465000 1250000 1681000 2680000 2321000 1240000 1550000
## [233] 1234582 12899000 2110000 2199900 26590000 2560498 1337044 1288333
out_idx <- which(house_num$price %in% c(out_price))
house_num <- house_num[-out_idx,]
summary(house_num$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7800 320000 450000 487457 615000 1150000
##3.2 feature extraction
one hot encoding
house <- house[rownames(house_num),]
#statezip data frame
statezip <- house$statezip
statezip_df <- data.frame(statezip)
colnames(statezip_df) <- c("loc.")
#2 one shot encoding stezip dataframe
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
df1 <- dummyVars("~.", data = statezip_df )
df2 <- data.frame(predict(df1, newdata = statezip_df))
house_num <- cbind(house_num, df2)
#3 training and testing
d <- dim(house_num)
m <- d[1]
set.seed(2022)
train.idx <- sample(m, 0.85 * m)
train.idx[1:5]
## [1] 1459 2871 3915 708 2751
train_data <- house_num[train.idx,]
test_data <- house_num[-train.idx,]
#4 modelling
mymodel <- lm(formula = price~. + I(sqft_living^2),
data = train_data)
#5 evaluation predict price
actual <- test_data$price
pred.mymodel <- predict(mymodel, test_data)
## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading
#plot answer predict price
price_df <- data.frame(actual, pred.mymodel)
ggplot(price_df, aes(x= actual, y = pred.mymodel))+ geom_point()+
scale_x_continuous(limits = c(0,2000000))+
scale_y_continuous(limits = c(0,2000000))
#measure rmse and pearson correlation
performance <- function(pred, act, method){
e <- pred - act
se <- e^2
mse <- mean(se)
rmse <- sqrt(mse)
r <- cor(pred, act)
result <- paste ("method :", method,
"\n rmse:", round(rmse,3),
"\n R:", round(r,3),
"\n")
cat(result)
}
performance(pred.mymodel,actual, "polynomial regression")
## method : polynomial regression
## rmse: 105510.188
## R: 0.886