# Load the house sales data set and take a first look at it.
# The path is a single literal, so it is passed to read.csv() directly
# (wrapping one string in paste() returns the same string).
mydata <- read.csv("kc_house_data.csv")

# First six rows (recorded output follows).
head(mydata)
## id date price bedrooms bathrooms sqft_living
## 1 7129300520 20141013T000000 221900 3 1.00 1180
## 2 6414100192 20141209T000000 538000 3 2.25 2570
## 3 5631500400 20150225T000000 180000 2 1.00 770
## 4 2487200875 20141209T000000 604000 4 3.00 1960
## 5 1954400510 20150218T000000 510000 3 2.00 1680
## 6 7237550310 20140512T000000 1225000 4 4.50 5420
## sqft_lot floors waterfront view condition grade sqft_above sqft_basement
## 1 5650 1 0 0 3 7 1180 0
## 2 7242 2 0 0 3 7 2170 400
## 3 10000 1 0 0 3 6 770 0
## 4 5000 1 0 0 5 7 1050 910
## 5 8080 1 0 0 3 8 1680 0
## 6 101930 1 0 0 3 11 3890 1530
## yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 1955 0 98178 47.5112 -122.257 1340 5650
## 2 1951 1991 98125 47.7210 -122.319 1690 7639
## 3 1933 0 98028 47.7379 -122.233 2720 8062
## 4 1965 0 98136 47.5208 -122.393 1360 5000
## 5 1987 0 98074 47.6168 -122.045 1800 7503
## 6 2001 0 98053 47.6561 -122.005 4760 101930

# Per-column summary statistics (recorded output follows).
summary(mydata)
## id date price
## Min. :1.000e+06 20140623T000000: 142 Min. : 75000
## 1st Qu.:2.123e+09 20140625T000000: 131 1st Qu.: 321950
## Median :3.905e+09 20140626T000000: 131 Median : 450000
## Mean :4.580e+09 20140708T000000: 127 Mean : 540088
## 3rd Qu.:7.309e+09 20150427T000000: 126 3rd Qu.: 645000
## Max. :9.900e+09 20150325T000000: 123 Max. :7700000
## (Other) :20833
## bedrooms bathrooms sqft_living sqft_lot
## Min. : 0.000 Min. :0.000 Min. : 290 Min. : 520
## 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040
## Median : 3.000 Median :2.250 Median : 1910 Median : 7618
## Mean : 3.371 Mean :2.115 Mean : 2080 Mean : 15107
## 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688
## Max. :33.000 Max. :8.000 Max. :13540 Max. :1651359
##
## floors waterfront view condition
## Min. :1.000 Min. :0.000000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000
## Median :1.500 Median :0.000000 Median :0.0000 Median :3.000
## Mean :1.494 Mean :0.007542 Mean :0.2343 Mean :3.409
## 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000
## Max. :3.500 Max. :1.000000 Max. :4.0000 Max. :5.000
##
## grade sqft_above sqft_basement yr_built
## Min. : 1.000 Min. : 290 Min. : 0.0 Min. :1900
## 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951
## Median : 7.000 Median :1560 Median : 0.0 Median :1975
## Mean : 7.657 Mean :1788 Mean : 291.5 Mean :1971
## 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0 3rd Qu.:1997
## Max. :13.000 Max. :9410 Max. :4820.0 Max. :2015
##
## yr_renovated zipcode lat long
## Min. : 0.0 Min. :98001 Min. :47.16 Min. :-122.5
## 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3
## Median : 0.0 Median :98065 Median :47.57 Median :-122.2
## Mean : 84.4 Mean :98078 Mean :47.56 Mean :-122.2
## 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68 3rd Qu.:-122.1
## Max. :2015.0 Max. :98199 Max. :47.78 Max. :-121.3
##
## sqft_living15 sqft_lot15
## Min. : 399 Min. : 651
## 1st Qu.:1490 1st Qu.: 5100
## Median :1840 Median : 7620
## Mean :1987 Mean : 12768
## 3rd Qu.:2360 3rd Qu.: 10083
## Max. :6210 Max. :871200
##
# NOTE(review): attach() is discouraged -- attached columns are copies that can
# be masked by same-named objects in the workspace and do not follow later
# changes to `mydata`. The call is kept only because code outside this section
# may still refer to bare column names; everything below reads its columns
# from `mydata` explicitly.
attach(mydata)

# Structure of the data frame (recorded output follows).
str(mydata)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...

# Pearson correlation of price with each candidate predictor.
# Each predictor appears once (the original computed `condition` twice) and
# every pair uses the same NA handling (`use = "complete.obs"`), which the
# original applied to only some of the calls.
price_predictors <- c(
  "bedrooms", "bathrooms", "sqft_living", "floors", "condition",
  "yr_built", "yr_renovated", "sqft_above", "sqft_basement",
  "sqft_lot", "grade", "waterfront"
)
price_cor <- vapply(
  mydata[price_predictors],
  function(x) cor(mydata$price, x, method = "pearson", use = "complete.obs"),
  numeric(1)
)
price_cor
# Values recorded from the original run, one cor() call per predictor:
#   bedrooms       0.3083496
#   bathrooms      0.5251375
#   sqft_living    0.7020351
#   floors         0.2567939
#   condition      0.03636179
#   yr_built       0.05401153
#   yr_renovated   0.1264338
#   sqft_above     0.6055673
#   sqft_basement  0.323816
#   sqft_lot       0.08966086
#   grade          0.6674343
#   waterfront     0.2663694
library(corrplot)
## corrplot 0.84 loaded

# Correlation plot of price against the main candidate predictors.
mydata1 <- mydata[, c("price", "bathrooms", "bedrooms", "floors", "condition",
                      "grade", "sqft_living", "sqft_lot")]
corrplot(cor(mydata1), method = "circle")

# Collinearity check among the size-related predictors and grade.
# The correlation matrix is computed once from explicit `mydata` columns; the
# individual pairs of interest are read out of it and the same matrix is then
# plotted, instead of recomputing each pair from attached column names.
coor <- mydata[, c("bathrooms", "sqft_above", "sqft_basement", "sqft_living",
                   "grade")]
coor_cor <- cor(coor, method = "pearson")
coor_cor["bathrooms", "sqft_living"]
## [1] 0.7546653
coor_cor["sqft_above", "sqft_living"]
## [1] 0.8765966
coor_cor["sqft_basement", "sqft_above"]
## [1] -0.05194331
coor_cor["sqft_living", "sqft_basement"]
## [1] 0.435043
corrplot(coor_cor, method = "circle")
# sqft_above and sqft_basement are both highly correlated with bedrooms,
# bathrooms and sqft_living, but essentially uncorrelated with each other.
# However, sqft_above is also highly correlated with grade, so it is largely
# redundant once grade is included. Therefore, the predictors we will use for
# the regression model are grade, bathrooms and sqft_basement.
library(corrgram)

# Correlogram of the data set: shaded cells in the lower triangle, pie glyphs
# in the upper triangle, variable names on the diagonal; `order = TRUE` lets
# corrgram reorder the variables.
# The title names King County, USA rather than the UK: the file is
# kc_house_data.csv and its zip codes (98001-98199) and coordinates
# (lat ~47, long ~-122) are in the Seattle area.
corrgram(mydata, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Price of housing in King County, USA")
# car provides scatterplotMatrix(); it only needs to be loaded once for both
# plots below.
library(car)

# Scatterplot matrix of price against room counts, grade and waterfront.
scatterplotMatrix(~ price + bedrooms + bathrooms + grade + waterfront,
                  data = mydata,
                  main = "Price vs facilities of house")
# NOTE(review): the smoother warnings recorded below presumably come from
# panels involving the 0/1 `waterfront` column -- verify.
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

# Scatterplot matrix of price against the size measures.
scatterplotMatrix(~ price + sqft_living + sqft_above + sqft_lot + sqft_basement,
                  data = mydata,
                  main = "Price vs size of house")