mydata <- read.csv(paste("kc_house_data.csv"))
head(mydata)
##           id            date   price bedrooms bathrooms sqft_living
## 1 7129300520 20141013T000000  221900        3      1.00        1180
## 2 6414100192 20141209T000000  538000        3      2.25        2570
## 3 5631500400 20150225T000000  180000        2      1.00         770
## 4 2487200875 20141209T000000  604000        4      3.00        1960
## 5 1954400510 20150218T000000  510000        3      2.00        1680
## 6 7237550310 20140512T000000 1225000        4      4.50        5420
##   sqft_lot floors waterfront view condition grade sqft_above sqft_basement
## 1     5650      1          0    0         3     7       1180             0
## 2     7242      2          0    0         3     7       2170           400
## 3    10000      1          0    0         3     6        770             0
## 4     5000      1          0    0         5     7       1050           910
## 5     8080      1          0    0         3     8       1680             0
## 6   101930      1          0    0         3    11       3890          1530
##   yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1     1955            0   98178 47.5112 -122.257          1340       5650
## 2     1951         1991   98125 47.7210 -122.319          1690       7639
## 3     1933            0   98028 47.7379 -122.233          2720       8062
## 4     1965            0   98136 47.5208 -122.393          1360       5000
## 5     1987            0   98074 47.6168 -122.045          1800       7503
## 6     2001            0   98053 47.6561 -122.005          4760     101930
summary(mydata)
##        id                         date           price        
##  Min.   :1.000e+06   20140623T000000:  142   Min.   :  75000  
##  1st Qu.:2.123e+09   20140625T000000:  131   1st Qu.: 321950  
##  Median :3.905e+09   20140626T000000:  131   Median : 450000  
##  Mean   :4.580e+09   20140708T000000:  127   Mean   : 540088  
##  3rd Qu.:7.309e+09   20150427T000000:  126   3rd Qu.: 645000  
##  Max.   :9.900e+09   20150325T000000:  123   Max.   :7700000  
##                      (Other)        :20833                    
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 0.000   Min.   :0.000   Min.   :  290   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7618  
##  Mean   : 3.371   Mean   :2.115   Mean   : 2080   Mean   :  15107  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1651359  
##                                                                    
##      floors        waterfront            view          condition    
##  Min.   :1.000   Min.   :0.000000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :1.500   Median :0.000000   Median :0.0000   Median :3.000  
##  Mean   :1.494   Mean   :0.007542   Mean   :0.2343   Mean   :3.409  
##  3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000  
##  Max.   :3.500   Max.   :1.000000   Max.   :4.0000   Max.   :5.000  
##                                                                     
##      grade          sqft_above   sqft_basement       yr_built   
##  Min.   : 1.000   Min.   : 290   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.000   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.000   Median :1560   Median :   0.0   Median :1975  
##  Mean   : 7.657   Mean   :1788   Mean   : 291.5   Mean   :1971  
##  3rd Qu.: 8.000   3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997  
##  Max.   :13.000   Max.   :9410   Max.   :4820.0   Max.   :2015  
##                                                                 
##   yr_renovated       zipcode           lat             long       
##  Min.   :   0.0   Min.   :98001   Min.   :47.16   Min.   :-122.5  
##  1st Qu.:   0.0   1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3  
##  Median :   0.0   Median :98065   Median :47.57   Median :-122.2  
##  Mean   :  84.4   Mean   :98078   Mean   :47.56   Mean   :-122.2  
##  3rd Qu.:   0.0   3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1  
##  Max.   :2015.0   Max.   :98199   Max.   :47.78   Max.   :-121.3  
##                                                                   
##  sqft_living15    sqft_lot15    
##  Min.   : 399   Min.   :   651  
##  1st Qu.:1490   1st Qu.:  5100  
##  Median :1840   Median :  7620  
##  Mean   :1987   Mean   : 12768  
##  3rd Qu.:2360   3rd Qu.: 10083  
##  Max.   :6210   Max.   :871200  
## 
attach(mydata)
str(mydata)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
cor(price,bedrooms,method = "pearson", use = "complete.obs")
## [1] 0.3083496
cor(price,bathrooms,method = "pearson", use = "complete.obs")
## [1] 0.5251375
cor(price,sqft_living,method = "pearson", use = "complete.obs")
## [1] 0.7020351
cor(price,floors,method = "pearson", use = "complete.obs")
## [1] 0.2567939
cor(price,condition,method = "pearson", use = "complete.obs")
## [1] 0.03636179
cor(price,yr_built,method = "pearson", use = "complete.obs")
## [1] 0.05401153
cor(price,yr_renovated,method = "pearson", use = "complete.obs")
## [1] 0.1264338
cor(price,sqft_above,method = "pearson")
## [1] 0.6055673
cor(price,sqft_basement,method = "pearson")
## [1] 0.323816
cor(price,sqft_lot,method="pearson")
## [1] 0.08966086
cor(price,grade,method = "pearson")
## [1] 0.6674343
cor(price,waterfront,method = "pearson")
## [1] 0.2663694
cor(price,condition,method = "pearson")
## [1] 0.03636179
library(corrplot)
## corrplot 0.84 loaded
mydata1<-mydata[,c("price","bathrooms","bedrooms","floors","condition",
                   "grade","sqft_living","sqft_lot")]
corrplot(cor(mydata1),method="circle")

cor(bathrooms,sqft_living,method="pearson")
## [1] 0.7546653
cor(sqft_above,sqft_living,method = "pearson")
## [1] 0.8765966
cor(sqft_basement,sqft_above,method = "pearson")
## [1] -0.05194331
cor(sqft_living,sqft_basement,method="pearson")
## [1] 0.435043
coor <- mydata[,c("bathrooms","sqft_above","sqft_basement","sqft_living","grade")]
library(corrplot)
corrplot(cor(coor),method="circle")

Since, sqft_above and sqft_basement are both highly correlated to bedrooms, bathrooms and sqft_living, and highly uncorrelated to each other. But sqft_above is highly correlated with grade. Therefore, For regression model. We will be using are grades, bathrooms, sqft_basement.

library("corrgram")
corrgram(mydata, order=TRUE, lower.panel=panel.shade,
  upper.panel=panel.pie, text.panel=panel.txt,
  main="Price of housing in UK") 

library(car)
scatterplotMatrix(~price + bedrooms + bathrooms + grade + waterfront,
                  data=mydata,
    main="Price vs facilities of house")
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

library(car)
scatterplotMatrix(~price+ sqft_living + sqft_above  + sqft_lot+sqft_basement,     data = mydata,
    main="Price vs size of house")