email: aloukikadityadelhi@gmail.com

College: Maharaja Surajmal Institute of Technology

Introduction

The purpose of this project is to analyse house prices in King County. Many factors drive house prices; the objective of this project is to identify the factors that matter the most.

Importing the data set

# Load the King County house sales data from the local download folder.
# NOTE: this working directory is machine-specific; adjust it when running
# the script elsewhere.
setwd("C:/Users/alouk/Downloads")
house <- read.csv("kc_house_data.csv")
# The columns are attached so that later calls can refer to them by bare
# name (price, grade, bathrooms, ...).
attach(house)

Length and breadth of the dataset

# Number of rows (observations) and columns (variables) in the data set
c(nrow(house), ncol(house))
## [1] 21613    21

Descriptive Statistics of the data

library(psych)
# Per-column summary statistics (mean, sd, median, skew, kurtosis, ...)
psych::describe(x = house)
##               vars     n          mean           sd       median
## id               1 21613 4580301520.86 2.876566e+09  3.90493e+09
## date*            2 21613        178.30 1.095000e+02  1.68000e+02
## price            3 21613     540088.14 3.671272e+05  4.50000e+05
## bedrooms         4 21613          3.37 9.300000e-01  3.00000e+00
## bathrooms        5 21613          2.11 7.700000e-01  2.25000e+00
## sqft_living      6 21613       2079.90 9.184400e+02  1.91000e+03
## sqft_lot         7 21613      15106.97 4.142051e+04  7.61800e+03
## floors           8 21613          1.49 5.400000e-01  1.50000e+00
## waterfront       9 21613          0.01 9.000000e-02  0.00000e+00
## view            10 21613          0.23 7.700000e-01  0.00000e+00
## condition       11 21613          3.41 6.500000e-01  3.00000e+00
## grade           12 21613          7.66 1.180000e+00  7.00000e+00
## sqft_above      13 21613       1788.39 8.280900e+02  1.56000e+03
## sqft_basement   14 21613        291.51 4.425800e+02  0.00000e+00
## yr_built        15 21613       1971.01 2.937000e+01  1.97500e+03
## yr_renovated    16 21613         84.40 4.016800e+02  0.00000e+00
## zipcode         17 21613      98077.94 5.351000e+01  9.80650e+04
## lat             18 21613         47.56 1.400000e-01  4.75700e+01
## long            19 21613       -122.21 1.400000e-01 -1.22230e+02
## sqft_living15   20 21613       1986.55 6.853900e+02  1.84000e+03
## sqft_lot15      21 21613      12768.46 2.730418e+04  7.62000e+03
##                     trimmed          mad        min           max
## id            4500014357.18 3.561991e+09 1000102.00 9900000190.00
## date*                176.64 1.438100e+02       1.00        372.00
## price             481704.02 2.223900e+05   75000.00    7700000.00
## bedrooms               3.34 1.480000e+00       0.00         33.00
## bathrooms              2.07 7.400000e-01       0.00          8.00
## sqft_living         1984.40 8.006000e+02     290.00      13540.00
## sqft_lot            8259.53 3.881450e+03     520.00    1651359.00
## floors                 1.45 7.400000e-01       1.00          3.50
## waterfront             0.00 0.000000e+00       0.00          1.00
## view                   0.00 0.000000e+00       0.00          4.00
## condition              3.30 0.000000e+00       1.00          5.00
## grade                  7.58 1.480000e+00       1.00         13.00
## sqft_above          1682.94 6.671700e+02     290.00       9410.00
## sqft_basement        205.25 0.000000e+00       0.00       4820.00
## yr_built            1973.10 3.410000e+01    1900.00       2015.00
## yr_renovated           0.00 0.000000e+00       0.00       2015.00
## zipcode            98074.72 6.227000e+01   98001.00      98199.00
## lat                   47.57 1.600000e-01      47.16         47.78
## long                -122.23 1.500000e-01    -122.52       -121.31
## sqft_living15       1914.07 6.078700e+02     399.00       6210.00
## sqft_lot15          7903.21 3.713910e+03     651.00     871200.00
##                      range  skew kurtosis          se
## id            9.899000e+09  0.24    -1.26 19566662.38
## date*         3.710000e+02  0.15    -1.27        0.74
## price         7.625000e+06  4.02    34.57     2497.23
## bedrooms      3.300000e+01  1.97    49.05        0.01
## bathrooms     8.000000e+00  0.51     1.28        0.01
## sqft_living   1.325000e+04  1.47     5.24        6.25
## sqft_lot      1.650839e+06 13.06   284.98      281.75
## floors        2.500000e+00  0.62    -0.49        0.00
## waterfront    1.000000e+00 11.38   127.59        0.00
## view          4.000000e+00  3.40    10.89        0.01
## condition     4.000000e+00  1.03     0.53        0.00
## grade         1.200000e+01  0.77     1.19        0.01
## sqft_above    9.120000e+03  1.45     3.40        5.63
## sqft_basement 4.820000e+03  1.58     2.71        3.01
## yr_built      1.150000e+02 -0.47    -0.66        0.20
## yr_renovated  2.015000e+03  4.55    18.69        2.73
## zipcode       1.980000e+02  0.41    -0.85        0.36
## lat           6.200000e-01 -0.49    -0.68        0.00
## long          1.200000e+00  0.88     1.05        0.00
## sqft_living15 5.811000e+03  1.11     1.60        4.66
## sqft_lot15    8.705490e+05  9.51   150.71      185.73
# Storage type and first few values of every column
str(object = house)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...

One-way contingency tables for categorical data

## 1. Bathrooms: number of houses for each distinct value
with(house, table(bathrooms))
## bathrooms
##    0  0.5 0.75    1 1.25  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75 
##   10    4   72 3852    9 1446 3048 1930 2047 5380 1185  753  589  731  155 
##    4 4.25  4.5 4.75    5 5.25  5.5 5.75    6 6.25  6.5 6.75  7.5 7.75    8 
##  136   79  100   23   21   13   10    4    6    2    2    2    1    1    2
## 2. Bedrooms: number of houses for each distinct value
with(house, table(bedrooms))
## bedrooms
##    0    1    2    3    4    5    6    7    8    9   10   11   33 
##   13  199 2760 9824 6882 1601  272   38   13    6    3    1    1
## 3. Floors: number of houses for each distinct value
with(house, table(floors))
## floors
##     1   1.5     2   2.5     3   3.5 
## 10680  1910  8241   161   613     8
## 4. Bathrooms: number of houses for each distinct value
## NOTE(review): this repeats table 1; possibly a different variable was
## intended here - confirm with the author.
with(house, table(bathrooms))
## bathrooms
##    0  0.5 0.75    1 1.25  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75 
##   10    4   72 3852    9 1446 3048 1930 2047 5380 1185  753  589  731  155 
##    4 4.25  4.5 4.75    5 5.25  5.5 5.75    6 6.25  6.5 6.75  7.5 7.75    8 
##  136   79  100   23   21   13   10    4    6    2    2    2    1    1    2
## 5. Grade: number of houses for each distinct value
with(house, table(grade))
## grade
##    1    3    4    5    6    7    8    9   10   11   12   13 
##    1    3   29  242 2038 8981 6068 2615 1134  399   90   13
## 6. Year built: number of houses per construction year
with(house, table(yr_built))
## yr_built
## 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 
##   87   29   27   46   45   74   92   65   86   94  134   73   79   59   54 
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 
##   64   79   56  120   88   98   76   95   84  139  165  180  115  126  114 
## 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 
##   90   61   38   30   21   24   40   68   52  106  156  161  223  170  140 
## 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 
##   95  126  263  235  195  250  229  220  223  305  271  198  198  224  334 
## 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 
##  248  224  312  256  172  187  250  350  381  280  132  104  149  149  162 
## 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 
##  189  253  417  387  343  240  199  105  212  229  228  215  294  270  290 
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 
##  320  224  198  202  249  169  195  177  239  265  218  305  222  422  433 
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 
##  450  454  417  367  230  143  130  170  201  559   38

Two-way contingency tables

## 1. Cross-tabulation of floors (rows) against grade (columns)
xtabs(~ floors + grade, data = house)
##       grade
## floors    1    3    4    5    6    7    8    9   10   11   12   13
##    1      1    3   27  202 1662 5916 2233  447  142   34   11    2
##    1.5    0    0    2   38  311 1006  402  105   35   11    0    0
##    2      0    0    0    2   63 1943 2989 1935  906  323   72    8
##    2.5    0    0    0    0    2   15   53   46   26   14    2    3
##    3      0    0    0    0    0  100  385   82   25   17    4    0
##    3.5    0    0    0    0    0    1    6    0    0    0    1    0
## 2. Cross-tabulation of bedrooms (rows) against grade (columns)
xtabs(~ bedrooms + grade, data = house)
##         grade
## bedrooms    1    3    4    5    6    7    8    9   10   11   12   13
##       0     1    0    2    0    0    6    3    0    0    0    1    0
##       1     0    3   12   37   78   52   14    2    1    0    0    0
##       2     0    0   14  114  824 1205  499   78   21    3    2    0
##       3     0    0    1   62  854 4917 2796  832  296   56    9    1
##       4     0    0    0   21  233 2177 2194 1351  615  239   49    3
##       5     0    0    0    5   41  501  455  313  173   83   24    6
##       6     0    0    0    3    7   98   90   33   22   13    3    3
##       7     0    0    0    0    1   11   12    4    5    4    1    0
##       8     0    0    0    0    0    6    4    1    1    0    1    0
##       9     0    0    0    0    0    4    1    0    0    1    0    0
##       10    0    0    0    0    0    2    0    1    0    0    0    0
##       11    0    0    0    0    0    1    0    0    0    0    0    0
##       33    0    0    0    0    0    1    0    0    0    0    0    0
## 3. Cross-tabulation of bathrooms (rows) against grade (columns)
xtabs(~ bathrooms + grade, data = house)
##          grade
## bathrooms    1    3    4    5    6    7    8    9   10   11   12   13
##      0       1    2    0    0    0    4    2    0    0    0    1    0
##      0.5     0    0    0    1    2    0    1    0    0    0    0    0
##      0.75    0    1   14   14   26   17    0    0    0    0    0    0
##      1       0    0   14  190 1414 2084  143    7    0    0    0    0
##      1.25    0    0    0    0    1    3    2    2    0    1    0    0
##      1.5     0    0    0    9  137  984  288   23    4    1    0    0
##      1.75    0    0    1    9  225 1899  808   92   12    2    0    0
##      2       0    0    0   17  184 1165  458   93   13    0    0    0
##      2.25    0    0    0    0    4  778  956  237   66    6    0    0
##      2.5     0    0    0    1   24 1390 2278 1226  385   68    7    1
##      2.75    0    0    0    1   10  322  448  286  100   16    2    0
##      3       0    0    0    0    8  216  269  161   79   18    2    0
##      3.25    0    0    0    0    0   37  165  182  130   61   13    1
##      3.5     0    0    0    0    2   40  181  223  181   84   20    0
##      3.75    0    0    0    0    1   12   24   34   50   29    4    1
##      4       0    0    0    0    0   12   18   22   43   30   10    1
##      4.25    0    0    0    0    0    3    7    9   29   24    6    1
##      4.5     0    0    0    0    0    8   14   15   30   28    5    0
##      4.75    0    0    0    0    0    1    2    1    5   11    2    1
##      5       0    0    0    0    0    3    2    0    5    7    3    1
##      5.25    0    0    0    0    0    2    0    2    1    5    3    0
##      5.5     0    0    0    0    0    0    0    0    1    5    2    2
##      5.75    0    0    0    0    0    0    1    0    0    0    2    1
##      6       0    0    0    0    0    0    1    0    0    0    5    0
##      6.25    0    0    0    0    0    0    0    0    0    1    0    1
##      6.5     0    0    0    0    0    0    0    0    0    1    1    0
##      6.75    0    0    0    0    0    0    0    0    0    1    1    0
##      7.5     0    0    0    0    0    1    0    0    0    0    0    0
##      7.75    0    0    0    0    0    0    0    0    0    0    0    1
##      8       0    0    0    0    0    0    0    0    0    0    1    1

BOXPLOT

## 1. Price
boxplot(x = house$price, col = "orange", horizontal = TRUE)

## 2. Living area in square feet
boxplot(x = house$sqft_living, col = "blue", horizontal = TRUE)

## 3. Grade
boxplot(x = house$grade, col = "red", horizontal = TRUE)

## 4. Bathrooms
boxplot(x = house$bathrooms, col = "orange", horizontal = TRUE)

Histograms

## Histograms share the same y-axis range (0 to 15000) so that the
## frequencies can be compared across plots.

## 1. House price
hist(price, xlab = "Price of houses", ylab = "Frequency", ylim = c(0, 15000))

## 2. Bathrooms
hist(bathrooms, xlab = "bathrooms of houses", ylab = "Frequency",
     ylim = c(0, 15000))

## 3. Grade
hist(grade, xlab = "Grade of houses", ylab = "Frequency", ylim = c(0, 15000))

PLOTS

## Price against living area (square feet)
plot(price ~ sqft_living, data = house)

## Price against grade
plot(price ~ grade, data = house)

## Price against number of bathrooms
plot(price ~ bathrooms, data = house)

correlation

# Pairwise correlations for columns 3 to 21 (price through sqft_lot15);
# id and date (columns 1 and 2) are left out.
cor(house[, 3:21])
##                     price     bedrooms   bathrooms sqft_living
## price          1.00000000  0.308349598  0.52513751  0.70203505
## bedrooms       0.30834960  1.000000000  0.51588364  0.57667069
## bathrooms      0.52513751  0.515883638  1.00000000  0.75466528
## sqft_living    0.70203505  0.576670693  0.75466528  1.00000000
## sqft_lot       0.08966086  0.031703243  0.08773966  0.17282566
## floors         0.25679389  0.175428935  0.50065317  0.35394929
## waterfront     0.26636943 -0.006582479  0.06374363  0.10381782
## view           0.39729349  0.079531852  0.18773702  0.28461119
## condition      0.03636179  0.028472104 -0.12498193 -0.05875259
## grade          0.66743426  0.356966725  0.66498253  0.76270448
## sqft_above     0.60556730  0.477600161  0.68534248  0.87659660
## sqft_basement  0.32381602  0.303093375  0.28377003  0.43504297
## yr_built       0.05401153  0.154178069  0.50601944  0.31804877
## yr_renovated   0.12643379  0.018840823  0.05073898  0.05536293
## zipcode       -0.05320285 -0.152668487 -0.20386627 -0.19943004
## lat            0.30700348 -0.008931010  0.02457295  0.05252946
## long           0.02162624  0.129472975  0.22304184  0.24022330
## sqft_living15  0.58537890  0.391637524  0.56863429  0.75642026
## sqft_lot15     0.08244715  0.029244224  0.08717536  0.18328555
##                   sqft_lot       floors   waterfront         view
## price          0.089660861  0.256793888  0.266369434  0.397293488
## bedrooms       0.031703243  0.175428935 -0.006582479  0.079531852
## bathrooms      0.087739662  0.500653173  0.063743629  0.187737024
## sqft_living    0.172825661  0.353949290  0.103817818  0.284611186
## sqft_lot       1.000000000 -0.005200991  0.021603683  0.074710106
## floors        -0.005200991  1.000000000  0.023698320  0.029443820
## waterfront     0.021603683  0.023698320  1.000000000  0.401857351
## view           0.074710106  0.029443820  0.401857351  1.000000000
## condition     -0.008958250 -0.263767946  0.016653157  0.045989737
## grade          0.113621124  0.458182514  0.082774914  0.251320585
## sqft_above     0.183512281  0.523884710  0.072074592  0.167649344
## sqft_basement  0.015286202 -0.245704542  0.080587939  0.276946579
## yr_built       0.053080367  0.489319425 -0.026161086 -0.053439851
## yr_renovated   0.007643505  0.006338401  0.092884837  0.103917288
## zipcode       -0.129574486 -0.059120642  0.030284728  0.084826917
## lat           -0.085682788  0.049614131 -0.014273776  0.006156732
## long           0.229520859  0.125419028 -0.041910200 -0.078399712
## sqft_living15  0.144608174  0.279885265  0.086463136  0.280439082
## sqft_lot15     0.718556752 -0.011269187  0.030703283  0.072574568
##                  condition       grade    sqft_above sqft_basement
## price          0.036361789  0.66743426  0.6055672984    0.32381602
## bedrooms       0.028472104  0.35696673  0.4776001614    0.30309338
## bathrooms     -0.124981933  0.66498253  0.6853424759    0.28377003
## sqft_living   -0.058752587  0.76270448  0.8765965987    0.43504297
## sqft_lot      -0.008958250  0.11362112  0.1835122809    0.01528620
## floors        -0.263767946  0.45818251  0.5238847103   -0.24570454
## waterfront     0.016653157  0.08277491  0.0720745917    0.08058794
## view           0.045989737  0.25132058  0.1676493441    0.27694658
## condition      1.000000000 -0.14467367 -0.1582136164    0.17410491
## grade         -0.144673671  1.00000000  0.7559229376    0.16839182
## sqft_above    -0.158213616  0.75592294  1.0000000000   -0.05194331
## sqft_basement  0.174104914  0.16839182 -0.0519433068    1.00000000
## yr_built      -0.361416562  0.44696320  0.4238983517   -0.13312410
## yr_renovated  -0.060617787  0.01441428  0.0232846879    0.07132290
## zipcode        0.003025524 -0.18486209 -0.2611899765    0.07484461
## lat           -0.014941006  0.11408406 -0.0008164986    0.11053796
## long          -0.106500448  0.19837215  0.3438030175   -0.14476477
## sqft_living15 -0.092824268  0.71320209  0.7318702924    0.20035498
## sqft_lot15    -0.003405523  0.11924790  0.1940498619    0.01727618
##                  yr_built yr_renovated      zipcode           lat
## price          0.05401153  0.126433793 -0.053202854  0.3070034800
## bedrooms       0.15417807  0.018840823 -0.152668487 -0.0089310097
## bathrooms      0.50601944  0.050738978 -0.203866274  0.0245729528
## sqft_living    0.31804877  0.055362927 -0.199430043  0.0525294622
## sqft_lot       0.05308037  0.007643505 -0.129574486 -0.0856827882
## floors         0.48931942  0.006338401 -0.059120642  0.0496141310
## waterfront    -0.02616109  0.092884837  0.030284728 -0.0142737756
## view          -0.05343985  0.103917288  0.084826917  0.0061567321
## condition     -0.36141656 -0.060617787  0.003025524 -0.0149410064
## grade          0.44696320  0.014414281 -0.184862093  0.1140840571
## sqft_above     0.42389835  0.023284688 -0.261189977 -0.0008164986
## sqft_basement -0.13312410  0.071322902  0.074844608  0.1105379580
## yr_built       1.00000000 -0.224873518 -0.346869178 -0.1481224021
## yr_renovated  -0.22487352  1.000000000  0.064357057  0.0293976092
## zipcode       -0.34686918  0.064357057  1.000000000  0.2670479500
## lat           -0.14812240  0.029397609  0.267047950  1.0000000000
## long           0.40935620 -0.068372369 -0.564071606 -0.1355117836
## sqft_living15  0.32622890 -0.002672555 -0.279032997  0.0488579321
## sqft_lot15     0.07095793  0.007853765 -0.147221069 -0.0864188072
##                      long sqft_living15   sqft_lot15
## price          0.02162624   0.585378904  0.082447153
## bedrooms       0.12947298   0.391637524  0.029244224
## bathrooms      0.22304184   0.568634290  0.087175361
## sqft_living    0.24022330   0.756420259  0.183285551
## sqft_lot       0.22952086   0.144608174  0.718556752
## floors         0.12541903   0.279885265 -0.011269187
## waterfront    -0.04191020   0.086463136  0.030703283
## view          -0.07839971   0.280439082  0.072574568
## condition     -0.10650045  -0.092824268 -0.003405523
## grade          0.19837215   0.713202093  0.119247897
## sqft_above     0.34380302   0.731870292  0.194049862
## sqft_basement -0.14476477   0.200354983  0.017276181
## yr_built       0.40935620   0.326228900  0.070957926
## yr_renovated  -0.06837237  -0.002672555  0.007853765
## zipcode       -0.56407161  -0.279032997 -0.147221069
## lat           -0.13551178   0.048857932 -0.086418807
## long           1.00000000   0.334604984  0.254451288
## sqft_living15  0.33460498   1.000000000  0.183191749
## sqft_lot15     0.25445129   0.183191749  1.000000000
library(corrgram)
# Correlogram of the correlation matrix of columns 3 to 21
corr_matrix <- cor(house[, 3:21])
corrgram(corr_matrix)

Scatterplot matrix

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of the raw observations in columns 3 to 15
# (price through yr_built). The data frame itself is passed, not cor(...):
# a correlation matrix would be plotted as if its 13 rows were observations.
scatterplotMatrix(house[, 3:15])

Regression

# Linear regression of price on living area, grade and bathrooms.
# The variables are resolved through the earlier attach(house).
model1 <- lm(formula = price ~ sqft_living + grade + bathrooms)
# Residual summary, coefficient table, R-squared and overall F-test
summary(object = model1)
## 
## Call:
## lm(formula = price ~ sqft_living + grade + bathrooms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1008780  -136352   -23106   100673  4801889 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.977e+05  1.326e+04  -45.07   <2e-16 ***
## sqft_living  2.033e+02  3.331e+00   61.04   <2e-16 ***
## grade        1.039e+05  2.286e+03   45.44   <2e-16 ***
## bathrooms   -3.809e+04  3.440e+03  -11.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 249800 on 21609 degrees of freedom
## Multiple R-squared:  0.5371, Adjusted R-squared:  0.5371 
## F-statistic:  8359 on 3 and 21609 DF,  p-value: < 2.2e-16
# Welch two-sample t-test on the means of price and grade.
# NOTE(review): price (dollars) and grade (1-13 index) are on different
# scales, so a difference in their means does not show that grade affects
# price; the coefficient tests in summary(model1) or cor.test(price, grade)
# look like the intended check - confirm with the author.
t.test(x = price, y = grade)
## 
##  Welch Two Sample t-test
## 
## data:  price and grade
## t = 216.27, df = 21612, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  535185.7 544975.2
## sample estimates:
##    mean of x    mean of y 
## 5.400881e+05 7.656873e+00

The very small p-value of the regression F-statistic (< 2.2e-16) leads us to reject the null hypothesis that none of the predictors is related to price. Therefore, the variables included in the model are significantly associated with house prices in King County.

Results

We have found empirical support for our hypothesis, as the p-value is < 0.01. Therefore, grade, bathrooms and sqft_living (living area in square feet) do affect house prices.