The purpose of this project is to analyse how houses are priced in King County. Many factors drive house prices; the objective of this project is to identify the factors that matter the most.
# Load the King County house-sales data into `house`.
# The file is read through an explicit path instead of setwd(), so the
# session's working directory is left unchanged.
# NOTE(review): the directory below is machine-specific; point it at the
# folder that holds kc_house_data.csv.
house <- read.csv(file.path("C:/Users/alouk/Downloads", "kc_house_data.csv"))
# attach() is retained only because the statements further down refer to
# columns by bare name (price, grade, ...). Prefer house$col or
# `data = house` in new code.
attach(house)
# Number of rows and columns in the loaded data frame.
dim(house)
## [1] 21613 21
# psych supplies describe(), which prints per-column summary statistics
# (n, mean, sd, median, trimmed mean, mad, min, max, range, skew,
# kurtosis, se) for every column of the data frame.
library(psych)
describe(house)
## vars n mean sd median
## id 1 21613 4580301520.86 2.876566e+09 3.90493e+09
## date* 2 21613 178.30 1.095000e+02 1.68000e+02
## price 3 21613 540088.14 3.671272e+05 4.50000e+05
## bedrooms 4 21613 3.37 9.300000e-01 3.00000e+00
## bathrooms 5 21613 2.11 7.700000e-01 2.25000e+00
## sqft_living 6 21613 2079.90 9.184400e+02 1.91000e+03
## sqft_lot 7 21613 15106.97 4.142051e+04 7.61800e+03
## floors 8 21613 1.49 5.400000e-01 1.50000e+00
## waterfront 9 21613 0.01 9.000000e-02 0.00000e+00
## view 10 21613 0.23 7.700000e-01 0.00000e+00
## condition 11 21613 3.41 6.500000e-01 3.00000e+00
## grade 12 21613 7.66 1.180000e+00 7.00000e+00
## sqft_above 13 21613 1788.39 8.280900e+02 1.56000e+03
## sqft_basement 14 21613 291.51 4.425800e+02 0.00000e+00
## yr_built 15 21613 1971.01 2.937000e+01 1.97500e+03
## yr_renovated 16 21613 84.40 4.016800e+02 0.00000e+00
## zipcode 17 21613 98077.94 5.351000e+01 9.80650e+04
## lat 18 21613 47.56 1.400000e-01 4.75700e+01
## long 19 21613 -122.21 1.400000e-01 -1.22230e+02
## sqft_living15 20 21613 1986.55 6.853900e+02 1.84000e+03
## sqft_lot15 21 21613 12768.46 2.730418e+04 7.62000e+03
## trimmed mad min max
## id 4500014357.18 3.561991e+09 1000102.00 9900000190.00
## date* 176.64 1.438100e+02 1.00 372.00
## price 481704.02 2.223900e+05 75000.00 7700000.00
## bedrooms 3.34 1.480000e+00 0.00 33.00
## bathrooms 2.07 7.400000e-01 0.00 8.00
## sqft_living 1984.40 8.006000e+02 290.00 13540.00
## sqft_lot 8259.53 3.881450e+03 520.00 1651359.00
## floors 1.45 7.400000e-01 1.00 3.50
## waterfront 0.00 0.000000e+00 0.00 1.00
## view 0.00 0.000000e+00 0.00 4.00
## condition 3.30 0.000000e+00 1.00 5.00
## grade 7.58 1.480000e+00 1.00 13.00
## sqft_above 1682.94 6.671700e+02 290.00 9410.00
## sqft_basement 205.25 0.000000e+00 0.00 4820.00
## yr_built 1973.10 3.410000e+01 1900.00 2015.00
## yr_renovated 0.00 0.000000e+00 0.00 2015.00
## zipcode 98074.72 6.227000e+01 98001.00 98199.00
## lat 47.57 1.600000e-01 47.16 47.78
## long -122.23 1.500000e-01 -122.52 -121.31
## sqft_living15 1914.07 6.078700e+02 399.00 6210.00
## sqft_lot15 7903.21 3.713910e+03 651.00 871200.00
## range skew kurtosis se
## id 9.899000e+09 0.24 -1.26 19566662.38
## date* 3.710000e+02 0.15 -1.27 0.74
## price 7.625000e+06 4.02 34.57 2497.23
## bedrooms 3.300000e+01 1.97 49.05 0.01
## bathrooms 8.000000e+00 0.51 1.28 0.01
## sqft_living 1.325000e+04 1.47 5.24 6.25
## sqft_lot 1.650839e+06 13.06 284.98 281.75
## floors 2.500000e+00 0.62 -0.49 0.00
## waterfront 1.000000e+00 11.38 127.59 0.00
## view 4.000000e+00 3.40 10.89 0.01
## condition 4.000000e+00 1.03 0.53 0.00
## grade 1.200000e+01 0.77 1.19 0.01
## sqft_above 9.120000e+03 1.45 3.40 5.63
## sqft_basement 4.820000e+03 1.58 2.71 3.01
## yr_built 1.150000e+02 -0.47 -0.66 0.20
## yr_renovated 2.015000e+03 4.55 18.69 2.73
## zipcode 1.980000e+02 0.41 -0.85 0.36
## lat 6.200000e-01 -0.49 -0.68 0.00
## long 1.200000e+00 0.88 1.05 0.00
## sqft_living15 5.811000e+03 1.11 1.60 4.66
## sqft_lot15 8.705490e+05 9.51 150.71 185.73
# Show the storage type and the first few values of every column.
str(house)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
## 1 bathrooms
# Frequency of each distinct bathrooms value (column attached from house).
table(bathrooms)
## bathrooms
## 0 0.5 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75
## 10 4 72 3852 9 1446 3048 1930 2047 5380 1185 753 589 731 155
## 4 4.25 4.5 4.75 5 5.25 5.5 5.75 6 6.25 6.5 6.75 7.5 7.75 8
## 136 79 100 23 21 13 10 4 6 2 2 2 1 1 2
## 2 Bedrooms
# Frequency of each distinct bedrooms value (column attached from house).
table(bedrooms)
## bedrooms
## 0 1 2 3 4 5 6 7 8 9 10 11 33
## 13 199 2760 9824 6882 1601 272 38 13 6 3 1 1
## 3 Number of floors
# Frequency of each distinct floors value (column attached from house).
table(floors)
## floors
## 1 1.5 2 2.5 3 3.5
## 10680 1910 8241 161 613 8
## 4 bathrooms
# NOTE(review): this repeats the bathrooms table already produced under
# heading 1 -- possibly a different column was intended here; confirm.
table(bathrooms)
## bathrooms
## 0 0.5 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75
## 10 4 72 3852 9 1446 3048 1930 2047 5380 1185 753 589 731 155
## 4 4.25 4.5 4.75 5 5.25 5.5 5.75 6 6.25 6.5 6.75 7.5 7.75 8
## 136 79 100 23 21 13 10 4 6 2 2 2 1 1 2
## 5 Grade
# Frequency of each distinct grade value (column attached from house).
table(grade)
## grade
## 1 3 4 5 6 7 8 9 10 11 12 13
## 1 3 29 242 2038 8981 6068 2615 1134 399 90 13
## 6 Built in year
# Number of houses per construction year (column attached from house).
table(yr_built)
## yr_built
## 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
## 87 29 27 46 45 74 92 65 86 94 134 73 79 59 54
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929
## 64 79 56 120 88 98 76 95 84 139 165 180 115 126 114
## 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944
## 90 61 38 30 21 24 40 68 52 106 156 161 223 170 140
## 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959
## 95 126 263 235 195 250 229 220 223 305 271 198 198 224 334
## 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974
## 248 224 312 256 172 187 250 350 381 280 132 104 149 149 162
## 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
## 189 253 417 387 343 240 199 105 212 229 228 215 294 270 290
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
## 320 224 198 202 249 169 195 177 239 265 218 305 222 422 433
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## 450 454 417 367 230 143 130 170 201 559 38
## 1. Floors and grade
# Two-way contingency table: rows are floors, columns are grade.
xtabs(~ floors + grade)
## grade
## floors 1 3 4 5 6 7 8 9 10 11 12 13
## 1 1 3 27 202 1662 5916 2233 447 142 34 11 2
## 1.5 0 0 2 38 311 1006 402 105 35 11 0 0
## 2 0 0 0 2 63 1943 2989 1935 906 323 72 8
## 2.5 0 0 0 0 2 15 53 46 26 14 2 3
## 3 0 0 0 0 0 100 385 82 25 17 4 0
## 3.5 0 0 0 0 0 1 6 0 0 0 1 0
## 2. Bedrooms and grade
# Two-way contingency table: rows are bedrooms, columns are grade.
xtabs(~ bedrooms + grade)
## grade
## bedrooms 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 0 2 0 0 6 3 0 0 0 1 0
## 1 0 3 12 37 78 52 14 2 1 0 0 0
## 2 0 0 14 114 824 1205 499 78 21 3 2 0
## 3 0 0 1 62 854 4917 2796 832 296 56 9 1
## 4 0 0 0 21 233 2177 2194 1351 615 239 49 3
## 5 0 0 0 5 41 501 455 313 173 83 24 6
## 6 0 0 0 3 7 98 90 33 22 13 3 3
## 7 0 0 0 0 1 11 12 4 5 4 1 0
## 8 0 0 0 0 0 6 4 1 1 0 1 0
## 9 0 0 0 0 0 4 1 0 0 1 0 0
## 10 0 0 0 0 0 2 0 1 0 0 0 0
## 11 0 0 0 0 0 1 0 0 0 0 0 0
## 33 0 0 0 0 0 1 0 0 0 0 0 0
## 3. bathrooms and grade
# Two-way contingency table: rows are bathrooms, columns are grade.
xtabs(~ bathrooms + grade)
## grade
## bathrooms 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 2 0 0 0 4 2 0 0 0 1 0
## 0.5 0 0 0 1 2 0 1 0 0 0 0 0
## 0.75 0 1 14 14 26 17 0 0 0 0 0 0
## 1 0 0 14 190 1414 2084 143 7 0 0 0 0
## 1.25 0 0 0 0 1 3 2 2 0 1 0 0
## 1.5 0 0 0 9 137 984 288 23 4 1 0 0
## 1.75 0 0 1 9 225 1899 808 92 12 2 0 0
## 2 0 0 0 17 184 1165 458 93 13 0 0 0
## 2.25 0 0 0 0 4 778 956 237 66 6 0 0
## 2.5 0 0 0 1 24 1390 2278 1226 385 68 7 1
## 2.75 0 0 0 1 10 322 448 286 100 16 2 0
## 3 0 0 0 0 8 216 269 161 79 18 2 0
## 3.25 0 0 0 0 0 37 165 182 130 61 13 1
## 3.5 0 0 0 0 2 40 181 223 181 84 20 0
## 3.75 0 0 0 0 1 12 24 34 50 29 4 1
## 4 0 0 0 0 0 12 18 22 43 30 10 1
## 4.25 0 0 0 0 0 3 7 9 29 24 6 1
## 4.5 0 0 0 0 0 8 14 15 30 28 5 0
## 4.75 0 0 0 0 0 1 2 1 5 11 2 1
## 5 0 0 0 0 0 3 2 0 5 7 3 1
## 5.25 0 0 0 0 0 2 0 2 1 5 3 0
## 5.5 0 0 0 0 0 0 0 0 1 5 2 2
## 5.75 0 0 0 0 0 0 1 0 0 0 2 1
## 6 0 0 0 0 0 0 1 0 0 0 5 0
## 6.25 0 0 0 0 0 0 0 0 0 1 0 1
## 6.5 0 0 0 0 0 0 0 0 0 1 1 0
## 6.75 0 0 0 0 0 0 0 0 0 1 1 0
## 7.5 0 0 0 0 0 1 0 0 0 0 0 0
## 7.75 0 0 0 0 0 0 0 0 0 0 0 1
## 8 0 0 0 0 0 0 0 0 0 0 1 1
# Horizontal box plots of four attached columns, one plot per variable.
## 1. PRICING
boxplot(price, col = "orange", horizontal = TRUE)
## 2. Square feet living
boxplot(sqft_living, col = "blue", horizontal = TRUE)
## 3. Grade
boxplot(grade, col = "red", horizontal = TRUE)
## 4. bathrooms
boxplot(bathrooms, col = "orange", horizontal = TRUE)
# Histograms of price, bathrooms and grade. All three share the same
# frequency axis (0 to 15000) so the bar heights are directly comparable.
# The stray trailing comma that each call used to end with has been removed:
# it passed an empty argument on to the plotting code.
## 1. HOUSE PRICE
hist(price, xlab = "Price of houses", ylab = "Frequency", ylim = c(0, 15000))
## 2. bathrooms
hist(bathrooms, xlab = "bathrooms of houses", ylab = "Frequency",
     ylim = c(0, 15000))
## 3. grade
hist(grade, xlab = "Grade of houses", ylab = "Frequency", ylim = c(0, 15000))
# Scatter plots of price (y axis) against three candidate predictors.
## price vs squarefeet
plot(price ~ sqft_living)
## price vs Grade
plot(price ~ grade)
## price vs bathrooms
plot(price ~ bathrooms)
# Pairwise correlations of columns 3 to 21 (price through sqft_lot15),
# i.e. every column except id and date.
cor(house[, 3:21])
## price bedrooms bathrooms sqft_living
## price 1.00000000 0.308349598 0.52513751 0.70203505
## bedrooms 0.30834960 1.000000000 0.51588364 0.57667069
## bathrooms 0.52513751 0.515883638 1.00000000 0.75466528
## sqft_living 0.70203505 0.576670693 0.75466528 1.00000000
## sqft_lot 0.08966086 0.031703243 0.08773966 0.17282566
## floors 0.25679389 0.175428935 0.50065317 0.35394929
## waterfront 0.26636943 -0.006582479 0.06374363 0.10381782
## view 0.39729349 0.079531852 0.18773702 0.28461119
## condition 0.03636179 0.028472104 -0.12498193 -0.05875259
## grade 0.66743426 0.356966725 0.66498253 0.76270448
## sqft_above 0.60556730 0.477600161 0.68534248 0.87659660
## sqft_basement 0.32381602 0.303093375 0.28377003 0.43504297
## yr_built 0.05401153 0.154178069 0.50601944 0.31804877
## yr_renovated 0.12643379 0.018840823 0.05073898 0.05536293
## zipcode -0.05320285 -0.152668487 -0.20386627 -0.19943004
## lat 0.30700348 -0.008931010 0.02457295 0.05252946
## long 0.02162624 0.129472975 0.22304184 0.24022330
## sqft_living15 0.58537890 0.391637524 0.56863429 0.75642026
## sqft_lot15 0.08244715 0.029244224 0.08717536 0.18328555
## sqft_lot floors waterfront view
## price 0.089660861 0.256793888 0.266369434 0.397293488
## bedrooms 0.031703243 0.175428935 -0.006582479 0.079531852
## bathrooms 0.087739662 0.500653173 0.063743629 0.187737024
## sqft_living 0.172825661 0.353949290 0.103817818 0.284611186
## sqft_lot 1.000000000 -0.005200991 0.021603683 0.074710106
## floors -0.005200991 1.000000000 0.023698320 0.029443820
## waterfront 0.021603683 0.023698320 1.000000000 0.401857351
## view 0.074710106 0.029443820 0.401857351 1.000000000
## condition -0.008958250 -0.263767946 0.016653157 0.045989737
## grade 0.113621124 0.458182514 0.082774914 0.251320585
## sqft_above 0.183512281 0.523884710 0.072074592 0.167649344
## sqft_basement 0.015286202 -0.245704542 0.080587939 0.276946579
## yr_built 0.053080367 0.489319425 -0.026161086 -0.053439851
## yr_renovated 0.007643505 0.006338401 0.092884837 0.103917288
## zipcode -0.129574486 -0.059120642 0.030284728 0.084826917
## lat -0.085682788 0.049614131 -0.014273776 0.006156732
## long 0.229520859 0.125419028 -0.041910200 -0.078399712
## sqft_living15 0.144608174 0.279885265 0.086463136 0.280439082
## sqft_lot15 0.718556752 -0.011269187 0.030703283 0.072574568
## condition grade sqft_above sqft_basement
## price 0.036361789 0.66743426 0.6055672984 0.32381602
## bedrooms 0.028472104 0.35696673 0.4776001614 0.30309338
## bathrooms -0.124981933 0.66498253 0.6853424759 0.28377003
## sqft_living -0.058752587 0.76270448 0.8765965987 0.43504297
## sqft_lot -0.008958250 0.11362112 0.1835122809 0.01528620
## floors -0.263767946 0.45818251 0.5238847103 -0.24570454
## waterfront 0.016653157 0.08277491 0.0720745917 0.08058794
## view 0.045989737 0.25132058 0.1676493441 0.27694658
## condition 1.000000000 -0.14467367 -0.1582136164 0.17410491
## grade -0.144673671 1.00000000 0.7559229376 0.16839182
## sqft_above -0.158213616 0.75592294 1.0000000000 -0.05194331
## sqft_basement 0.174104914 0.16839182 -0.0519433068 1.00000000
## yr_built -0.361416562 0.44696320 0.4238983517 -0.13312410
## yr_renovated -0.060617787 0.01441428 0.0232846879 0.07132290
## zipcode 0.003025524 -0.18486209 -0.2611899765 0.07484461
## lat -0.014941006 0.11408406 -0.0008164986 0.11053796
## long -0.106500448 0.19837215 0.3438030175 -0.14476477
## sqft_living15 -0.092824268 0.71320209 0.7318702924 0.20035498
## sqft_lot15 -0.003405523 0.11924790 0.1940498619 0.01727618
## yr_built yr_renovated zipcode lat
## price 0.05401153 0.126433793 -0.053202854 0.3070034800
## bedrooms 0.15417807 0.018840823 -0.152668487 -0.0089310097
## bathrooms 0.50601944 0.050738978 -0.203866274 0.0245729528
## sqft_living 0.31804877 0.055362927 -0.199430043 0.0525294622
## sqft_lot 0.05308037 0.007643505 -0.129574486 -0.0856827882
## floors 0.48931942 0.006338401 -0.059120642 0.0496141310
## waterfront -0.02616109 0.092884837 0.030284728 -0.0142737756
## view -0.05343985 0.103917288 0.084826917 0.0061567321
## condition -0.36141656 -0.060617787 0.003025524 -0.0149410064
## grade 0.44696320 0.014414281 -0.184862093 0.1140840571
## sqft_above 0.42389835 0.023284688 -0.261189977 -0.0008164986
## sqft_basement -0.13312410 0.071322902 0.074844608 0.1105379580
## yr_built 1.00000000 -0.224873518 -0.346869178 -0.1481224021
## yr_renovated -0.22487352 1.000000000 0.064357057 0.0293976092
## zipcode -0.34686918 0.064357057 1.000000000 0.2670479500
## lat -0.14812240 0.029397609 0.267047950 1.0000000000
## long 0.40935620 -0.068372369 -0.564071606 -0.1355117836
## sqft_living15 0.32622890 -0.002672555 -0.279032997 0.0488579321
## sqft_lot15 0.07095793 0.007853765 -0.147221069 -0.0864188072
## long sqft_living15 sqft_lot15
## price 0.02162624 0.585378904 0.082447153
## bedrooms 0.12947298 0.391637524 0.029244224
## bathrooms 0.22304184 0.568634290 0.087175361
## sqft_living 0.24022330 0.756420259 0.183285551
## sqft_lot 0.22952086 0.144608174 0.718556752
## floors 0.12541903 0.279885265 -0.011269187
## waterfront -0.04191020 0.086463136 0.030703283
## view -0.07839971 0.280439082 0.072574568
## condition -0.10650045 -0.092824268 -0.003405523
## grade 0.19837215 0.713202093 0.119247897
## sqft_above 0.34380302 0.731870292 0.194049862
## sqft_basement -0.14476477 0.200354983 0.017276181
## yr_built 0.40935620 0.326228900 0.070957926
## yr_renovated -0.06837237 -0.002672555 0.007853765
## zipcode -0.56407161 -0.279032997 -0.147221069
## lat -0.13551178 0.048857932 -0.086418807
## long 1.00000000 0.334604984 0.254451288
## sqft_living15 0.33460498 1.000000000 0.183191749
## sqft_lot15 0.25445129 0.183191749 1.000000000
library(corrgram)
# Correlogram drawn from the correlation matrix of columns 3 to 21.
corrgram(x = cor(house[, 3:21]))
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of columns 3 to 15 (price through yr_built).
# The observations themselves are passed in. The earlier call passed
# cor(house[, 3:15]), which plotted correlation coefficients against each
# other instead of the data.
scatterplotMatrix(house[, 3:15])
# Linear model of price on living area, grade and bathroom count.
# `data = house` makes lm() take the variables from the data frame itself
# rather than from whatever happens to be attached on the search path.
model1 <- lm(price ~ sqft_living + grade + bathrooms, data = house)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(model1)
##
## Call:
## lm(formula = price ~ sqft_living + grade + bathrooms, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1008780 -136352 -23106 100673 4801889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.977e+05 1.326e+04 -45.07 <2e-16 ***
## sqft_living 2.033e+02 3.331e+00 61.04 <2e-16 ***
## grade 1.039e+05 2.286e+03 45.44 <2e-16 ***
## bathrooms -3.809e+04 3.440e+03 -11.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 249800 on 21609 degrees of freedom
## Multiple R-squared: 0.5371, Adjusted R-squared: 0.5371
## F-statistic: 8359 on 3 and 21609 DF, p-value: < 2.2e-16
# Test the linear association between price and grade.
# The previous call, t.test(price, grade), compared the mean sale price with
# the mean grade score. Those two quantities are on unrelated scales, so its
# tiny p-value says nothing about whether grade is related to price.
# NOTE: the output recorded below was produced by that earlier t.test call;
# re-run this statement to refresh it.
cor.test(house$price, house$grade)
##
## Welch Two Sample t-test
##
## data: price and grade
## t = 216.27, df = 21612, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 535185.7 544975.2
## sample estimates:
## mean of x mean of y
## 5.400881e+05 7.656873e+00
The very small p-value of the F-statistic (< 2.2e-16) leads us to reject the null hypothesis. Therefore, the predictors included in the model do affect house pricing in King County.
We have found empirical support for our hypothesis, as the p-value is < 0.01. Therefore, grade, bathrooms and sqft_living (living area in square feet) do affect house pricing.