King County is a county in the U.S. state of Washington. Its population was 2,149,970 according to a 2016 census estimate. King is the most populous county in Washington and the 13th-most populous in the United States. The county seat is Seattle, which is also the state’s largest city. King County is one of three Washington counties included in the Seattle-Tacoma-Bellevue metropolitan statistical area. About two-thirds of King County’s population lives in Seattle’s suburbs. As of 2011, King County was the 86th highest-income county in the United States. This paper examines the factors that influence the sale prices of houses sold in King County between May 2014 and May 2015.
Our study concerns house prices in King County, USA. Houses in the county vary widely in their features, including the number of bedrooms and bathrooms per house, the area of the house and of the lot, the presence of a waterfront, views, the condition of the house, the grade assigned by the county, the year built, the year renovated and the location of the house. We empirically study how these factors influence house prices. We used chi-square tests and t-tests to evaluate our hypotheses, and regression analysis to identify the best-fitting model for predicting house prices. We found that houses with no waterfront and fewer bedrooms were the cheapest, and that houses with a waterfront had more views than those without.
The specific objective of this study was to investigate the prices of houses located in King County, USA. To that end, we analyzed house sale prices in the county. Our goal was to compare the prices of houses with and without a waterfront, and the views of houses with and without a waterfront. We also built regression models and selected the one that best fits the data to predict prices. Accordingly, we constructed the following hypotheses:
Hypothesis-A: The average price of houses with a waterfront is higher than the average price of houses without a waterfront.
Hypothesis-B: The average number of views for houses with a waterfront is higher than the average number of views for houses without a waterfront.
For this study, we obtained the data from Kaggle (https://www.kaggle.com/harlfoxem/housesalesprediction). Kaggle is a platform for predictive modelling and analytics competitions in which statisticians and data miners compete to produce the best models for predicting and describing datasets uploaded by companies and users.
In order to test Hypothesis-A, we perform the Chi-Square Test and T-Test:
# Load the King County house-sales data into store.df.
# The directory is joined to the file name with file.path() instead of calling
# setwd(), so reading the data no longer changes the session's working
# directory as a side effect. The no-op paste(..., sep = "") wrapper is dropped.
data_dir <- "C:/Users/Maneesh/Desktop"
store.df <- read.csv(file.path(data_dir, "KingcountyData.csv"))
# Cross-tabulate sale price against the waterfront flag and run Pearson's
# chi-squared test of independence on the resulting table.
# NOTE(review): price has thousands of distinct values, so the table is very
# sparse; that is presumably why the approximation warning is raised.
mytable4 <- with(store.df, table(price, waterfront))
chisq.test(x = mytable4)
## Warning in chisq.test(mytable4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable4
## X-squared = 8246.7, df = 4027, p-value < 2.2e-16
# Welch two-sample t-test comparing mean sale price between the two
# waterfront groups (defaults spelled out: two-sided, unequal variances, 95%).
# NOTE(review): Hypothesis-A is directional, while this test is two-sided.
t.test(
  price ~ waterfront,
  data = store.df,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: price by waterfront
## t = -12.876, df = 162.23, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1303661.6 -956963.3
## sample estimates:
## mean in group 0 mean in group 1
## 531563.6 1661876.0
In order to test Hypothesis-B, we perform the Chi-Square Test and T-Test:
# Cross-tabulate the view score against the waterfront flag and run Pearson's
# chi-squared test of independence on the resulting table.
mytable5 <- with(store.df, table(view, waterfront))
chisq.test(x = mytable5)
## Warning in chisq.test(mytable5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable5
## X-squared = 7572.6, df = 4, p-value < 2.2e-16
# Welch two-sample t-test comparing the mean view score between the two
# waterfront groups (defaults spelled out: two-sided, unequal variances, 95%).
t.test(
  view ~ waterfront,
  data = store.df,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: view by waterfront
## t = -80.404, df = 165.87, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.646815 -3.472009
## sample estimates:
## mean in group 0 mean in group 1
## 0.2074592 3.7668712
# Model 1: least-squares fit of sale price on room counts, living/lot area,
# floors, waterfront flag, grade, and build/renovation year.
# NOTE(review): yr_renovated is 0 for most rows, yet it enters as a plain
# numeric predictor -- confirm that this is intended.
regr1 <- lm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + grade + yr_built + yr_renovated,
  data = store.df
)
summary(regr1)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + waterfront + grade + yr_built + yr_renovated, data = store.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1390722 -112849 -9970 91117 4246695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.947e+06 1.274e+05 54.541 < 2e-16 ***
## bedrooms -4.137e+04 2.042e+03 -20.259 < 2e-16 ***
## bathrooms 5.112e+04 3.464e+03 14.757 < 2e-16 ***
## sqft_living 1.779e+02 3.290e+00 54.082 < 2e-16 ***
## sqft_lot -2.399e-01 3.679e-02 -6.522 7.11e-11 ***
## floors 1.695e+04 3.435e+03 4.936 8.05e-07 ***
## waterfront 7.201e+05 1.745e+04 41.273 < 2e-16 ***
## grade 1.288e+05 2.150e+03 59.893 < 2e-16 ***
## yr_built -3.937e+03 6.702e+01 -58.745 < 2e-16 ***
## yr_renovated 5.331e+00 3.906e+00 1.365 0.172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 218900 on 21603 degrees of freedom
## Multiple R-squared: 0.6446, Adjusted R-squared: 0.6445
## F-statistic: 4354 on 9 and 21603 DF, p-value: < 2.2e-16
# Fitted coefficients of Model 1 at full printed precision.
coef(regr1)
## (Intercept) bedrooms bathrooms sqft_living sqft_lot
## 6.947234e+06 -4.137487e+04 5.112191e+04 1.779465e+02 -2.399393e-01
## floors waterfront grade yr_built yr_renovated
## 1.695479e+04 7.201485e+05 1.287741e+05 -3.936823e+03 5.331105e+00
# In-sample predictions (fitted values) from Model 1; show the first six.
pr1 <- predict(regr1)
head(pr1, n = 6L)
## 1 2 3 4 5 6
## 304737.3 658919.6 229946.3 465192.2 447045.2 1507683.6
# Model 2: the Model 1 predictors plus condition, view, sqft_above, zipcode,
# lat, long, sqft_living15 and sqft_lot15.
# NOTE(review): zipcode is read as an integer and therefore enters as a
# numeric predictor rather than a categorical one -- confirm this is intended.
regr2 <- lm(
  price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
    waterfront + grade + condition + view + sqft_above + zipcode +
    lat + long + sqft_living15 + sqft_lot15 + yr_built + yr_renovated,
  data = store.df
)
summary(regr2)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + waterfront + grade + condition + view + sqft_above +
## zipcode + lat + long + sqft_living15 + sqft_lot15 + yr_built +
## yr_renovated, data = store.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1291725 -99229 -9739 77583 4333222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.690e+06 2.931e+06 2.282 0.02249 *
## bedrooms -3.577e+04 1.892e+03 -18.906 < 2e-16 ***
## bathrooms 4.114e+04 3.254e+03 12.645 < 2e-16 ***
## sqft_living 1.501e+02 4.385e+00 34.227 < 2e-16 ***
## sqft_lot 1.286e-01 4.792e-02 2.683 0.00729 **
## floors 6.690e+03 3.596e+03 1.860 0.06285 .
## waterfront 5.830e+05 1.736e+04 33.580 < 2e-16 ***
## grade 9.589e+04 2.153e+03 44.542 < 2e-16 ***
## condition 2.639e+04 2.351e+03 11.221 < 2e-16 ***
## view 5.287e+04 2.140e+03 24.705 < 2e-16 ***
## sqft_above 3.113e+01 4.360e+00 7.139 9.71e-13 ***
## zipcode -5.824e+02 3.299e+01 -17.657 < 2e-16 ***
## lat 6.027e+05 1.073e+04 56.149 < 2e-16 ***
## long -2.147e+05 1.313e+04 -16.349 < 2e-16 ***
## sqft_living15 2.168e+01 3.448e+00 6.289 3.26e-10 ***
## sqft_lot15 -3.826e-01 7.327e-02 -5.222 1.78e-07 ***
## yr_built -2.620e+03 7.266e+01 -36.062 < 2e-16 ***
## yr_renovated 1.981e+01 3.656e+00 5.420 6.03e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 201200 on 21595 degrees of freedom
## Multiple R-squared: 0.6997, Adjusted R-squared: 0.6995
## F-statistic: 2960 on 17 and 21595 DF, p-value: < 2.2e-16
# Fitted coefficients of Model 2 at full printed precision.
coef(regr2)
## (Intercept) bedrooms bathrooms sqft_living sqft_lot
## 6.690325e+06 -3.576654e+04 4.114428e+04 1.501005e+02 1.285979e-01
## floors waterfront grade condition view
## 6.689550e+03 5.829605e+05 9.589045e+04 2.638565e+04 5.287094e+04
## sqft_above zipcode lat long sqft_living15
## 3.112758e+01 -5.824199e+02 6.027482e+05 -2.147298e+05 2.168140e+01
## sqft_lot15 yr_built yr_renovated
## -3.826418e-01 -2.620223e+03 1.981258e+01
# In-sample predictions (fitted values) from Model 2; show the first six.
pr2 <- predict(regr2)
head(pr2, n = 6L)
## 1 2 3 4 5 6
## 208877.9 734051.8 380504.7 455050.9 440955.6 1456649.6
We found empirical support for both Hypothesis-A and Hypothesis-B. In both cases p < 0.05, so we reject the null hypotheses that house price and waterfront are independent and that views and waterfront are independent. The t-tests further show that the average price of houses with a waterfront is higher than the average price of houses without a waterfront (about 1,661,876 versus 531,564), and that the average number of views is higher for houses with a waterfront than for houses without one (about 3.77 versus 0.21). Model-2 fits the available data better than Model-1: both its multiple R-squared (0.6997 versus 0.6446) and its adjusted R-squared (0.6995 versus 0.6445) are higher. Model-2 accounts for 69.97% of the variance in price, and its overall F-test is highly significant (p < 2.2e-16). We also predicted prices with both models, but the predictions differed somewhat from the actual sale prices.
This paper was motivated by the need to understand how different amenities influence house prices. We investigated the prices of houses with varied features and found that, as amenities increased, price also increased. In particular, houses in King County that offered special amenities such as a waterfront were priced higher.
Source of the dataset: Kaggle.com (https://www.kaggle.com/harlfoxem/housesalesprediction)
# Compact overview of the data frame: dimensions, column types, first values.
utils::str(store.df)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
library(psych)
# Min, quartiles, mean and max of sale price.
summary(store.df[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 75000 322000 450000 540100 645000 7700000
# Min, quartiles, mean and max of bedroom count.
summary(store.df[["bedrooms"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 3.000 3.371 4.000 33.000
# Min, quartiles, mean and max of bathroom count.
summary(store.df[["bathrooms"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.750 2.250 2.115 2.500 8.000
# Min, quartiles, mean and max of sqft_living.
summary(store.df[["sqft_living"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 290 1427 1910 2080 2550 13540
# Min, quartiles, mean and max of sqft_lot.
summary(store.df[["sqft_lot"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 520 5040 7618 15110 10690 1651000
# Min, quartiles, mean and max of floors.
summary(store.df[["floors"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.500 1.494 2.000 3.500
# Summary of the 0/1 waterfront flag; the mean is the share of flagged rows.
summary(store.df[["waterfront"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.007542 0.000000 1.000000
# Min, quartiles, mean and max of view.
summary(store.df[["view"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2343 0.0000 4.0000
# Min, quartiles, mean and max of condition.
summary(store.df[["condition"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.000 3.409 4.000 5.000
# Min, quartiles, mean and max of grade.
summary(store.df[["grade"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 7.000 7.000 7.657 8.000 13.000
# Min, quartiles, mean and max of sqft_above.
summary(store.df[["sqft_above"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 290 1190 1560 1788 2210 9410
# Min, quartiles, mean and max of sqft_basement.
summary(store.df[["sqft_basement"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 291.5 560.0 4820.0
# Min, quartiles, mean and max of yr_built.
summary(store.df[["yr_built"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1900 1951 1975 1971 1997 2015
# Min, quartiles, mean and max of yr_renovated (0 values are included).
summary(store.df[["yr_renovated"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 84.4 0.0 2015.0
# Numeric summary of zipcode.
summary(store.df[["zipcode"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 98000 98030 98060 98080 98120 98200
# Min, quartiles, mean and max of lat.
summary(store.df[["lat"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.16 47.47 47.57 47.56 47.68 47.78
# Min, quartiles, mean and max of long.
summary(store.df[["long"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -122.5 -122.3 -122.2 -122.2 -122.1 -121.3
# Min, quartiles, mean and max of sqft_living15.
summary(store.df[["sqft_living15"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 399 1490 1840 1987 2360 6210
# Min, quartiles, mean and max of sqft_lot15.
summary(store.df[["sqft_lot15"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 651 5100 7620 12770 10080 871200
# Frequency table of bedroom counts.
mytable <- table(bedrooms = store.df$bedrooms)
print(mytable)
## bedrooms
## 0 1 2 3 4 5 6 7 8 9 10 11 33
## 13 199 2760 9824 6882 1601 272 38 13 6 3 1 1
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## bedrooms
## 0 1 2 3 4
## 0.060148984 0.920742146 12.770092074 45.454124832 31.841946976
## 5 6 7 8 9
## 7.407578772 1.258501828 0.175820108 0.060148984 0.027761070
## 10 11 33
## 0.013880535 0.004626845 0.004626845
# Frequency table of bathroom counts.
mytable <- table(bathrooms = store.df$bathrooms)
print(mytable)
## bathrooms
## 0 0.5 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75
## 10 4 72 3852 9 1446 3048 1930 2047 5380 1185 753 589 731 155
## 4 4.25 4.5 4.75 5 5.25 5.5 5.75 6 6.25 6.5 6.75 7.5 7.75 8
## 136 79 100 23 21 13 10 4 6 2 2 2 1 1 2
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## bathrooms
## 0 0.5 0.75 1 1.25
## 0.046268450 0.018507380 0.333132837 17.822606764 0.041641605
## 1.5 1.75 2 2.25 2.5
## 6.690417804 14.102623421 8.929810762 9.471151622 24.892425855
## 2.75 3 3.25 3.5 3.75
## 5.482811271 3.484014251 2.725211678 3.382223662 0.717160968
## 4 4.25 4.5 4.75 5
## 0.629250914 0.365520751 0.462684495 0.106417434 0.097163744
## 5.25 5.5 5.75 6 6.25
## 0.060148984 0.046268450 0.018507380 0.027761070 0.009253690
## 6.5 6.75 7.5 7.75 8
## 0.009253690 0.009253690 0.004626845 0.004626845 0.009253690
# Frequency table of floor counts.
mytable <- table(floors = store.df$floors)
print(mytable)
## floors
## 1 1.5 2 2.5 3 3.5
## 10680 1910 8241 161 613 8
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## floors
## 1 1.5 2 2.5 3 3.5
## 49.41470411 8.83727386 38.12982927 0.74492204 2.83625596 0.03701476
# Frequency table of the waterfront flag.
mytable <- table(waterfront = store.df$waterfront)
print(mytable)
## waterfront
## 0 1
## 21450 163
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## waterfront
## 0 1
## 99.2458243 0.7541757
# Frequency table of view scores.
mytable <- table(view = store.df$view)
print(mytable)
## view
## 0 1 2 3 4
## 19489 332 963 510 319
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## view
## 0 1 2 3 4
## 90.172581 1.536113 4.455652 2.359691 1.475964
# Frequency table of condition scores.
mytable <- table(condition = store.df$condition)
print(mytable)
## condition
## 1 2 3 4 5
## 30 172 14031 5679 1701
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## condition
## 1 2 3 4 5
## 0.1388053 0.7958173 64.9192616 26.2758525 7.8702633
# Frequency table of grades.
mytable <- table(grade = store.df$grade)
print(mytable)
## grade
## 1 3 4 5 6 7 8 9 10 11 12 13
## 1 3 29 242 2038 8981 6068 2615 1134 399 90 13
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## grade
## 1 3 4 5 6
## 0.004626845 0.013880535 0.134178504 1.119696479 9.429510017
## 7 8 9 10 11
## 41.553694536 28.075695183 12.099199556 5.246842178 1.846111137
## 12 13
## 0.416416046 0.060148984
# Frequency table of construction years.
mytable <- table(yr_built = store.df$yr_built)
print(mytable)
## yr_built
## 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
## 87 29 27 46 45 74 92 65 86 94 134 73 79 59 54
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929
## 64 79 56 120 88 98 76 95 84 139 165 180 115 126 114
## 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944
## 90 61 38 30 21 24 40 68 52 106 156 161 223 170 140
## 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959
## 95 126 263 235 195 250 229 220 223 305 271 198 198 224 334
## 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974
## 248 224 312 256 172 187 250 350 381 280 132 104 149 149 162
## 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
## 189 253 417 387 343 240 199 105 212 229 228 215 294 270 290
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
## 320 224 198 202 249 169 195 177 239 265 218 305 222 422 433
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## 450 454 417 367 230 143 130 170 201 559 38
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## yr_built
## 1900 1901 1902 1903 1904 1905
## 0.40253551 0.13417850 0.12492481 0.21283487 0.20820802 0.34238653
## 1906 1907 1908 1909 1910 1911
## 0.42566974 0.30074492 0.39790867 0.43492343 0.61999722 0.33775968
## 1912 1913 1914 1915 1916 1917
## 0.36552075 0.27298385 0.24984963 0.29611808 0.36552075 0.25910332
## 1918 1919 1920 1921 1922 1923
## 0.55522139 0.40716236 0.45343081 0.35164022 0.43955027 0.38865498
## 1924 1925 1926 1927 1928 1929
## 0.64313145 0.76342942 0.83283209 0.53208717 0.58298246 0.52746032
## 1930 1931 1932 1933 1934 1935
## 0.41641605 0.28223754 0.17582011 0.13880535 0.09716374 0.11104428
## 1936 1937 1938 1939 1940 1941
## 0.18507380 0.31462546 0.24059594 0.49044557 0.72178781 0.74492204
## 1942 1943 1944 1945 1946 1947
## 1.03178642 0.78656364 0.64775829 0.43955027 0.58298246 1.21686022
## 1948 1949 1950 1951 1952 1953
## 1.08730856 0.90223477 1.15671124 1.05954749 1.01790589 1.03178642
## 1954 1955 1956 1957 1958 1959
## 1.41118771 1.25387498 0.91611530 0.91611530 1.03641327 1.54536621
## 1960 1961 1962 1963 1964 1965
## 1.14745755 1.03641327 1.44357563 1.18447231 0.79581733 0.86522001
## 1966 1967 1968 1969 1970 1971
## 1.15671124 1.61939573 1.76282793 1.29551659 0.61074353 0.48119188
## 1972 1973 1974 1975 1976 1977
## 0.68939990 0.68939990 0.74954888 0.87447370 1.17059177 1.92939435
## 1978 1979 1980 1981 1982 1983
## 1.79058900 1.58700782 1.11044279 0.92074215 0.48581872 0.98089113
## 1984 1985 1986 1987 1988 1989
## 1.05954749 1.05492065 0.99477167 1.36029242 1.24924814 1.34178504
## 1990 1991 1992 1993 1994 1995
## 1.48059039 1.03641327 0.91611530 0.93462268 1.15208439 0.78193680
## 1996 1997 1998 1999 2000 2001
## 0.90223477 0.81895156 1.10581594 1.22611391 1.00865220 1.41118771
## 2002 2003 2004 2005 2006 2007
## 1.02715958 1.95252857 2.00342387 2.08208023 2.10058761 1.92939435
## 2008 2009 2010 2011 2012 2013
## 1.69805210 1.06417434 0.66163883 0.60148984 0.78656364 0.92999584
## 2014 2015
## 2.58640633 0.17582011
# Frequency table of renovation years (0 is tabulated as its own value).
mytable <- table(yr_renovated = store.df$yr_renovated)
print(mytable)
## yr_renovated
## 0 1934 1940 1944 1945 1946 1948 1950 1951 1953 1954 1955
## 20699 1 2 1 3 2 1 2 1 3 1 3
## 1956 1957 1958 1959 1960 1962 1963 1964 1965 1967 1968 1969
## 3 3 5 1 4 2 4 5 5 2 8 4
## 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
## 9 2 4 5 3 6 3 8 6 10 11 5
## 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
## 11 18 18 17 17 18 15 22 25 20 17 19
## 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
## 19 16 15 15 19 17 35 19 22 36 26 35
## 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## 24 35 18 22 18 13 11 37 91 16
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## yr_renovated
## 0 1934 1940 1944 1945
## 95.771063712 0.004626845 0.009253690 0.004626845 0.013880535
## 1946 1948 1950 1951 1953
## 0.009253690 0.004626845 0.009253690 0.004626845 0.013880535
## 1954 1955 1956 1957 1958
## 0.004626845 0.013880535 0.013880535 0.013880535 0.023134225
## 1959 1960 1962 1963 1964
## 0.004626845 0.018507380 0.009253690 0.018507380 0.023134225
## 1965 1967 1968 1969 1970
## 0.023134225 0.009253690 0.037014760 0.018507380 0.041641605
## 1971 1972 1973 1974 1975
## 0.009253690 0.018507380 0.023134225 0.013880535 0.027761070
## 1976 1977 1978 1979 1980
## 0.013880535 0.037014760 0.027761070 0.046268450 0.050895294
## 1981 1982 1983 1984 1985
## 0.023134225 0.050895294 0.083283209 0.083283209 0.078656364
## 1986 1987 1988 1989 1990
## 0.078656364 0.083283209 0.069402674 0.101790589 0.115671124
## 1991 1992 1993 1994 1995
## 0.092536899 0.078656364 0.087910054 0.087910054 0.074029519
## 1996 1997 1998 1999 2000
## 0.069402674 0.069402674 0.087910054 0.078656364 0.161939573
## 2001 2002 2003 2004 2005
## 0.087910054 0.101790589 0.166566418 0.120297969 0.161939573
## 2006 2007 2008 2009 2010
## 0.111044279 0.161939573 0.083283209 0.101790589 0.083283209
## 2011 2012 2013 2014 2015
## 0.060148984 0.050895294 0.171193263 0.421042891 0.074029519
# Frequency table of zip codes.
mytable <- table(zipcode = store.df$zipcode)
print(mytable)
## zipcode
## 98001 98002 98003 98004 98005 98006 98007 98008 98010 98011 98014 98019
## 362 199 280 317 168 498 141 283 100 195 124 190
## 98022 98023 98024 98027 98028 98029 98030 98031 98032 98033 98034 98038
## 234 499 81 412 283 321 256 274 125 432 545 590
## 98039 98040 98042 98045 98052 98053 98055 98056 98058 98059 98065 98070
## 50 282 548 221 574 405 268 406 455 468 310 118
## 98072 98074 98075 98077 98092 98102 98103 98105 98106 98107 98108 98109
## 273 441 359 198 351 105 602 229 335 266 186 109
## 98112 98115 98116 98117 98118 98119 98122 98125 98126 98133 98136 98144
## 269 583 330 553 508 184 290 410 354 494 263 343
## 98146 98148 98155 98166 98168 98177 98178 98188 98198 98199
## 288 57 446 254 269 255 262 136 280 317
# The same counts expressed as percentages of all rows.
100 * prop.table(mytable)
## zipcode
## 98001 98002 98003 98004 98005 98006 98007
## 1.6749179 0.9207421 1.2955166 1.4667099 0.7773100 2.3041688 0.6523851
## 98008 98010 98011 98014 98019 98022 98023
## 1.3093971 0.4626845 0.9022348 0.5737288 0.8791005 1.0826817 2.3087956
## 98024 98027 98028 98029 98030 98031 98032
## 0.3747744 1.9062601 1.3093971 1.4852172 1.1844723 1.2677555 0.5783556
## 98033 98034 98038 98039 98040 98042 98045
## 1.9987970 2.5216305 2.7298385 0.2313422 1.3047703 2.5355110 1.0225327
## 98052 98053 98055 98056 98058 98059 98065
## 2.6558090 1.8738722 1.2399944 1.8784991 2.1052145 2.1653634 1.4343219
## 98070 98072 98074 98075 98077 98092 98102
## 0.5459677 1.2631287 2.0404386 1.6610373 0.9161153 1.6240226 0.4858187
## 98103 98105 98106 98107 98108 98109 98112
## 2.7853607 1.0595475 1.5499931 1.2307408 0.8605932 0.5043261 1.2446213
## 98115 98116 98117 98118 98119 98122 98125
## 2.6974506 1.5268588 2.5586453 2.3504372 0.8513395 1.3417850 1.8970064
## 98126 98133 98136 98144 98146 98148 98155
## 1.6379031 2.2856614 1.2168602 1.5870078 1.3325313 0.2637302 2.0635728
## 98166 98168 98177 98178 98188 98198 98199
## 1.1752186 1.2446213 1.1798455 1.2122334 0.6292509 1.2955166 1.4667099
# Two-way counts: bedrooms (rows) by floors (columns).
mytable <- with(store.df, table(bedrooms, floors))
print(mytable)
## floors
## bedrooms 1 1.5 2 2.5 3 3.5
## 0 4 0 6 0 2 1
## 1 162 21 12 1 3 0
## 2 1951 182 497 5 123 2
## 3 5455 786 3118 56 405 4
## 4 2383 698 3682 58 61 0
## 5 605 185 775 23 13 0
## 6 104 30 119 14 5 0
## 7 9 7 19 2 1 0
## 8 5 1 6 0 0 1
## 9 0 0 4 2 0 0
## 10 1 0 2 0 0 0
## 11 0 0 1 0 0 0
## 33 1 0 0 0 0 0
# Two-way counts: bedrooms (rows) by view (columns).
mytable <- with(store.df, table(bedrooms, view))
print(mytable)
## view
## bedrooms 0 1 2 3 4
## 0 11 0 2 0 0
## 1 181 2 7 3 6
## 2 2548 42 97 35 38
## 3 9038 120 378 183 105
## 4 6111 121 332 212 106
## 5 1331 41 120 61 48
## 6 218 3 22 15 14
## 7 29 3 4 0 2
## 8 12 0 0 1 0
## 9 6 0 0 0 0
## 10 2 0 1 0 0
## 11 1 0 0 0 0
## 33 1 0 0 0 0
# Two-way counts: bedrooms (rows) by condition (columns).
mytable <- with(store.df, table(bedrooms, condition))
print(mytable)
## condition
## bedrooms 1 2 3 4 5
## 0 1 1 10 1 0
## 1 4 11 124 48 12
## 2 12 51 1779 718 200
## 3 8 69 6308 2711 728
## 4 4 36 4580 1682 580
## 5 0 1 1031 418 151
## 6 1 3 158 87 23
## 7 0 0 25 9 4
## 8 0 0 8 3 2
## 9 0 0 6 0 0
## 10 0 0 1 2 0
## 11 0 0 1 0 0
## 33 0 0 0 0 1
# Two-way counts: bedrooms (rows) by grade (columns).
mytable <- with(store.df, table(bedrooms, grade))
print(mytable)
## grade
## bedrooms 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 0 2 0 0 6 3 0 0 0 1 0
## 1 0 3 12 37 78 52 14 2 1 0 0 0
## 2 0 0 14 114 824 1205 499 78 21 3 2 0
## 3 0 0 1 62 854 4917 2796 832 296 56 9 1
## 4 0 0 0 21 233 2177 2194 1351 615 239 49 3
## 5 0 0 0 5 41 501 455 313 173 83 24 6
## 6 0 0 0 3 7 98 90 33 22 13 3 3
## 7 0 0 0 0 1 11 12 4 5 4 1 0
## 8 0 0 0 0 0 6 4 1 1 0 1 0
## 9 0 0 0 0 0 4 1 0 0 1 0 0
## 10 0 0 0 0 0 2 0 1 0 0 0 0
## 11 0 0 0 0 0 1 0 0 0 0 0 0
## 33 0 0 0 0 0 1 0 0 0 0 0 0
# Two-way counts: bathrooms (rows) by floors (columns).
mytable <- with(store.df, table(bathrooms, floors))
print(mytable)
## floors
## bathrooms 1 1.5 2 2.5 3 3.5
## 0 5 0 3 0 1 1
## 0.5 3 0 1 0 0 0
## 0.75 64 5 3 0 0 0
## 1 3115 624 106 3 4 0
## 1.25 2 2 3 0 2 0
## 1.5 904 205 268 5 64 0
## 1.75 2494 282 244 8 20 0
## 2 1283 282 304 12 49 0
## 2.25 883 95 940 15 114 0
## 2.5 1013 155 3966 39 204 3
## 2.75 500 120 542 7 15 1
## 3 228 76 391 23 33 2
## 3.25 72 25 430 12 50 0
## 3.5 58 17 614 13 29 0
## 3.75 18 11 113 7 6 0
## 4 18 4 99 5 9 1
## 4.25 3 2 67 4 3 0
## 4.5 5 3 81 3 8 0
## 4.75 3 2 14 3 1 0
## 5 4 0 16 1 0 0
## 5.25 1 0 12 0 0 0
## 5.5 3 0 7 0 0 0
## 5.75 0 0 4 0 0 0
## 6 0 0 6 0 0 0
## 6.25 0 0 2 0 0 0
## 6.5 0 0 2 0 0 0
## 6.75 1 0 1 0 0 0
## 7.5 0 0 1 0 0 0
## 7.75 0 0 1 0 0 0
## 8 0 0 0 1 1 0
# Two-way counts: bathrooms (rows) by condition (columns).
mytable <- with(store.df, table(bathrooms, condition))
print(mytable)
## condition
## bathrooms 1 2 3 4 5
## 0 1 1 6 2 0
## 0.5 0 0 3 1 0
## 0.75 1 5 34 23 9
## 1 19 86 2172 1273 302
## 1.25 0 0 5 4 0
## 1.5 5 12 825 497 107
## 1.75 1 25 1492 1193 337
## 2 2 21 1052 568 287
## 2.25 1 4 1258 660 124
## 2.5 0 13 4373 784 210
## 2.75 0 3 719 322 141
## 3 0 2 535 141 75
## 3.25 0 0 459 90 40
## 3.5 0 0 655 42 34
## 3.75 0 0 122 24 9
## 4 0 0 107 17 12
## 4.25 0 0 58 16 5
## 4.5 0 0 84 10 6
## 4.75 0 0 15 6 2
## 5 0 0 19 2 0
## 5.25 0 0 12 0 1
## 5.5 0 0 8 2 0
## 5.75 0 0 4 0 0
## 6 0 0 5 1 0
## 6.25 0 0 2 0 0
## 6.5 0 0 2 0 0
## 6.75 0 0 2 0 0
## 7.5 0 0 1 0 0
## 7.75 0 0 1 0 0
## 8 0 0 1 1 0
# Two-way counts: bathrooms (rows) by grade (columns).
mytable <- with(store.df, table(bathrooms, grade))
print(mytable)
## grade
## bathrooms 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 2 0 0 0 4 2 0 0 0 1 0
## 0.5 0 0 0 1 2 0 1 0 0 0 0 0
## 0.75 0 1 14 14 26 17 0 0 0 0 0 0
## 1 0 0 14 190 1414 2084 143 7 0 0 0 0
## 1.25 0 0 0 0 1 3 2 2 0 1 0 0
## 1.5 0 0 0 9 137 984 288 23 4 1 0 0
## 1.75 0 0 1 9 225 1899 808 92 12 2 0 0
## 2 0 0 0 17 184 1165 458 93 13 0 0 0
## 2.25 0 0 0 0 4 778 956 237 66 6 0 0
## 2.5 0 0 0 1 24 1390 2278 1226 385 68 7 1
## 2.75 0 0 0 1 10 322 448 286 100 16 2 0
## 3 0 0 0 0 8 216 269 161 79 18 2 0
## 3.25 0 0 0 0 0 37 165 182 130 61 13 1
## 3.5 0 0 0 0 2 40 181 223 181 84 20 0
## 3.75 0 0 0 0 1 12 24 34 50 29 4 1
## 4 0 0 0 0 0 12 18 22 43 30 10 1
## 4.25 0 0 0 0 0 3 7 9 29 24 6 1
## 4.5 0 0 0 0 0 8 14 15 30 28 5 0
## 4.75 0 0 0 0 0 1 2 1 5 11 2 1
## 5 0 0 0 0 0 3 2 0 5 7 3 1
## 5.25 0 0 0 0 0 2 0 2 1 5 3 0
## 5.5 0 0 0 0 0 0 0 0 1 5 2 2
## 5.75 0 0 0 0 0 0 1 0 0 0 2 1
## 6 0 0 0 0 0 0 1 0 0 0 5 0
## 6.25 0 0 0 0 0 0 0 0 0 1 0 1
## 6.5 0 0 0 0 0 0 0 0 0 1 1 0
## 6.75 0 0 0 0 0 0 0 0 0 1 1 0
## 7.5 0 0 0 0 0 1 0 0 0 0 0 0
## 7.75 0 0 0 0 0 0 0 0 0 0 0 1
## 8 0 0 0 0 0 0 0 0 0 0 1 1
# Two-way counts: floors (rows) by waterfront flag (columns).
mytable <- with(store.df, table(floors, waterfront))
print(mytable)
## waterfront
## floors 0 1
## 1 10623 57
## 1.5 1889 21
## 2 8166 75
## 2.5 159 2
## 3 605 8
## 3.5 8 0
# Two-way counts: floors (rows) by view (columns).
mytable <- with(store.df, table(floors, view))
print(mytable)
## view
## floors 0 1 2 3 4
## 1 9684 192 454 219 131
## 1.5 1713 27 90 50 30
## 2 7407 108 376 215 135
## 2.5 130 0 17 8 6
## 3 549 5 24 18 17
## 3.5 6 0 2 0 0
# Two-way counts: floors (rows) by condition (columns).
mytable <- with(store.df, table(floors, condition))
print(mytable)
## condition
## floors 1 2 3 4 5
## 1 23 137 5543 3888 1089
## 1.5 5 16 878 679 332
## 2 2 19 6904 1061 255
## 2.5 0 0 103 39 19
## 3 0 0 596 12 5
## 3.5 0 0 7 0 1
# Two-way counts: floors (rows) by grade (columns).
mytable <- with(store.df, table(floors, grade))
print(mytable)
## grade
## floors 1 3 4 5 6 7 8 9 10 11 12 13
## 1 1 3 27 202 1662 5916 2233 447 142 34 11 2
## 1.5 0 0 2 38 311 1006 402 105 35 11 0 0
## 2 0 0 0 2 63 1943 2989 1935 906 323 72 8
## 2.5 0 0 0 0 2 15 53 46 26 14 2 3
## 3 0 0 0 0 0 100 385 82 25 17 4 0
## 3.5 0 0 0 0 0 1 6 0 0 0 1 0
# Two-way counts: waterfront flag (rows) by view (columns).
mytable <- with(store.df, table(waterfront, view))
print(mytable)
## view
## waterfront 0 1 2 3 4
## 0 19489 331 955 491 184
## 1 0 1 8 19 135
# Two-way counts: waterfront flag (rows) by condition (columns).
mytable <- with(store.df, table(waterfront, condition))
print(mytable)
## condition
## waterfront 1 2 3 4 5
## 0 29 171 13940 5629 1681
## 1 1 1 91 50 20
# Two-way counts: waterfront flag (rows) by grade (columns).
mytable <- with(store.df, table(waterfront, grade))
print(mytable)
## grade
## waterfront 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 3 29 238 2026 8958 6028 2590 1106 379 79 13
## 1 0 0 0 4 12 23 40 25 28 20 11 0
# Two-way counts: view (rows) by condition (columns).
mytable <- with(store.df, table(view, condition))
print(mytable)
## condition
## view 1 2 3 4 5
## 0 27 166 12768 5054 1474
## 1 1 1 191 105 34
## 2 0 3 588 268 104
## 3 0 2 309 153 46
## 4 2 0 175 99 43
# Two-way counts: view (rows) by grade (columns).
mytable <- with(store.df, table(view, grade))
print(mytable)
## grade
## view 1 3 4 5 6 7 8 9 10 11 12 13
## 0 1 3 26 229 1955 8556 5412 2160 857 242 43 5
## 1 0 0 2 2 15 94 110 63 26 18 2 0
## 2 0 0 1 6 42 222 326 189 102 59 13 3
## 3 0 0 0 1 14 78 146 130 84 45 10 2
## 4 0 0 0 4 12 31 74 73 65 35 22 3
# Two-way counts: condition (rows) by grade (columns).
mytable <- with(store.df, table(condition, grade))
print(mytable)
## grade
## condition 1 3 4 5 6 7 8 9 10 11 12 13
## 1 1 0 1 9 11 6 2 0 0 0 0 0
## 2 0 1 5 15 59 75 13 2 2 0 0 0
## 3 0 1 13 100 1035 5234 4269 2041 921 332 74 11
## 4 0 0 10 84 685 2833 1394 446 156 56 13 2
## 5 0 1 0 34 248 833 390 126 55 11 3 0
# Horizontal yellow boxplots of the main numeric variables, one plot each.
# Each spec is c(column name, x-axis label, plot title), in drawing order.
boxplot_specs <- list(
  c("price", "Price", "Boxplot for Price"),
  c("bedrooms", "Bedrooms", "Boxplot for Bedrooms/House"),
  c("bathrooms", "Bathrooms", "Boxplot for Bathrooms/House"),
  c("grade", "Grade", "Boxplot for Grade"),
  c("condition", "Condition", "Boxplot for Condition of the House"),
  c("sqft_living", "Square Feet-Living", "Boxplot for Square Feet-Living"),
  c("sqft_lot", "Square Feet-Lot", "Boxplot for Square Feet-Lot"),
  c("yr_built", "Year-Built", "Boxplot for Year-Built"),
  c("yr_renovated", "Year-Renovated", "Boxplot for Year-Renovated")
)
for (spec in boxplot_specs) {
  boxplot(store.df[[spec[1]]], xlab = spec[2], main = spec[3],
          horizontal = TRUE, col = "yellow")
}
# Histograms of price, living area, lot area, grade and year built.
# The plot titles are user-facing text: their spelling is corrected from
# "Disrtibution" to "Distribution". Data, colours and axis limits are unchanged.
hist(store.df$price, xlab = "Price", ylab = "Count",
     main = "Distribution of Price", ylim = c(0, 14000), col = "yellow")
hist(store.df$sqft_living, xlab = "Square Feet-Living", ylab = "Count",
     main = "Distribution of Square Feet-Living", ylim = c(0, 12000),
     col = "yellow")
hist(store.df$sqft_lot, xlab = "Square Feet-Lot", ylab = "Count",
     main = "Distribution of Square Feet-Lot", col = "yellow")
hist(store.df$grade, xlab = "Grade", ylab = "Count",
     main = "Distribution of Grade", col = "yellow", ylim = c(0, 10000))
hist(store.df$yr_built, xlab = "Year-Built", ylab = "Count",
     main = "Distribution of Year-Built", col = "yellow",
     xlim = c(1900, 2016))
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Median sale price per bedroom count, drawn as a red bar chart.
agg1 <- setNames(
  aggregate(store.df$price, by = list(store.df$bedrooms), FUN = median),
  c("Bedrooms", "Median_Price")
)
p <- ggplot(data = agg1, mapping = aes(x = Bedrooms, y = Median_Price))
p + geom_bar(stat = "identity", colour = "white", fill = "red")
library(ggplot2)
# Median sale price per bathroom count, drawn as a purple bar chart.
agg1 <- setNames(
  aggregate(store.df$price, by = list(store.df$bathrooms), FUN = median),
  c("Bathrooms", "Median_Price")
)
p <- ggplot(data = agg1, mapping = aes(x = Bathrooms, y = Median_Price))
p + geom_bar(stat = "identity", colour = "white", fill = "purple")
library(ggplot2)
# Median sale price per grade, drawn as an orange bar chart.
agg1 <- setNames(
  aggregate(store.df$price, by = list(store.df$grade), FUN = median),
  c("Grade", "Median_Price")
)
p <- ggplot(data = agg1, mapping = aes(x = Grade, y = Median_Price))
p + geom_bar(stat = "identity", colour = "white", fill = "orange")
library(ggplot2)
# Median sale price for each value of the waterfront flag, as a yellow bar chart.
agg1 <- setNames(
  aggregate(store.df$price, by = list(store.df$waterfront), FUN = median),
  c("Waterfront", "Median_Price")
)
p <- ggplot(data = agg1, mapping = aes(x = Waterfront, y = Median_Price))
p + geom_bar(stat = "identity", colour = "white", fill = "yellow")
library(ggplot2)
# Median sale price per construction year, as a bar chart with default colours.
agg1 <- setNames(
  aggregate(store.df$price, by = list(store.df$yr_built), FUN = median),
  c("Year_Built", "Median_Price")
)
p <- ggplot(data = agg1, mapping = aes(x = Year_Built, y = Median_Price))
p + geom_bar(stat = "identity")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# car::scatterplot of price against living area, then against lot area.
scatterplot(
  x = store.df$sqft_living,
  y = store.df$price,
  xlab = "Square Feet-Living",
  ylab = "Price",
  main = "Square Feet-Living vs Price"
)
scatterplot(
  x = store.df$sqft_lot,
  y = store.df$price,
  xlab = "Square Feet-Lot",
  ylab = "Price",
  main = "Square Feet-Lot vs Price"
)
# Keep columns 3 to 21 (price through sqft_lot15), leaving out id and date.
store.sub <- store.df[, 3:21]
library(corrgram)
# Correlogram of store.sub: pies in the upper triangle, shading in the lower,
# variables reordered by corrgram (order = TRUE).
corrgram(
  store.sub,
  order = TRUE,
  main = "Corrgram",
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt
)
library(car)
# Scatterplot matrix of price, bedrooms and living area, with histograms on
# the diagonal.
scatterplotMatrix(
  formula = ~ price + bedrooms + sqft_living,
  diagonal = "histogram",
  data = store.sub,
  main = "Scatter Plot Matrix"
)
library(car)
# Scatterplot matrix of price, waterfront, grade and view, with histograms on
# the diagonal.
# NOTE(review): the "could not fit smooth" warnings presumably come from the
# few distinct values of waterfront and view -- confirm.
scatterplotMatrix(
  formula = ~ price + waterfront + grade + view,
  diagonal = "histogram",
  data = store.sub,
  main = "Scatter Plot Matrix"
)
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth