Load Package and Data
#load package
library(gridExtra)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gridExtra)
#load data
my_data <- read.csv("qp1_data.csv")
head(my_data)
## price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 1 221900 3 1.00 1180 5650 1 0 0
## 2 538000 3 2.25 2570 7242 2 0 0
## 3 180000 2 1.00 770 10000 1 0 0
## 4 604000 4 3.00 1960 5000 1 0 0
## 5 510000 3 2.00 1680 8080 1 0 0
## 6 1230000 4 4.50 5420 101930 1 0 0
## grade condition yr_built yr_renovated zip_code
## 1 7 3 1955 0 98178
## 2 7 3 1951 1991 98125
## 3 6 3 1933 0 98028
## 4 7 5 1965 0 98136
## 5 8 3 1987 0 98074
## 6 11 3 2001 0 98053
fig_s<- "fig.width = 6, fig.height = 3"
#add transit id
tran_id <- c(1:nrow(my_data))
my_data <- add_column(my_data, tran_id, .before = "price")
head(my_data)
## tran_id price bedrooms bathrooms sqft_living sqft_lot floors
## 1 1 221900 3 1.00 1180 5650 1
## 2 2 538000 3 2.25 2570 7242 2
## 3 3 180000 2 1.00 770 10000 1
## 4 4 604000 4 3.00 1960 5000 1
## 5 5 510000 3 2.00 1680 8080 1
## 6 6 1230000 4 4.50 5420 101930 1
## waterfront view grade condition yr_built yr_renovated zip_code
## 1 0 0 7 3 1955 0 98178
## 2 0 0 7 3 1951 1991 98125
## 3 0 0 6 3 1933 0 98028
## 4 0 0 7 5 1965 0 98136
## 5 0 0 8 3 1987 0 98074
## 6 0 0 11 3 2001 0 98053
Base EDA
Uni-variate non-graphical EDA
str(my_data)
## 'data.frame': 21613 obs. of 14 variables:
## $ tran_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 221900 538000 180000 604000 510000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated: int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zip_code : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
summary(my_data)
## tran_id price bedrooms bathrooms
## Min. : 1 Min. : 75000 Min. : 0.000 Min. :0.000
## 1st Qu.: 5404 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750
## Median :10807 Median : 450000 Median : 3.000 Median :2.250
## Mean :10807 Mean : 540182 Mean : 3.371 Mean :2.115
## 3rd Qu.:16210 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500
## Max. :21613 Max. :7700000 Max. :33.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 290 Min. : 520 Min. :1.000 Min. :0.000000
## 1st Qu.: 1427 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1910 Median : 7618 Median :1.500 Median :0.000000
## Mean : 2080 Mean : 15107 Mean :1.494 Mean :0.007542
## 3rd Qu.: 2550 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1651359 Max. :3.500 Max. :1.000000
## view grade condition yr_built
## Min. :0.0000 Min. : 1.000 Min. :1.000 Min. :1900
## 1st Qu.:0.0000 1st Qu.: 7.000 1st Qu.:3.000 1st Qu.:1951
## Median :0.0000 Median : 7.000 Median :3.000 Median :1975
## Mean :0.2343 Mean : 7.657 Mean :3.409 Mean :1971
## 3rd Qu.:0.0000 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.:1997
## Max. :4.0000 Max. :13.000 Max. :5.000 Max. :2015
## yr_renovated zip_code
## Min. : 0.0 Min. :98001
## 1st Qu.: 0.0 1st Qu.:98033
## Median : 0.0 Median :98065
## Mean : 84.4 Mean :98078
## 3rd Qu.: 0.0 3rd Qu.:98118
## Max. :2015.0 Max. :98199
summary(my_data[,2:ncol(my_data)])
## price bedrooms bathrooms sqft_living
## Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
## 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
## Median : 450000 Median : 3.000 Median :2.250 Median : 1910
## Mean : 540182 Mean : 3.371 Mean :2.115 Mean : 2080
## 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
## Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 520 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7618 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 15107 Mean :1.494 Mean :0.007542 Mean :0.2343
## 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1651359 Max. :3.500 Max. :1.000000 Max. :4.0000
## grade condition yr_built yr_renovated
## Min. : 1.000 Min. :1.000 Min. :1900 Min. : 0.0
## 1st Qu.: 7.000 1st Qu.:3.000 1st Qu.:1951 1st Qu.: 0.0
## Median : 7.000 Median :3.000 Median :1975 Median : 0.0
## Mean : 7.657 Mean :3.409 Mean :1971 Mean : 84.4
## 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.:1997 3rd Qu.: 0.0
## Max. :13.000 Max. :5.000 Max. :2015 Max. :2015.0
## zip_code
## Min. :98001
## 1st Qu.:98033
## Median :98065
## Mean :98078
## 3rd Qu.:98118
## Max. :98199
[Number of Bedrooms]
#bedroom number count
table(my_data$bedrooms)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 33
## 13 199 2760 9824 6882 1601 272 38 13 6 3 1 1
Note: Since bedroom number is generally the first buyers’ priority, we concluded that the highest demand goes to the house with three bedrooms type.
Question: with no bedroom? studio? a house with 33 bedrooms? may want to deal with it as an outlier.
#pull out the 33 rooms entry
my_data%>%filter(my_data$bedrooms==33)
## tran_id price bedrooms bathrooms sqft_living sqft_lot floors waterfront
## 1 15871 640000 33 1.75 1620 6000 1 0
## view grade condition yr_built yr_renovated zip_code
## 1 0 7 5 1947 0 98103
Note: It might be a typo since there are only 1620 sq ft for living and one bathroom if I have the authority I will remove this data.
[Number of floors]
#floor numer count
table(my_data$floor)
##
## 1 1.5 2 2.5 3 3.5
## 10680 1910 8241 161 613 8
[Transition in different zip code]
table(my_data$zip_code)
##
## 98001 98002 98003 98004 98005 98006 98007 98008 98010 98011 98014 98019
## 362 199 280 317 168 498 141 283 100 195 124 190
## 98022 98023 98024 98027 98028 98029 98030 98031 98032 98033 98034 98038
## 234 499 81 412 283 321 256 274 125 432 545 590
## 98039 98040 98042 98045 98052 98053 98055 98056 98058 98059 98065 98070
## 50 282 548 221 574 405 268 406 455 468 310 118
## 98072 98074 98075 98077 98092 98102 98103 98105 98106 98107 98108 98109
## 273 441 359 198 351 105 602 229 335 266 186 109
## 98112 98115 98116 98117 98118 98119 98122 98125 98126 98133 98136 98144
## 269 583 330 553 508 184 290 410 354 494 263 343
## 98146 98148 98155 98166 98168 98177 98178 98188 98198 98199
## 288 57 446 254 269 255 262 136 280 317
Uni-variate graphical EDA
[Price,sqft_live,sqft_lot Distribution]
grid.arrange(
ggplot(my_data, aes(x= price)) + geom_histogram(),
ggplot(my_data,aes(x=sqft_living))+geom_histogram(),
ggplot(my_data,aes(x=sqft_lot))+geom_histogram(),
ncol=1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Note: Price, sqft_living, sqft_lot distribution are all skewed. It’s better use median instead of mean later.
[floors, bedrooms, bathrooms Distribution]
grid.arrange(
ggplot(my_data, aes(x= floors)) + geom_histogram(stat="count"),
ggplot(my_data, aes(x=bedrooms)) + geom_histogram(stat="count"),
ggplot(my_data, aes(x=bathrooms)) + geom_histogram(stat="count"),
ncol=1
)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Finding: Most of houses have 1 to 2 floors. Most of houses have 3 to 4 rooms. Most of houses have 1 or 2.5 bathrooms.
[House Location Distribution]
ggplot(my_data,aes(x=zip_code))+geom_bar()

Note: House distribution in each zone is not even.
[Bedroom Number Distribution]
#just for checking the outlier house
ggplot(my_data,aes(x=bedrooms))+geom_bar()+coord_cartesian(xlim=c(0,11))

Note: The market is looking for a house with three or four bedrooms.
[Year of Built Distribution]
yr_built_pic <- ggplot(my_data,aes(x=yr_built))+geom_bar(stat = "count") +
theme( axis.text.x = element_text(size=10))
ggsave(filename = "yr_built.png",width = 8, height = 5, plot = yr_built_pic)
Note: House built from 2014 has highest number among King County area.
multi-variate non-graphical EDA
[Price & zip_code]
zip_price <- my_data %>%
group_by(zip_code) %>%
summarize(median(price, na.rm = TRUE))
colnames(zip_price)[2] <- "h_price"
zip_price
## # A tibble: 70 x 2
## zip_code h_price
## <int> <dbl>
## 1 98001 260000
## 2 98002 235000
## 3 98003 267475
## 4 98004 1150000
## 5 98005 765475
## 6 98006 760184.
## 7 98007 555000
## 8 98008 545000
## 9 98010 360000.
## 10 98011 470000
## # … with 60 more rows
Note:House located in 98004/98039/98112 are pretty expensive, which can due to specific attribution with that area, a graph may demonstrate better.
[Bedroom & Bathroom]
table(my_data$bedrooms,my_data$bathroom)
##
## 0 0.5 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5
## 0 7 0 1 1 0 1 0 0 0 3 0 0 0 0
## 1 3 1 27 138 2 12 4 6 4 2 0 0 0 0
## 2 0 2 26 1558 3 294 304 216 118 197 20 13 8 1
## 3 0 0 16 1780 4 829 1870 1048 1082 2357 275 197 184 143
## 4 0 1 2 325 0 254 719 525 709 2502 639 326 254 395
## 5 0 0 0 43 0 48 134 110 116 287 214 163 129 169
## 6 0 0 0 6 0 6 16 24 15 29 31 45 12 17
## 7 0 0 0 1 0 2 0 0 3 2 3 3 1 5
## 8 0 0 0 0 0 0 0 0 0 1 3 2 1 1
## 9 0 0 0 0 0 0 0 0 0 0 0 2 0 0
## 10 0 0 0 0 0 0 0 1 0 0 0 1 0 0
## 11 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 33 0 0 0 0 0 0 1 0 0 0 0 0 0 0
##
## 3.75 4 4.25 4.5 4.75 5 5.25 5.5 5.75 6 6.25 6.5 6.75 7.5
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 17 11 6 5 0 0 0 0 0 0 0 0 0 0
## 4 78 58 38 32 7 7 5 5 1 0 0 0 0 0
## 5 44 48 25 35 11 7 4 4 2 4 2 1 1 0
## 6 13 11 8 23 3 6 3 0 0 1 0 1 0 0
## 7 2 5 2 3 2 0 0 1 1 0 0 0 1 0
## 8 1 2 0 0 0 1 0 0 0 1 0 0 0 0
## 9 0 1 0 2 0 0 0 0 0 0 0 0 0 1
## 10 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 33 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 7.75 8
## 0 0 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 1 1
## 7 0 1
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 33 0 0
Note: more bedrooms with more bathrooms normally
*The 33 rooms house only has 1.75 bathroom
[Floor & View]
table(my_data$floor,my_data$view)
##
## 0 1 2 3 4
## 1 9684 192 454 219 131
## 1.5 1713 27 90 50 30
## 2 7407 108 376 215 135
## 2.5 130 0 17 8 6
## 3 549 5 24 18 17
## 3.5 6 0 2 0 0
Note: buildings with more floors are more likely with a better view
Multi-variate graphical EDA
Which attribution affect the price most
#Clean data for ploting
plot_data <- data.frame(my_data)
plot_data$floors <- as.factor(plot_data$floors)
plot_data$bedrooms <- as.factor(plot_data$bedrooms)
plot_data$bathrooms <- as.factor(plot_data$bathrooms)
plot_data$view <- as.factor(plot_data$view)
plot_data$waterfront <- as.factor(plot_data$waterfront)
plot_data$grade <- as.factor(plot_data$grade)
plot_data$condition <- as.factor(plot_data$condition)
summary(plot_data)
## tran_id price bedrooms bathrooms
## Min. : 1 Min. : 75000 3 :9824 2.5 :5380
## 1st Qu.: 5404 1st Qu.: 321950 4 :6882 1 :3852
## Median :10807 Median : 450000 2 :2760 1.75 :3048
## Mean :10807 Mean : 540182 5 :1601 2.25 :2047
## 3rd Qu.:16210 3rd Qu.: 645000 6 : 272 2 :1930
## Max. :21613 Max. :7700000 1 : 199 1.5 :1446
## (Other): 75 (Other):3910
## sqft_living sqft_lot floors waterfront view
## Min. : 290 Min. : 520 1 :10680 0:21450 0:19489
## 1st Qu.: 1427 1st Qu.: 5040 1.5: 1910 1: 163 1: 332
## Median : 1910 Median : 7618 2 : 8241 2: 963
## Mean : 2080 Mean : 15107 2.5: 161 3: 510
## 3rd Qu.: 2550 3rd Qu.: 10688 3 : 613 4: 319
## Max. :13540 Max. :1651359 3.5: 8
##
## grade condition yr_built yr_renovated zip_code
## 7 :8981 1: 30 Min. :1900 Min. : 0.0 Min. :98001
## 8 :6068 2: 172 1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033
## 9 :2615 3:14031 Median :1975 Median : 0.0 Median :98065
## 6 :2038 4: 5679 Mean :1971 Mean : 84.4 Mean :98078
## 10 :1134 5: 1701 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118
## 11 : 399 Max. :2015 Max. :2015.0 Max. :98199
## (Other): 378
grid.arrange(
ggplot(plot_data,aes(x=plot_data$bedrooms,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$bathrooms,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$sqft_living,y=price))+geom_point()+stat_smooth(method = "lm"),
ggplot(plot_data,aes(x=plot_data$sqft_lot,y=price))+geom_point()+stat_smooth(method = "lm"),
ggplot(plot_data,aes(x=plot_data$floors,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$waterfront,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$view,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$grade,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$condition,y=price))+geom_boxplot(),
ncol=2
)

Findings: The relation between each factor and price are a bellow : - A number of bedrooms increases, price increases, but the effect goes less significant when it’s more than 5 bedrooms. - Number of bathroom increases, price increases, - Both spaces of living and lot have a positive relationship with price, but living space has more power than a lot space. - Building with more floors at a higher price. However, the price reaches its peak at 2.5 floors and then drop. - The view doesn’t really increase the value of the house. However, the waterfront view significantly did. - The initial quality of the building is decisive to price, but the maintenance no so much.
[Price and Zip_code]
#group the zip_code
zip_price <- my_data %>%
group_by(zip_code) %>%
summarize(median(price, na.rm = TRUE))
colnames(zip_price)[2] <- "h_price"
zip_price
## # A tibble: 70 x 2
## zip_code h_price
## <int> <dbl>
## 1 98001 260000
## 2 98002 235000
## 3 98003 267475
## 4 98004 1150000
## 5 98005 765475
## 6 98006 760184.
## 7 98007 555000
## 8 98008 545000
## 9 98010 360000.
## 10 98011 470000
## # … with 60 more rows
#plotting
ggplot(zip_price,aes( x=reorder(zip_code,h_price),y=h_price))+geom_bar(stat = "identity")+coord_flip()+ geom_hline(yintercept=median(my_data$price), color= "red")

Question : why the house located in 98039 98004 98040 98112 are more expensive than others?
[Prcie & space of living ]
new_d <- my_data%>%filter(bedrooms<=8)
new_d$bedrooms <- as.factor(new_d$bedrooms)
ggplot(new_d,aes(x=sqft_living , y= price,color=bedrooms))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)

Findings : - More sqft_living increase the price, this phenomenon will be more significant if there are more rooms in the house - But the effect of sqft to price drops when there 7 rooms in the house, may due to the over-divided interior design - The effect bounces back when house has 8 rooms, but there are only 18 observations of this group. I don’t really trust this
#pull out all house with 8 room to check if there are other factor that may affect the price
summary(my_data %>% filter(bedrooms==8))
## tran_id price bedrooms bathrooms
## Min. : 1661 Min. : 340000 Min. :8 Min. :2.500
## 1st Qu.: 6175 1st Qu.: 490000 1st Qu.:8 1st Qu.:2.750
## Median :10959 Median : 700000 Median :8 Median :3.250
## Mean :11083 Mean :1105077 Mean :8 Mean :3.558
## 3rd Qu.:15671 3rd Qu.:1650000 3rd Qu.:8 3rd Qu.:4.000
## Max. :19303 Max. :3300000 Max. :8 Max. :6.000
## sqft_living sqft_lot floors waterfront
## Min. :2280 Min. : 2580 Min. :1.000 Min. :0
## 1st Qu.:2800 1st Qu.: 4800 1st Qu.:1.000 1st Qu.:0
## Median :3840 Median : 7500 Median :2.000 Median :0
## Mean :3800 Mean : 8972 Mean :1.692 Mean :0
## 3rd Qu.:4300 3rd Qu.:11750 3rd Qu.:2.000 3rd Qu.:0
## Max. :7710 Max. :20666 Max. :3.500 Max. :0
## view grade condition yr_built
## Min. :0.0000 Min. : 7.000 Min. :3.000 Min. :1901
## 1st Qu.:0.0000 1st Qu.: 7.000 1st Qu.:3.000 1st Qu.:1959
## Median :0.0000 Median : 8.000 Median :3.000 Median :1962
## Mean :0.2308 Mean : 8.077 Mean :3.538 Mean :1956
## 3rd Qu.:0.0000 3rd Qu.: 8.000 3rd Qu.:4.000 3rd Qu.:1977
## Max. :3.0000 Max. :12.000 Max. :5.000 Max. :1997
## yr_renovated zip_code
## Min. :0 Min. :98004
## 1st Qu.:0 1st Qu.:98055
## Median :0 Median :98105
## Mean :0 Mean :98082
## 3rd Qu.:0 3rd Qu.:98112
## Max. :0 Max. :98133
Finding : The house with 8 rooms is more expenxive even the living space doesn’t increase may because this group is comparatively higher in grade and condition. However, interestingly, they barely come with a view or waterfront view, which intuitionally will cause the price to decrease.
[Price and year built]
library(tidyverse)
plot_data %>% group_by(yr_built) %>% summarize(med_pri = median(price)) %>% ggplot(mapping = aes(x = yr_built, y = med_pri)) + geom_line()

Finding: The relation comes in a U shape, maybe people like the old house for its vintage style and the new one for its condition.
[Price Trend & Floors] !! Premise !!! The transaction record is chronological
ggplot(my_data[1:5000,],aes(x=tran_id, y=price))+geom_line()+facet_grid(floors~.)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

Finding: The price change pattern is not the same in different bedrooms group.
Findings (Executive Advice )
Main sales force city
Tartget: Find out which city and what kind of house occupied the most sale
# group by zip_code, but take sum of the price for sales force analysis
zip_price_sum <- my_data %>%
group_by(zip_code) %>%
summarize(sum(price, na.rm = TRUE))
colnames(zip_price_sum)[2] <- "h_price"
zip_price_sum <- zip_price_sum%>%arrange(-h_price)
# Let's look at the zip_code price sum
ggplot(zip_price_sum,aes( x=reorder(zip_code,h_price),y=h_price))+geom_bar(stat = "identity")+coord_flip()

# try to find which city sales the most. Intuitionally, Seattle should occupy the most sales.
# 98102-98199 - Seattle
Seattle_sales<- my_data %>% filter(zip_code%in% c(98102,98199))%>%summarise(sum(price))
Seattle_market_share <- as.double(Seattle_sales / sum(my_data$price))
#We will also check some city from the top
#98004-98009 bellevue
bellevue_sales<- my_data %>% filter(zip_code%in%c(98004,98009))%>%summarise(sum(price))
bellevue_market_share <- as.double(bellevue_sales / sum(my_data$price))
#98040 Mercer island
mercer_sales<- my_data %>% filter(zip_code==98040)%>%summarise(sum(price))
mercer_market_share <- as.double(mercer_sales/sum(my_data$price))
#98052 53 Redmond
Redmond_sales<- my_data %>% filter(zip_code%in%c(98052,98053))%>%summarise(sum(price))
Redmond_market_share <- as.double(Redmond_sales/sum(my_data$price))
# 98033 98034 kirkland
kirkland_sales<- my_data %>% filter(zip_code%in%c(98033,98034))%>%summarise(sum(price))
kirkland_market_share <- as.double(kirkland_sales/sum(my_data$price))
kirkland_market_share
## [1] 0.0541049
#gather data for plotting
mkt_share <- c(Seattle_market_share,bellevue_market_share,mercer_market_share,Redmond_market_share,kirkland_market_share)
mkt_city <- c("Seattle","Bellevue","Mercer Island","Redmond","kirkland")
mkt <- data.frame(mkt_city,mkt_share )
#plotting
mkt_share <- ggplot(mkt, aes(x= mkt_city,y=mkt_share))+geom_bar(stat = "identity")+labs(title = "Main Sales Force Market Share", x="City",y ="Market Share")+ geom_text(aes(label=round(mkt_share,2)),color = "DarkGreen",vjust=-0.10)+theme_classic()
mkt_share

ggsave("mkt_share.png")
## Saving 5 x 3 in image
Finding: Redmond makes most of the money, followed by Kirkland.
#Now let check what type of houese these buyers are looking for
# We will foucusing on the more relevant factor based on previous analysis: bedrooms, sqrt_living and
# Redmon
Redmon_sales<- plot_data%>%filter(zip_code%in%c(98052,98053))
summary(Redmon_sales)
## tran_id price bedrooms bathrooms
## Min. : 6 Min. : 170000 3 :394 2.5 :392
## 1st Qu.: 5550 1st Qu.: 500000 4 :381 1.75 :133
## Median :10815 Median : 625000 2 :116 2.25 :102
## Mean :10827 Mean : 658909 5 : 74 2 : 88
## 3rd Qu.:16126 3rd Qu.: 768000 6 : 10 2.75 : 73
## Max. :21600 Max. :2280000 1 : 2 3.5 : 45
## (Other): 2 (Other):146
## sqft_living sqft_lot floors waterfront view
## Min. : 530 Min. : 785 1 :384 0:976 0:953
## 1st Qu.: 1730 1st Qu.: 5912 1.5: 16 1: 3 1: 2
## Median : 2360 Median : 8652 2 :563 2: 14
## Mean : 2470 Mean : 21226 2.5: 6 3: 6
## 3rd Qu.: 3040 3rd Qu.: 14596 3 : 10 4: 4
## Max. :13540 Max. :453895 3.5: 0
##
## grade condition yr_built yr_renovated zip_code
## 8 :381 1: 1 Min. :1919 Min. : 0.00 Min. :98052
## 7 :242 2: 3 1st Qu.:1978 1st Qu.: 0.00 1st Qu.:98052
## 9 :216 3:776 Median :1990 Median : 0.00 Median :98052
## 10 : 87 4:184 Mean :1991 Mean : 24.48 Mean :98052
## 11 : 31 5: 15 3rd Qu.:2004 3rd Qu.: 0.00 3rd Qu.:98053
## 6 : 16 Max. :2014 Max. :2014.00 Max. :98053
## (Other): 6
#plotting for choice preference
ggplot(Redmon_sales, aes(y=sqft_living ,x=grade, color= bedrooms))+geom_point(alpha=0.2)+ geom_hline(yintercept=quantile(Redmon_sales$sqft_living),probs=.25,color= "red",linetype="dashed", size=0.5)+ geom_jitter()+labs(title = "Redmon Customer Preference", x="Grade",y ="Living Space(Sqrt)",color="Bedrooms")
## Warning: Ignoring unknown parameters: probs

ggsave("Remond_mkt.png")
## Saving 7 x 5 in image
Finding: Poeple in Redmon are mainly looking for house with 3 or 4 bedrooms. around 2350 sqrt, and grade in 8.
#Kirkland
Kirkland_sales<- plot_data%>%filter(zip_code%in%c(98033,98034))
summary(Kirkland_sales)
## tran_id price bedrooms bathrooms
## Min. : 72 Min. : 90000 3 :490 2.5 :229
## 1st Qu.: 5152 1st Qu.: 425000 4 :342 1.75 :166
## Median :10485 Median : 510000 5 : 81 2.25 :123
## Mean :10527 Mean : 646543 2 : 48 1 :112
## 3rd Qu.:16204 3rd Qu.: 739000 6 : 12 2 : 81
## Max. :21486 Max. :5110000 1 : 2 2.75 : 71
## (Other): 2 (Other):195
## sqft_living sqft_lot floors waterfront view
## Min. : 750 Min. : 1360 1 :573 0:967 0:870
## 1st Qu.:1510 1st Qu.: 7201 1.5: 46 1: 10 1: 23
## Median :1980 Median : 8424 2 :351 2: 52
## Mean :2157 Mean : 10092 2.5: 2 3: 16
## 3rd Qu.:2600 3rd Qu.: 10500 3 : 5 4: 16
## Max. :8010 Max. :290980 3.5: 0
##
## grade condition yr_built yr_renovated zip_code
## 7 :496 1: 1 Min. :1900 Min. : 0.00 Min. :98033
## 8 :209 2: 5 1st Qu.:1967 1st Qu.: 0.00 1st Qu.:98033
## 9 :121 3:620 Median :1976 Median : 0.00 Median :98034
## 10 : 74 4:278 Mean :1977 Mean : 61.22 Mean :98034
## 6 : 47 5: 73 3rd Qu.:1989 3rd Qu.: 0.00 3rd Qu.:98034
## 11 : 21 Max. :2014 Max. :2014.00 Max. :98034
## (Other): 9
#plotting for choice preference
ggplot(Kirkland_sales, aes(y=sqft_living ,x=grade, color= bedrooms))+geom_point(alpha=0.2)+ geom_hline(yintercept=quantile(Kirkland_sales$sqft_living),probs=.25,color= "red",linetype="dashed", size=0.5)+ geom_jitter()+labs(title = "Kirkland Customer Preference", x="Grade",y ="Living Space(Sqrt)",color="Bedrooms")
## Warning: Ignoring unknown parameters: probs

ggsave("Kirkland_mkt.png")
## Saving 7 x 5 in image
Finding: Poeple in Kirkland are mainly looking for house with 3 or 4 bedrooms. around 2000 sqrt, and grade in 7
The power of rennovation
Tartget: Find out how much value the renovation can bring to the price
# We use price per sqft as idex, for ruling out the effect of house size
t_data <- data.frame(my_data)
t_data$sqrt_price <- t_data$price/ t_data$sqft_living
#head(t_data)
# add one more column
for(i in 21613){
t_data$renovated <- NA
}
# put in data
t_data$renovation <- ifelse(t_data$yr_renovated ==0, "N", "Y")
head(t_data)
## tran_id price bedrooms bathrooms sqft_living sqft_lot floors
## 1 1 221900 3 1.00 1180 5650 1
## 2 2 538000 3 2.25 2570 7242 2
## 3 3 180000 2 1.00 770 10000 1
## 4 4 604000 4 3.00 1960 5000 1
## 5 5 510000 3 2.00 1680 8080 1
## 6 6 1230000 4 4.50 5420 101930 1
## waterfront view grade condition yr_built yr_renovated zip_code
## 1 0 0 7 3 1955 0 98178
## 2 0 0 7 3 1951 1991 98125
## 3 0 0 6 3 1933 0 98028
## 4 0 0 7 5 1965 0 98136
## 5 0 0 8 3 1987 0 98074
## 6 0 0 11 3 2001 0 98053
## sqrt_price renovated renovation
## 1 188.0508 NA N
## 2 209.3385 NA Y
## 3 233.7662 NA N
## 4 308.1633 NA N
## 5 303.5714 NA N
## 6 226.9373 NA N
#plot the price increse
t_data$yr_built <- as.factor(t_data$yr_built)
ggplot(t_data,aes(x=yr_built,y=price,color = renovation))+geom_boxplot()+coord_flip()

Note: There’s a real difference in price the renovation can bring. But we try to focus more on how much the difference
# prepare data
reno_power <- t_data%>%group_by(renovation,yr_built)%>%summarise(median(sqrt_price))
colnames(reno_power)[3] <- "price"
head(reno_power)
## # A tibble: 6 x 3
## # Groups: renovation [1]
## renovation yr_built price
## <chr> <fct> <dbl>
## 1 N 1900 342.
## 2 N 1901 305.
## 3 N 1902 327.
## 4 N 1903 344.
## 5 N 1904 360.
## 6 N 1905 403.
#plot the difference
reno_pic <- ggplot(reno_power,aes(x = yr_built, y=price, group = renovation)) +
geom_line(aes(color=renovation), size = 2.0) +
labs(fill = "Renovation", size=10.0) +
ggtitle("The difference between renovation and without renovation") +
ylab("Median of Price per sqft") +
labs(fill = "Renovation") +
theme(plot.title = element_text(face="bold", size=20)) +
theme(axis.title = element_text(face="bold", size=17), axis.text.x = element_text(size=6,angle=45), axis.text.y = element_text(size=10))
Finding: The price of of renovated houese( blue line) is almost always higher the those without(red). And sometime is times of difference.
ggsave(filename = "reno_pic.png", width = 16, height = 10, plot = reno_pic)