# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)

1 Load Package and Data

#load package 
library(gridExtra)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.3     ✔ purrr   0.3.2
## ✔ tidyr   1.0.0     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ tibble  2.1.3     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
library(gridExtra)

#load data 
my_data <- read.csv("qp1_data.csv")
head(my_data)
##     price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 1  221900        3      1.00        1180     5650      1          0    0
## 2  538000        3      2.25        2570     7242      2          0    0
## 3  180000        2      1.00         770    10000      1          0    0
## 4  604000        4      3.00        1960     5000      1          0    0
## 5  510000        3      2.00        1680     8080      1          0    0
## 6 1230000        4      4.50        5420   101930      1          0    0
##   grade condition yr_built yr_renovated zip_code
## 1     7         3     1955            0    98178
## 2     7         3     1951         1991    98125
## 3     6         3     1933            0    98028
## 4     7         5     1965            0    98136
## 5     8         3     1987            0    98074
## 6    11         3     2001            0    98053
fig_s<- "fig.width = 6, fig.height = 3"
#add transit id
tran_id <- c(1:nrow(my_data))
my_data <- add_column(my_data, tran_id, .before = "price")
head(my_data)
##   tran_id   price bedrooms bathrooms sqft_living sqft_lot floors
## 1       1  221900        3      1.00        1180     5650      1
## 2       2  538000        3      2.25        2570     7242      2
## 3       3  180000        2      1.00         770    10000      1
## 4       4  604000        4      3.00        1960     5000      1
## 5       5  510000        3      2.00        1680     8080      1
## 6       6 1230000        4      4.50        5420   101930      1
##   waterfront view grade condition yr_built yr_renovated zip_code
## 1          0    0     7         3     1955            0    98178
## 2          0    0     7         3     1951         1991    98125
## 3          0    0     6         3     1933            0    98028
## 4          0    0     7         5     1965            0    98136
## 5          0    0     8         3     1987            0    98074
## 6          0    0    11         3     2001            0    98053

2 Base EDA

2.1 Uni-variate non-graphical EDA

str(my_data)
## 'data.frame':    21613 obs. of  14 variables:
##  $ tran_id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price       : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms    : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms   : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot    : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors      : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ grade       : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ condition   : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ yr_built    : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated: int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zip_code    : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
summary(my_data)
##     tran_id          price            bedrooms        bathrooms    
##  Min.   :    1   Min.   :  75000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 5404   1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750  
##  Median :10807   Median : 450000   Median : 3.000   Median :2.250  
##  Mean   :10807   Mean   : 540182   Mean   : 3.371   Mean   :2.115  
##  3rd Qu.:16210   3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500  
##  Max.   :21613   Max.   :7700000   Max.   :33.000   Max.   :8.000  
##   sqft_living       sqft_lot           floors        waterfront      
##  Min.   :  290   Min.   :    520   Min.   :1.000   Min.   :0.000000  
##  1st Qu.: 1427   1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000  
##  Median : 1910   Median :   7618   Median :1.500   Median :0.000000  
##  Mean   : 2080   Mean   :  15107   Mean   :1.494   Mean   :0.007542  
##  3rd Qu.: 2550   3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000  
##  Max.   :13540   Max.   :1651359   Max.   :3.500   Max.   :1.000000  
##       view            grade          condition        yr_built   
##  Min.   :0.0000   Min.   : 1.000   Min.   :1.000   Min.   :1900  
##  1st Qu.:0.0000   1st Qu.: 7.000   1st Qu.:3.000   1st Qu.:1951  
##  Median :0.0000   Median : 7.000   Median :3.000   Median :1975  
##  Mean   :0.2343   Mean   : 7.657   Mean   :3.409   Mean   :1971  
##  3rd Qu.:0.0000   3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.:1997  
##  Max.   :4.0000   Max.   :13.000   Max.   :5.000   Max.   :2015  
##   yr_renovated       zip_code    
##  Min.   :   0.0   Min.   :98001  
##  1st Qu.:   0.0   1st Qu.:98033  
##  Median :   0.0   Median :98065  
##  Mean   :  84.4   Mean   :98078  
##  3rd Qu.:   0.0   3rd Qu.:98118  
##  Max.   :2015.0   Max.   :98199
summary(my_data[,2:ncol(my_data)])
##      price            bedrooms        bathrooms      sqft_living   
##  Min.   :  75000   Min.   : 0.000   Min.   :0.000   Min.   :  290  
##  1st Qu.: 321950   1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427  
##  Median : 450000   Median : 3.000   Median :2.250   Median : 1910  
##  Mean   : 540182   Mean   : 3.371   Mean   :2.115   Mean   : 2080  
##  3rd Qu.: 645000   3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550  
##  Max.   :7700000   Max.   :33.000   Max.   :8.000   Max.   :13540  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    520   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5040   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7618   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  15107   Mean   :1.494   Mean   :0.007542   Mean   :0.2343  
##  3rd Qu.:  10688   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1651359   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##      grade          condition        yr_built     yr_renovated   
##  Min.   : 1.000   Min.   :1.000   Min.   :1900   Min.   :   0.0  
##  1st Qu.: 7.000   1st Qu.:3.000   1st Qu.:1951   1st Qu.:   0.0  
##  Median : 7.000   Median :3.000   Median :1975   Median :   0.0  
##  Mean   : 7.657   Mean   :3.409   Mean   :1971   Mean   :  84.4  
##  3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.:1997   3rd Qu.:   0.0  
##  Max.   :13.000   Max.   :5.000   Max.   :2015   Max.   :2015.0  
##     zip_code    
##  Min.   :98001  
##  1st Qu.:98033  
##  Median :98065  
##  Mean   :98078  
##  3rd Qu.:98118  
##  Max.   :98199

[Number of Bedrooms]

#bedroom number count
table(my_data$bedrooms)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   33 
##   13  199 2760 9824 6882 1601  272   38   13    6    3    1    1

Note: Since bedroom number is generally the first buyers’ priority, we concluded that the highest demand goes to the house with three bedrooms type.

Question: with no bedroom? studio? a house with 33 bedrooms? may want to deal with it as an outlier.

#pull out the 33 rooms entry
my_data%>%filter(my_data$bedrooms==33)
##   tran_id  price bedrooms bathrooms sqft_living sqft_lot floors waterfront
## 1   15871 640000       33      1.75        1620     6000      1          0
##   view grade condition yr_built yr_renovated zip_code
## 1    0     7         5     1947            0    98103

Note: It might be a typo since there are only 1620 sq ft for living and one bathroom if I have the authority I will remove this data.

[Number of floors]

#floor numer count
table(my_data$floor)
## 
##     1   1.5     2   2.5     3   3.5 
## 10680  1910  8241   161   613     8

[Transition in different zip code]

table(my_data$zip_code)
## 
## 98001 98002 98003 98004 98005 98006 98007 98008 98010 98011 98014 98019 
##   362   199   280   317   168   498   141   283   100   195   124   190 
## 98022 98023 98024 98027 98028 98029 98030 98031 98032 98033 98034 98038 
##   234   499    81   412   283   321   256   274   125   432   545   590 
## 98039 98040 98042 98045 98052 98053 98055 98056 98058 98059 98065 98070 
##    50   282   548   221   574   405   268   406   455   468   310   118 
## 98072 98074 98075 98077 98092 98102 98103 98105 98106 98107 98108 98109 
##   273   441   359   198   351   105   602   229   335   266   186   109 
## 98112 98115 98116 98117 98118 98119 98122 98125 98126 98133 98136 98144 
##   269   583   330   553   508   184   290   410   354   494   263   343 
## 98146 98148 98155 98166 98168 98177 98178 98188 98198 98199 
##   288    57   446   254   269   255   262   136   280   317

2.2 Uni-variate graphical EDA

[Price,sqft_live,sqft_lot Distribution]

grid.arrange( 
  
ggplot(my_data, aes(x= price)) + geom_histogram(),
ggplot(my_data,aes(x=sqft_living))+geom_histogram(),
ggplot(my_data,aes(x=sqft_lot))+geom_histogram(),
ncol=1
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Note: Price, sqft_living, sqft_lot distribution are all skewed. It’s better use median instead of mean later.

[floors, bedrooms, bathrooms Distribution]

grid.arrange( 
  
ggplot(my_data, aes(x= floors)) + geom_histogram(stat="count"),
ggplot(my_data, aes(x=bedrooms)) + geom_histogram(stat="count"),
ggplot(my_data, aes(x=bathrooms)) + geom_histogram(stat="count"),
ncol=1
)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

Finding: Most of houses have 1 to 2 floors. Most of houses have 3 to 4 rooms. Most of houses have 1 or 2.5 bathrooms.

[House Location Distribution]

ggplot(my_data,aes(x=zip_code))+geom_bar()

Note: House distribution in each zone is not even.

[Bedroom Number Distribution]

#just for checking the outlier house 
ggplot(my_data,aes(x=bedrooms))+geom_bar()+coord_cartesian(xlim=c(0,11))

Note: The market is looking for a house with three or four bedrooms.

[Year of Built Distribution]

yr_built_pic <- ggplot(my_data,aes(x=yr_built))+geom_bar(stat = "count") +
theme( axis.text.x = element_text(size=10))
ggsave(filename = "yr_built.png",width = 8, height = 5, plot = yr_built_pic)

Note: House built from 2014 has highest number among King County area.

2.3 multi-variate non-graphical EDA

[Price & zip_code]

zip_price <- my_data %>%
  group_by(zip_code) %>%
  summarize(median(price, na.rm = TRUE))

colnames(zip_price)[2] <- "h_price"

zip_price
## # A tibble: 70 x 2
##    zip_code  h_price
##       <int>    <dbl>
##  1    98001  260000 
##  2    98002  235000 
##  3    98003  267475 
##  4    98004 1150000 
##  5    98005  765475 
##  6    98006  760184.
##  7    98007  555000 
##  8    98008  545000 
##  9    98010  360000.
## 10    98011  470000 
## # … with 60 more rows

Note:House located in 98004/98039/98112 are pretty expensive, which can due to specific attribution with that area, a graph may demonstrate better.

[Bedroom & Bathroom]

table(my_data$bedrooms,my_data$bathroom)
##     
##         0  0.5 0.75    1 1.25  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5
##   0     7    0    1    1    0    1    0    0    0    3    0    0    0    0
##   1     3    1   27  138    2   12    4    6    4    2    0    0    0    0
##   2     0    2   26 1558    3  294  304  216  118  197   20   13    8    1
##   3     0    0   16 1780    4  829 1870 1048 1082 2357  275  197  184  143
##   4     0    1    2  325    0  254  719  525  709 2502  639  326  254  395
##   5     0    0    0   43    0   48  134  110  116  287  214  163  129  169
##   6     0    0    0    6    0    6   16   24   15   29   31   45   12   17
##   7     0    0    0    1    0    2    0    0    3    2    3    3    1    5
##   8     0    0    0    0    0    0    0    0    0    1    3    2    1    1
##   9     0    0    0    0    0    0    0    0    0    0    0    2    0    0
##   10    0    0    0    0    0    0    0    1    0    0    0    1    0    0
##   11    0    0    0    0    0    0    0    0    0    0    0    1    0    0
##   33    0    0    0    0    0    0    1    0    0    0    0    0    0    0
##     
##      3.75    4 4.25  4.5 4.75    5 5.25  5.5 5.75    6 6.25  6.5 6.75  7.5
##   0     0    0    0    0    0    0    0    0    0    0    0    0    0    0
##   1     0    0    0    0    0    0    0    0    0    0    0    0    0    0
##   2     0    0    0    0    0    0    0    0    0    0    0    0    0    0
##   3    17   11    6    5    0    0    0    0    0    0    0    0    0    0
##   4    78   58   38   32    7    7    5    5    1    0    0    0    0    0
##   5    44   48   25   35   11    7    4    4    2    4    2    1    1    0
##   6    13   11    8   23    3    6    3    0    0    1    0    1    0    0
##   7     2    5    2    3    2    0    0    1    1    0    0    0    1    0
##   8     1    2    0    0    0    1    0    0    0    1    0    0    0    0
##   9     0    1    0    2    0    0    0    0    0    0    0    0    0    1
##   10    0    0    0    0    0    0    1    0    0    0    0    0    0    0
##   11    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##   33    0    0    0    0    0    0    0    0    0    0    0    0    0    0
##     
##      7.75    8
##   0     0    0
##   1     0    0
##   2     0    0
##   3     0    0
##   4     0    0
##   5     0    0
##   6     1    1
##   7     0    1
##   8     0    0
##   9     0    0
##   10    0    0
##   11    0    0
##   33    0    0

Note: more bedrooms with more bathrooms normally

*The 33 rooms house only has 1.75 bathroom

[Floor & View]

table(my_data$floor,my_data$view)
##      
##          0    1    2    3    4
##   1   9684  192  454  219  131
##   1.5 1713   27   90   50   30
##   2   7407  108  376  215  135
##   2.5  130    0   17    8    6
##   3    549    5   24   18   17
##   3.5    6    0    2    0    0

Note: buildings with more floors are more likely with a better view

2.4 Multi-variate graphical EDA

Which attribution affect the price most

#Clean data for ploting 
plot_data <- data.frame(my_data)
plot_data$floors <- as.factor(plot_data$floors)
plot_data$bedrooms <- as.factor(plot_data$bedrooms)
plot_data$bathrooms <- as.factor(plot_data$bathrooms)
plot_data$view <- as.factor(plot_data$view)
plot_data$waterfront <- as.factor(plot_data$waterfront)
plot_data$grade <- as.factor(plot_data$grade)
plot_data$condition <- as.factor(plot_data$condition)

summary(plot_data)
##     tran_id          price            bedrooms      bathrooms   
##  Min.   :    1   Min.   :  75000   3      :9824   2.5    :5380  
##  1st Qu.: 5404   1st Qu.: 321950   4      :6882   1      :3852  
##  Median :10807   Median : 450000   2      :2760   1.75   :3048  
##  Mean   :10807   Mean   : 540182   5      :1601   2.25   :2047  
##  3rd Qu.:16210   3rd Qu.: 645000   6      : 272   2      :1930  
##  Max.   :21613   Max.   :7700000   1      : 199   1.5    :1446  
##                                    (Other):  75   (Other):3910  
##   sqft_living       sqft_lot       floors      waterfront view     
##  Min.   :  290   Min.   :    520   1  :10680   0:21450    0:19489  
##  1st Qu.: 1427   1st Qu.:   5040   1.5: 1910   1:  163    1:  332  
##  Median : 1910   Median :   7618   2  : 8241              2:  963  
##  Mean   : 2080   Mean   :  15107   2.5:  161              3:  510  
##  3rd Qu.: 2550   3rd Qu.:  10688   3  :  613              4:  319  
##  Max.   :13540   Max.   :1651359   3.5:    8                       
##                                                                    
##      grade      condition    yr_built     yr_renovated       zip_code    
##  7      :8981   1:   30   Min.   :1900   Min.   :   0.0   Min.   :98001  
##  8      :6068   2:  172   1st Qu.:1951   1st Qu.:   0.0   1st Qu.:98033  
##  9      :2615   3:14031   Median :1975   Median :   0.0   Median :98065  
##  6      :2038   4: 5679   Mean   :1971   Mean   :  84.4   Mean   :98078  
##  10     :1134   5: 1701   3rd Qu.:1997   3rd Qu.:   0.0   3rd Qu.:98118  
##  11     : 399             Max.   :2015   Max.   :2015.0   Max.   :98199  
##  (Other): 378
grid.arrange(

ggplot(plot_data,aes(x=plot_data$bedrooms,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$bathrooms,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$sqft_living,y=price))+geom_point()+stat_smooth(method = "lm"),
ggplot(plot_data,aes(x=plot_data$sqft_lot,y=price))+geom_point()+stat_smooth(method = "lm"),
ggplot(plot_data,aes(x=plot_data$floors,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$waterfront,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$view,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$grade,y=price))+geom_boxplot(),
ggplot(plot_data,aes(x=plot_data$condition,y=price))+geom_boxplot(),

ncol=2
)

Findings: The relation between each factor and price are a bellow : - A number of bedrooms increases, price increases, but the effect goes less significant when it’s more than 5 bedrooms. - Number of bathroom increases, price increases, - Both spaces of living and lot have a positive relationship with price, but living space has more power than a lot space. - Building with more floors at a higher price. However, the price reaches its peak at 2.5 floors and then drop. - The view doesn’t really increase the value of the house. However, the waterfront view significantly did. - The initial quality of the building is decisive to price, but the maintenance no so much.

[Price and Zip_code]

#group the zip_code
zip_price <- my_data %>%
  group_by(zip_code) %>%
  summarize(median(price, na.rm = TRUE))

colnames(zip_price)[2] <- "h_price"
zip_price
## # A tibble: 70 x 2
##    zip_code  h_price
##       <int>    <dbl>
##  1    98001  260000 
##  2    98002  235000 
##  3    98003  267475 
##  4    98004 1150000 
##  5    98005  765475 
##  6    98006  760184.
##  7    98007  555000 
##  8    98008  545000 
##  9    98010  360000.
## 10    98011  470000 
## # … with 60 more rows
#plotting 
ggplot(zip_price,aes( x=reorder(zip_code,h_price),y=h_price))+geom_bar(stat = "identity")+coord_flip()+ geom_hline(yintercept=median(my_data$price), color= "red")

Question : why the house located in 98039 98004 98040 98112 are more expensive than others?

[Prcie & space of living ]

new_d <- my_data%>%filter(bedrooms<=8)
new_d$bedrooms <- as.factor(new_d$bedrooms)
ggplot(new_d,aes(x=sqft_living , y= price,color=bedrooms))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)

Findings : - More sqft_living increase the price, this phenomenon will be more significant if there are more rooms in the house - But the effect of sqft to price drops when there 7 rooms in the house, may due to the over-divided interior design - The effect bounces back when house has 8 rooms, but there are only 18 observations of this group. I don’t really trust this

#pull out all house with 8 room to check if there are other factor that may affect the price 
summary(my_data %>% filter(bedrooms==8))
##     tran_id          price            bedrooms   bathrooms    
##  Min.   : 1661   Min.   : 340000   Min.   :8   Min.   :2.500  
##  1st Qu.: 6175   1st Qu.: 490000   1st Qu.:8   1st Qu.:2.750  
##  Median :10959   Median : 700000   Median :8   Median :3.250  
##  Mean   :11083   Mean   :1105077   Mean   :8   Mean   :3.558  
##  3rd Qu.:15671   3rd Qu.:1650000   3rd Qu.:8   3rd Qu.:4.000  
##  Max.   :19303   Max.   :3300000   Max.   :8   Max.   :6.000  
##   sqft_living      sqft_lot         floors        waterfront
##  Min.   :2280   Min.   : 2580   Min.   :1.000   Min.   :0   
##  1st Qu.:2800   1st Qu.: 4800   1st Qu.:1.000   1st Qu.:0   
##  Median :3840   Median : 7500   Median :2.000   Median :0   
##  Mean   :3800   Mean   : 8972   Mean   :1.692   Mean   :0   
##  3rd Qu.:4300   3rd Qu.:11750   3rd Qu.:2.000   3rd Qu.:0   
##  Max.   :7710   Max.   :20666   Max.   :3.500   Max.   :0   
##       view            grade          condition        yr_built   
##  Min.   :0.0000   Min.   : 7.000   Min.   :3.000   Min.   :1901  
##  1st Qu.:0.0000   1st Qu.: 7.000   1st Qu.:3.000   1st Qu.:1959  
##  Median :0.0000   Median : 8.000   Median :3.000   Median :1962  
##  Mean   :0.2308   Mean   : 8.077   Mean   :3.538   Mean   :1956  
##  3rd Qu.:0.0000   3rd Qu.: 8.000   3rd Qu.:4.000   3rd Qu.:1977  
##  Max.   :3.0000   Max.   :12.000   Max.   :5.000   Max.   :1997  
##   yr_renovated    zip_code    
##  Min.   :0     Min.   :98004  
##  1st Qu.:0     1st Qu.:98055  
##  Median :0     Median :98105  
##  Mean   :0     Mean   :98082  
##  3rd Qu.:0     3rd Qu.:98112  
##  Max.   :0     Max.   :98133

Finding : The house with 8 rooms is more expenxive even the living space doesn’t increase may because this group is comparatively higher in grade and condition. However, interestingly, they barely come with a view or waterfront view, which intuitionally will cause the price to decrease.

[Price and year built]

library(tidyverse)
plot_data %>% group_by(yr_built) %>% summarize(med_pri = median(price)) %>% ggplot(mapping = aes(x = yr_built, y = med_pri)) + geom_line()

Finding: The relation comes in a U shape, maybe people like the old house for its vintage style and the new one for its condition.

[Price Trend & Floors] !! Premise !!! The transaction record is chronological

ggplot(my_data[1:5000,],aes(x=tran_id, y=price))+geom_line()+facet_grid(floors~.)
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

Finding: The price change pattern is not the same in different bedrooms group.

3 Findings (Executive Advice )

3.1 Main sales force city

Tartget: Find out which city and what kind of house occupied the most sale

# group by zip_code, but take sum of the price for sales force analysis
zip_price_sum <- my_data %>%
  group_by(zip_code) %>%
  summarize(sum(price, na.rm = TRUE))

colnames(zip_price_sum)[2] <- "h_price"
zip_price_sum <- zip_price_sum%>%arrange(-h_price)

# Let's look at the zip_code price sum 
ggplot(zip_price_sum,aes( x=reorder(zip_code,h_price),y=h_price))+geom_bar(stat = "identity")+coord_flip()

# try to find which city sales the most. Intuitionally, Seattle should occupy the most sales.
# 98102-98199 - Seattle 
Seattle_sales<- my_data %>% filter(zip_code%in% c(98102,98199))%>%summarise(sum(price))
Seattle_market_share <- as.double(Seattle_sales / sum(my_data$price))


#We will also check some city from the top
#98004-98009 bellevue
bellevue_sales<- my_data %>% filter(zip_code%in%c(98004,98009))%>%summarise(sum(price))
bellevue_market_share <- as.double(bellevue_sales / sum(my_data$price))

#98040 Mercer island
mercer_sales<- my_data %>% filter(zip_code==98040)%>%summarise(sum(price))
mercer_market_share <- as.double(mercer_sales/sum(my_data$price))

#98052 53 Redmond
Redmond_sales<- my_data %>% filter(zip_code%in%c(98052,98053))%>%summarise(sum(price))
Redmond_market_share <- as.double(Redmond_sales/sum(my_data$price))

# 98033 98034 kirkland
kirkland_sales<- my_data %>% filter(zip_code%in%c(98033,98034))%>%summarise(sum(price))
kirkland_market_share <- as.double(kirkland_sales/sum(my_data$price))
kirkland_market_share
## [1] 0.0541049
#gather data for plotting
mkt_share <- c(Seattle_market_share,bellevue_market_share,mercer_market_share,Redmond_market_share,kirkland_market_share)
mkt_city <- c("Seattle","Bellevue","Mercer Island","Redmond","kirkland")
mkt <- data.frame(mkt_city,mkt_share )

#plotting
mkt_share <- ggplot(mkt, aes(x= mkt_city,y=mkt_share))+geom_bar(stat = "identity")+labs(title = "Main Sales Force Market Share", x="City",y ="Market Share")+ geom_text(aes(label=round(mkt_share,2)),color = "DarkGreen",vjust=-0.10)+theme_classic()
mkt_share

ggsave("mkt_share.png")
## Saving 5 x 3 in image

Finding: Redmond makes most of the money, followed by Kirkland.

#Now let check what type of houese these buyers are looking for
# We will foucusing on the more relevant factor based on previous analysis: bedrooms, sqrt_living and  
# Redmon
Redmon_sales<- plot_data%>%filter(zip_code%in%c(98052,98053))
summary(Redmon_sales)
##     tran_id          price            bedrooms     bathrooms  
##  Min.   :    6   Min.   : 170000   3      :394   2.5    :392  
##  1st Qu.: 5550   1st Qu.: 500000   4      :381   1.75   :133  
##  Median :10815   Median : 625000   2      :116   2.25   :102  
##  Mean   :10827   Mean   : 658909   5      : 74   2      : 88  
##  3rd Qu.:16126   3rd Qu.: 768000   6      : 10   2.75   : 73  
##  Max.   :21600   Max.   :2280000   1      :  2   3.5    : 45  
##                                    (Other):  2   (Other):146  
##   sqft_living       sqft_lot      floors    waterfront view   
##  Min.   :  530   Min.   :   785   1  :384   0:976      0:953  
##  1st Qu.: 1730   1st Qu.:  5912   1.5: 16   1:  3      1:  2  
##  Median : 2360   Median :  8652   2  :563              2: 14  
##  Mean   : 2470   Mean   : 21226   2.5:  6              3:  6  
##  3rd Qu.: 3040   3rd Qu.: 14596   3  : 10              4:  4  
##  Max.   :13540   Max.   :453895   3.5:  0                     
##                                                               
##      grade     condition    yr_built     yr_renovated        zip_code    
##  8      :381   1:  1     Min.   :1919   Min.   :   0.00   Min.   :98052  
##  7      :242   2:  3     1st Qu.:1978   1st Qu.:   0.00   1st Qu.:98052  
##  9      :216   3:776     Median :1990   Median :   0.00   Median :98052  
##  10     : 87   4:184     Mean   :1991   Mean   :  24.48   Mean   :98052  
##  11     : 31   5: 15     3rd Qu.:2004   3rd Qu.:   0.00   3rd Qu.:98053  
##  6      : 16             Max.   :2014   Max.   :2014.00   Max.   :98053  
##  (Other):  6
#plotting for choice preference 
ggplot(Redmon_sales, aes(y=sqft_living ,x=grade, color= bedrooms))+geom_point(alpha=0.2)+ geom_hline(yintercept=quantile(Redmon_sales$sqft_living),probs=.25,color= "red",linetype="dashed", size=0.5)+ geom_jitter()+labs(title = "Redmon Customer Preference", x="Grade",y ="Living Space(Sqrt)",color="Bedrooms")
## Warning: Ignoring unknown parameters: probs

ggsave("Remond_mkt.png")
## Saving 7 x 5 in image

Finding: Poeple in Redmon are mainly looking for house with 3 or 4 bedrooms. around 2350 sqrt, and grade in 8.

#Kirkland
Kirkland_sales<- plot_data%>%filter(zip_code%in%c(98033,98034))
summary(Kirkland_sales)
##     tran_id          price            bedrooms     bathrooms  
##  Min.   :   72   Min.   :  90000   3      :490   2.5    :229  
##  1st Qu.: 5152   1st Qu.: 425000   4      :342   1.75   :166  
##  Median :10485   Median : 510000   5      : 81   2.25   :123  
##  Mean   :10527   Mean   : 646543   2      : 48   1      :112  
##  3rd Qu.:16204   3rd Qu.: 739000   6      : 12   2      : 81  
##  Max.   :21486   Max.   :5110000   1      :  2   2.75   : 71  
##                                    (Other):  2   (Other):195  
##   sqft_living      sqft_lot      floors    waterfront view   
##  Min.   : 750   Min.   :  1360   1  :573   0:967      0:870  
##  1st Qu.:1510   1st Qu.:  7201   1.5: 46   1: 10      1: 23  
##  Median :1980   Median :  8424   2  :351              2: 52  
##  Mean   :2157   Mean   : 10092   2.5:  2              3: 16  
##  3rd Qu.:2600   3rd Qu.: 10500   3  :  5              4: 16  
##  Max.   :8010   Max.   :290980   3.5:  0                     
##                                                              
##      grade     condition    yr_built     yr_renovated        zip_code    
##  7      :496   1:  1     Min.   :1900   Min.   :   0.00   Min.   :98033  
##  8      :209   2:  5     1st Qu.:1967   1st Qu.:   0.00   1st Qu.:98033  
##  9      :121   3:620     Median :1976   Median :   0.00   Median :98034  
##  10     : 74   4:278     Mean   :1977   Mean   :  61.22   Mean   :98034  
##  6      : 47   5: 73     3rd Qu.:1989   3rd Qu.:   0.00   3rd Qu.:98034  
##  11     : 21             Max.   :2014   Max.   :2014.00   Max.   :98034  
##  (Other):  9
#plotting for choice preference 
ggplot(Kirkland_sales, aes(y=sqft_living ,x=grade, color= bedrooms))+geom_point(alpha=0.2)+ geom_hline(yintercept=quantile(Kirkland_sales$sqft_living),probs=.25,color= "red",linetype="dashed", size=0.5)+ geom_jitter()+labs(title = "Kirkland Customer Preference", x="Grade",y ="Living Space(Sqrt)",color="Bedrooms")
## Warning: Ignoring unknown parameters: probs

ggsave("Kirkland_mkt.png")
## Saving 7 x 5 in image

Finding: Poeple in Kirkland are mainly looking for house with 3 or 4 bedrooms. around 2000 sqrt, and grade in 7

3.2 The power of rennovation

Tartget: Find out how much value the renovation can bring to the price

# We use price per sqft as idex, for ruling out the effect of house size
t_data <- data.frame(my_data)
t_data$sqrt_price <- t_data$price/ t_data$sqft_living
#head(t_data)

# add one more column
for(i in 21613){
  t_data$renovated <- NA
}


# put in data

t_data$renovation <- ifelse(t_data$yr_renovated ==0, "N", "Y")
head(t_data)
##   tran_id   price bedrooms bathrooms sqft_living sqft_lot floors
## 1       1  221900        3      1.00        1180     5650      1
## 2       2  538000        3      2.25        2570     7242      2
## 3       3  180000        2      1.00         770    10000      1
## 4       4  604000        4      3.00        1960     5000      1
## 5       5  510000        3      2.00        1680     8080      1
## 6       6 1230000        4      4.50        5420   101930      1
##   waterfront view grade condition yr_built yr_renovated zip_code
## 1          0    0     7         3     1955            0    98178
## 2          0    0     7         3     1951         1991    98125
## 3          0    0     6         3     1933            0    98028
## 4          0    0     7         5     1965            0    98136
## 5          0    0     8         3     1987            0    98074
## 6          0    0    11         3     2001            0    98053
##   sqrt_price renovated renovation
## 1   188.0508        NA          N
## 2   209.3385        NA          Y
## 3   233.7662        NA          N
## 4   308.1633        NA          N
## 5   303.5714        NA          N
## 6   226.9373        NA          N
#plot the price increse 
t_data$yr_built <- as.factor(t_data$yr_built)

ggplot(t_data,aes(x=yr_built,y=price,color = renovation))+geom_boxplot()+coord_flip()

Note: There’s a real difference in price the renovation can bring. But we try to focus more on how much the difference

# prepare data 
reno_power <- t_data%>%group_by(renovation,yr_built)%>%summarise(median(sqrt_price))
colnames(reno_power)[3] <- "price"
head(reno_power)
## # A tibble: 6 x 3
## # Groups:   renovation [1]
##   renovation yr_built price
##   <chr>      <fct>    <dbl>
## 1 N          1900      342.
## 2 N          1901      305.
## 3 N          1902      327.
## 4 N          1903      344.
## 5 N          1904      360.
## 6 N          1905      403.
#plot the difference
reno_pic <- ggplot(reno_power,aes(x = yr_built, y=price, group = renovation)) + 
geom_line(aes(color=renovation), size = 2.0) +
labs(fill = "Renovation", size=10.0) +
ggtitle("The difference between renovation and without renovation") +
ylab("Median of Price per sqft") +
labs(fill = "Renovation") +
theme(plot.title = element_text(face="bold", size=20)) +
theme(axis.title = element_text(face="bold", size=17), axis.text.x = element_text(size=6,angle=45), axis.text.y = element_text(size=10))

Finding: The price of of renovated houese( blue line) is almost always higher the those without(red). And sometime is times of difference.

ggsave(filename = "reno_pic.png", width = 16, height = 10, plot = reno_pic)