Data Wrangling

Input Data

hp <- read.csv("data_input/data_house.csv")
head(hp)
##   X         id            date   price bedrooms bathrooms sqft_living sqft_lot
## 1 0 7129300520 20141013T000000  221900        3      1.00        1180     5650
## 2 1 6414100192 20141209T000000  538000        3      2.25        2570     7242
## 3 2 5631500400 20150225T000000  180000        2      1.00         770    10000
## 4 3 2487200875 20141209T000000  604000        4      3.00        1960     5000
## 5 4 1954400510 20150218T000000  510000        3      2.00        1680     8080
## 6 5 7237550310 20140512T000000 1225000        4      4.50        5420   101930
##   floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1      1          0    0         3     7       1180             0     1955
## 2      2          0    0         3     7       2170           400     1951
## 3      1          0    0         3     6        770             0     1933
## 4      1          0    0         5     7       1050           910     1965
## 5      1          0    0         3     8       1680             0     1987
## 6      1          0    0         3    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930

Data Structure

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(hp)
## Rows: 21,613
## Columns: 22
## $ X             <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ id            <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 19544005…
## $ date          <chr> "20141013T000000", "20141209T000000", "20150225T000000",…
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms      <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living   <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot      <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition     <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade         <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above    <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built      <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated  <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ zipcode       <int> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, …
## $ lat           <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47…
## $ long          <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.0…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15    <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …

💡 Kesimpulan

  • Data memiliki 21,613, dan 22 olom
  • Ada beberapa kolum yang tidak diperlukan, seperti lat, long, zipcode, X, dan id
  • Kita dapat mengubah beberapa kolum dtype menjadi categorical seperti bedrooms, bathrooms, floors, dan waterfront.

Data Adjusment

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
hp1 <- hp %>% select(-c(lat, long, zipcode, X, id))
hp1$date <- format(as.Date(substr(hp1$date, 1, 8), "%Y%m%d"), "%m%d%Y")

hp1$renovated <- ifelse(hp1$yr_renovated == 0, 0, 1)

glimpse(hp1)
## Rows: 21,613
## Columns: 18
## $ date          <chr> "10132014", "12092014", "02252015", "12092014", "0218201…
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms      <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms     <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living   <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot      <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition     <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade         <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above    <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built      <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated  <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15    <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
## $ renovated     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
hp2 <- hp1 %>% 
  mutate(bathrooms = as.factor(bathrooms),
         bedrooms = as.factor(bedrooms),
         floors = as.factor(floors),
         waterfront = as.factor(waterfront))
glimpse(hp2)
## Rows: 21,613
## Columns: 18
## $ date          <chr> "10132014", "12092014", "02252015", "12092014", "0218201…
## $ price         <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms      <fct> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms     <fct> 1, 2.25, 1, 3, 2, 4.5, 2.25, 1.5, 1, 2.5, 2.5, 1, 1, 1.7…
## $ sqft_living   <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot      <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors        <fct> 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1.5, 1, 1.5, 2, 2, 1…
## $ waterfront    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition     <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade         <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above    <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built      <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated  <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15    <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
## $ renovated     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

NA

anyNA(hp2)
## [1] FALSE

EDA

boxplot(hp2$price)

boxplot(hp2$sqft_living)

boxplot(hp2$sqft_lot)

plot(x = hp2$price, y = hp2$sqft_living)

Variabel Selection

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
ggcorr(hp1, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(hp1, label = TRUE, label_size = 2.9, hjust = 1, layout.exp =
## 2): data in column(s) 'date' are not numeric and were ignored

ggcorr(hp2, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(hp2, label = TRUE, label_size = 2.9, hjust = 1, layout.exp =
## 2): data in column(s) 'date', 'bedrooms', 'bathrooms', 'floors', 'waterfront'
## are not numeric and were ignored

# Waterfront We want to analyze about waterfront feature in the house. Whether having a waterfront increase the value of the house and whether

wf1 <- hp2 %>% filter(waterfront == 1)
wf0 <- hp2 %>% filter(waterfront == 0)

swf1 <- summary(wf1[c("price", "sqft_living", "sqft_lot")])
swf0 <- summary(wf0[c("price", "sqft_living", "sqft_lot")])

print(swf1)
##      price          sqft_living       sqft_lot     
##  Min.   : 285000   Min.   :  440   Min.   :  1767  
##  1st Qu.: 760000   1st Qu.: 2060   1st Qu.: 10851  
##  Median :1400000   Median : 2850   Median : 17342  
##  Mean   :1661876   Mean   : 3174   Mean   : 25372  
##  3rd Qu.:2215000   3rd Qu.: 3905   3rd Qu.: 26408  
##  Max.   :7062500   Max.   :10040   Max.   :505166
print(swf0)
##      price          sqft_living       sqft_lot      
##  Min.   :  75000   Min.   :  290   Min.   :    520  
##  1st Qu.: 320000   1st Qu.: 1420   1st Qu.:   5032  
##  Median : 450000   Median : 1910   Median :   7588  
##  Mean   : 531564   Mean   : 2072   Mean   :  15029  
##  3rd Qu.: 639897   3rd Qu.: 2540   3rd Qu.:  10576  
##  Max.   :7700000   Max.   :13540   Max.   :1651359
print(count(wf0))
##       n
## 1 21450
wf1_count <- count(wf1) %>% rename(Waterfront_Count = n)
wf0_count <- count(wf0) %>% rename(No_Waterfront_Count = n)

count_table <- bind_rows(wf1_count, wf0_count)

print(count_table)
##   Waterfront_Count No_Waterfront_Count
## 1              163                  NA
## 2               NA               21450
cor(x = wf1$price,
    y = wf1$sqft_living)
## [1] 0.878811
plot(x = wf1$price, y = wf1$sqft_living)

cor(x = wf0$price,
    y = wf0$sqft_living)
## [1] 0.7015244
plot(x = wf0$price, y = wf0$sqft_living)

💡 Kesimpulan Dari analisis diatas, dapat disimpulkan bahwa rumah yang memiliki waterfront adalah rumah yang memiliki sqft_lot (luas tanah) yang lebih besar. Tapi banyak juga rumah yang besar namun tidak memiliki waterfront. Jadi, meskipun bisa dilihat bahwa rumah yang memiliki harga yang lebih tinggi, pengaruh luas tanah tidak bisa dilepaskan juga dari tingginya harga.