hp <- read.csv("data_input/data_house.csv")
head(hp)
## X id date price bedrooms bathrooms sqft_living sqft_lot
## 1 0 7129300520 20141013T000000 221900 3 1.00 1180 5650
## 2 1 6414100192 20141209T000000 538000 3 2.25 2570 7242
## 3 2 5631500400 20150225T000000 180000 2 1.00 770 10000
## 4 3 2487200875 20141209T000000 604000 4 3.00 1960 5000
## 5 4 1954400510 20150218T000000 510000 3 2.00 1680 8080
## 6 5 7237550310 20140512T000000 1225000 4 4.50 5420 101930
## floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1 1 0 0 3 7 1180 0 1955
## 2 2 0 0 3 7 2170 400 1951
## 3 1 0 0 3 6 770 0 1933
## 4 1 0 0 5 7 1050 910 1965
## 5 1 0 0 3 8 1680 0 1987
## 6 1 0 0 3 11 3890 1530 2001
## yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 0 98178 47.5112 -122.257 1340 5650
## 2 1991 98125 47.7210 -122.319 1690 7639
## 3 0 98028 47.7379 -122.233 2720 8062
## 4 0 98136 47.5208 -122.393 1360 5000
## 5 0 98074 47.6168 -122.045 1800 7503
## 6 0 98053 47.6561 -122.005 4760 101930
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(hp)
## Rows: 21,613
## Columns: 22
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ id <dbl> 7129300520, 6414100192, 5631500400, 2487200875, 19544005…
## $ date <chr> "20141013T000000", "20141209T000000", "20150225T000000",…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ zipcode <int> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, …
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47…
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.0…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
💡 Kesimpulan
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.1
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
hp1 <- hp %>% select(-c(lat, long, zipcode, X, id))
hp1$date <- format(as.Date(substr(hp1$date, 1, 8), "%Y%m%d"), "%m%d%Y")
hp1$renovated <- ifelse(hp1$yr_renovated == 0, 0, 1)
glimpse(hp1)
## Rows: 21,613
## Columns: 18
## $ date <chr> "10132014", "12092014", "02252015", "12092014", "0218201…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1…
## $ waterfront <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
## $ renovated <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
hp2 <- hp1 %>%
mutate(bathrooms = as.factor(bathrooms),
bedrooms = as.factor(bedrooms),
floors = as.factor(floors),
waterfront = as.factor(waterfront))
glimpse(hp2)
## Rows: 21,613
## Columns: 18
## $ date <chr> "10132014", "12092014", "02252015", "12092014", "0218201…
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500,…
## $ bedrooms <fct> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2,…
## $ bathrooms <fct> 1, 2.25, 1, 3, 2, 4.5, 2.25, 1.5, 1, 2.5, 2.5, 1, 1, 1.7…
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 189…
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470,…
## $ floors <fct> 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1.5, 1, 1.5, 2, 2, 1…
## $ waterfront <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,…
## $ condition <int> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4,…
## $ grade <int> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7…
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 189…
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, …
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 20…
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 23…
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, …
## $ renovated <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
anyNA(hp2)
## [1] FALSE
boxplot(hp2$price)
boxplot(hp2$sqft_living)
boxplot(hp2$sqft_lot)
plot(x = hp2$price, y = hp2$sqft_living)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
ggcorr(hp1, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(hp1, label = TRUE, label_size = 2.9, hjust = 1, layout.exp =
## 2): data in column(s) 'date' are not numeric and were ignored
ggcorr(hp2, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)
## Warning in ggcorr(hp2, label = TRUE, label_size = 2.9, hjust = 1, layout.exp =
## 2): data in column(s) 'date', 'bedrooms', 'bathrooms', 'floors', 'waterfront'
## are not numeric and were ignored
# Waterfront We want to analyze about waterfront feature in the house.
Whether having a waterfront increase the value of the house and
whether
wf1 <- hp2 %>% filter(waterfront == 1)
wf0 <- hp2 %>% filter(waterfront == 0)
swf1 <- summary(wf1[c("price", "sqft_living", "sqft_lot")])
swf0 <- summary(wf0[c("price", "sqft_living", "sqft_lot")])
print(swf1)
## price sqft_living sqft_lot
## Min. : 285000 Min. : 440 Min. : 1767
## 1st Qu.: 760000 1st Qu.: 2060 1st Qu.: 10851
## Median :1400000 Median : 2850 Median : 17342
## Mean :1661876 Mean : 3174 Mean : 25372
## 3rd Qu.:2215000 3rd Qu.: 3905 3rd Qu.: 26408
## Max. :7062500 Max. :10040 Max. :505166
print(swf0)
## price sqft_living sqft_lot
## Min. : 75000 Min. : 290 Min. : 520
## 1st Qu.: 320000 1st Qu.: 1420 1st Qu.: 5032
## Median : 450000 Median : 1910 Median : 7588
## Mean : 531564 Mean : 2072 Mean : 15029
## 3rd Qu.: 639897 3rd Qu.: 2540 3rd Qu.: 10576
## Max. :7700000 Max. :13540 Max. :1651359
print(count(wf0))
## n
## 1 21450
wf1_count <- count(wf1) %>% rename(Waterfront_Count = n)
wf0_count <- count(wf0) %>% rename(No_Waterfront_Count = n)
count_table <- bind_rows(wf1_count, wf0_count)
print(count_table)
## Waterfront_Count No_Waterfront_Count
## 1 163 NA
## 2 NA 21450
cor(x = wf1$price,
y = wf1$sqft_living)
## [1] 0.878811
plot(x = wf1$price, y = wf1$sqft_living)
cor(x = wf0$price,
y = wf0$sqft_living)
## [1] 0.7015244
plot(x = wf0$price, y = wf0$sqft_living)
💡 Kesimpulan Dari analisis diatas, dapat disimpulkan bahwa rumah yang memiliki waterfront adalah rumah yang memiliki sqft_lot (luas tanah) yang lebih besar. Tapi banyak juga rumah yang besar namun tidak memiliki waterfront. Jadi, meskipun bisa dilihat bahwa rumah yang memiliki harga yang lebih tinggi, pengaruh luas tanah tidak bisa dilepaskan juga dari tingginya harga.