Praktikum 8 - Visualisasi Data

Febrian Adhitya Cahya Belardi

Pendahuluan

Package Installation

library(tidyverse) #Include beberapa packages termasuk ggplot
library(dplyr)
library(reshape2)
library(ggcorrplot)

Data

# Read the house-price CSV directly from its GitHub URL. The file is parsed
# as comma-separated with the first row used as column names.
data <- read.csv(
  file = "https://raw.githubusercontent.com/gerrydito/Sains-Data-S2/master/Praktikum/Visualisasi%20Data/house_price.csv",
  header = TRUE,
  sep = ","
)

# Print the structure (column names, types, first values) of the data frame.
str(data)
## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

Visualisasi

Korelasi

# Plain scatter plot of sqft_above (x axis) against sqft_lot (y axis).
# The x-axis label is spelled to match the variable actually plotted.
ggplot(data, aes(x = sqft_above, y = sqft_lot)) +
  geom_point() +
  labs(title = "Scatter Plot", x = "sqft_above", y = "sqft_lot") +
  theme_classic()

# Derive a two-level category from the construction year: houses built in
# 2000 or later are tagged "rumah 2000an", every earlier year is tagged
# "rumah 90an".
data[["kategori_rumah"]] <- ifelse(
  data[["yr_built"]] < 2000,
  "rumah 90an",
  "rumah 2000an"
)

# Scatter plot of sqft_above against sqft_lot, with points coloured by the
# category computed above.
ggplot(
  data = data,
  mapping = aes(x = sqft_above, y = sqft_lot, colour = kategori_rumah)
) +
  geom_point() +
  ggtitle("Scatter Plot Rumah Tahun 2000an dan 90an") +
  xlab("sqft_above") +
  ylab("sqft_lot") +
  labs(colour = "Kategori") +
  theme_minimal()

# Coloured scatter plot of sqft_above against sqft_lot, split into one panel
# per value of kategori_rumah.
ggplot(
  data = data,
  mapping = aes(x = sqft_above, y = sqft_lot, colour = kategori_rumah)
) +
  geom_point() +
  labs(
    title = "Scatter Plot Rumah Tahun 2000an dan 90an",
    x = "sqft_above",
    y = "sqft_lot",
    colour = "Kategori"
  ) +
  facet_wrap(vars(kategori_rumah)) +
  theme_minimal()

Matrix Plot

Heatmap/Correlogram

# Keep only the numeric columns of `data` (character columns are dropped) so
# that a correlation matrix can be computed from the result.
# `select(where(...))` replaces the superseded `select_if()`.
data_numerik <- select(data, where(is.numeric))

# Print the structure of the numeric-only data frame.
str(data_numerik)
## 'data.frame':    4600 obs. of  13 variables:
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
# Compute the pairwise correlation matrix of the numeric columns and reshape
# it to long format (columns Var1, Var2, value) for plotting with geom_tile().
# `data_numerik` already contains only numeric columns, so it is passed to
# cor() directly without re-filtering.
data_melt <- melt(cor(data_numerik))

# Basic heatmap of the long-format correlation table: one tile per variable
# pair, filled by the correlation value using the default colour scale.
ggplot(data = data_melt, mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  ggtitle("Correlation Heatmap") +
  xlab("Variable 1") +
  ylab("Variable 2")

# Correlogram of the long-format correlation table. Tiles are outlined in
# white and filled on a diverging blue-white-red scale centred at 0 and fixed
# to the range [-1, 1], so colours are comparable across plots.
ggplot(data_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1), name = "Korelasi"
  ) +
  labs(title = "Correlogram") +
  theme_minimal() +
  # Rotate the x-axis labels so long variable names do not overlap.
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Correlation matrix of the numeric columns, rounded to one decimal place.
# This matrix is passed to ggcorrplot() as is: calling cor() on it a second
# time would compute correlations between the columns of the correlation
# matrix, not between the variables of the data.
data_corrl <- round(cor(data_numerik), 1)

# Correlation plot with the default (square tile) method.
ggcorrplot(data_corrl)

# Same correlation matrix drawn with circles.
ggcorrplot(data_corrl, method = "circle")

Piecewise Constant (Fungsi Tangga)

# Piecewise-constant (step function) regression: sqft_lot is cut into 3
# equal-width intervals and sqft_above is regressed on the interval factor,
# so each interval gets its own constant fitted value.
mod_tangga <- lm(sqft_above ~ cut(sqft_lot, 3), data = data)

# Print the coefficient table and fit statistics of the step model.
summary(mod_tangga)
## 
## Call:
## lm(formula = sqft_above ~ cut(sqft_lot, 3), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -1457   -637   -237    473   7583 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1827.02      12.72 143.588   <2e-16 ***
## cut(sqft_lot, 3)(3.58e+05,7.16e+05]   154.64     352.28   0.439    0.661    
## cut(sqft_lot, 3)(7.16e+05,1.08e+06]   182.98     862.43   0.212    0.832    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 862.3 on 4597 degrees of freedom
## Multiple R-squared:  5.169e-05,  Adjusted R-squared:  -0.0003834 
## F-statistic: 0.1188 on 2 and 4597 DF,  p-value: 0.888
# Scatter plot with a piecewise-constant fit overlaid: the x variable is cut
# into 3 equal-width intervals and a linear model is fitted on the interval
# factor, drawn as a solid red line without a confidence band.
# NOTE(review): this overlay bins sqft_above (x) to predict sqft_lot (y),
# whereas mod_tangga models sqft_above from binned sqft_lot -- confirm which
# direction is intended.
ggplot(data, aes(x = sqft_above, y = sqft_lot)) +
  geom_point(alpha = 0.55, color = "black") +
  stat_smooth(
    method = "lm",
    formula = y ~ cut(x, 3),
    lty = 1, col = "red", se = FALSE
  ) +
  theme_bw()

Locally Estimated Scatter Plot Smoothing (LOESS)

Non linear Outlier

# Scatter plot of sqft_above against sqft_lot with a LOESS smoother overlaid
# as a dashed red curve.
ggplot(data, aes(x = sqft_above, y = sqft_lot)) +
  geom_point(color = "blue", size = 3, alpha = 0.6) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0.
  geom_smooth(
    method = "loess", color = "red", linetype = "dashed", linewidth = 1.5
  ) +
  labs(
    x = "Sqft Above",
    y = "Sqft Lot",
    title = "LOESS Visualization of Sqft Above vs. Sqft Lot",
    subtitle = "Smoothed scatterplot with LOESS curve",
    caption = "Datanya Pak Geri"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'