Analisis Data Eksploratori 1

Package yang digunakan:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(gap)
## Warning: package 'gap' was built under R version 4.5.2
## Loading required package: gap.datasets
## Warning: package 'gap.datasets' was built under R version 4.5.2
## gap version 1.6
# Read the NYC sub-borough table (CSV) into a tibble.
# NOTE(review): the path is absolute and specific to one machine; point it at
# the local copy of nyc.csv before running this anywhere else.
nyc.csv.path <- "D:/Kuliah/IPB 2025 Semester 3/Analisis Spasial/Praktikum/Bedah Buku Spasial/Bab2. Eksplorasi Data 1/nyc/nyc.csv"
nyc.data <- read_csv(file = nyc.csv.path)
## Rows: 55 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): NAME, SUBBOROUGH
## dbl (32): bor_subb, CODE, FORHIS06, FORHIS07, FORHIS08, FORHIS09, FORWH06, F...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the freshly read table
head(nyc.data)
## # A tibble: 6 × 34
##   bor_subb NAME      CODE SUBBOROUGH FORHIS06 FORHIS07 FORHIS08 FORHIS09 FORWH06
##      <dbl> <chr>    <dbl> <chr>         <dbl>    <dbl>    <dbl>    <dbl>   <dbl>
## 1      501 North S…   501 North Sho…     37.1     34.0    27.4      29.3    13.3
## 2      502 Mid-Isl…   502 Mid-Island     28.0     18.1    24.0      31.2    20.1
## 3      503 South S…   503 South Sho…     10.7     12.1     9.69     14.7    10.3
## 4      401 Astoria    401 Astoria        52.1     54.0    54.7      47.8    38.4
## 5      402 Sunnysi…   402 Sunnyside…     62.7     69.4    67.1      58.3    37.1
## 6      403 Jackson…   403 Jackson H…     68.5     68.5    66.5      69.2    34.4
## # ℹ 25 more variables: FORWH07 <dbl>, FORWH08 <dbl>, FORWH09 <dbl>,
## #   HHSIZ1990 <dbl>, HHSIZ00 <dbl>, HHSIZ02 <dbl>, HHSIZ05 <dbl>,
## #   HHSIZ08 <dbl>, KIDS2000 <dbl>, KIDS2005 <dbl>, KIDS2006 <dbl>,
## #   KIDS2007 <dbl>, KIDS2008 <dbl>, KIDS2009 <dbl>, RENT2002 <dbl>,
## #   RENT2005 <dbl>, RENT2008 <dbl>, RENTPCT02 <dbl>, RENTPCT05 <dbl>,
## #   RENTPCT08 <dbl>, PUBAST90 <dbl>, PUBAST00 <dbl>, YRHOM02 <dbl>,
## #   YRHOM05 <dbl>, YRHOM08 <dbl>
# List the column names as read from the CSV
names(nyc.data)
##  [1] "bor_subb"   "NAME"       "CODE"       "SUBBOROUGH" "FORHIS06"  
##  [6] "FORHIS07"   "FORHIS08"   "FORHIS09"   "FORWH06"    "FORWH07"   
## [11] "FORWH08"    "FORWH09"    "HHSIZ1990"  "HHSIZ00"    "HHSIZ02"   
## [16] "HHSIZ05"    "HHSIZ08"    "KIDS2000"   "KIDS2005"   "KIDS2006"  
## [21] "KIDS2007"   "KIDS2008"   "KIDS2009"   "RENT2002"   "RENT2005"  
## [26] "RENT2008"   "RENTPCT02"  "RENTPCT05"  "RENTPCT08"  "PUBAST90"  
## [31] "PUBAST00"   "YRHOM02"    "YRHOM05"    "YRHOM08"
# Lower-case the three columns used in the plots and regressions
# (rename() takes new_name = old_name pairs).
nyc.data <- rename(
  nyc.data,
  kids2000 = KIDS2000,
  kids2009 = KIDS2009,
  pubast00 = PUBAST00
)
# Check that the three names changed
names(nyc.data)
##  [1] "bor_subb"   "NAME"       "CODE"       "SUBBOROUGH" "FORHIS06"  
##  [6] "FORHIS07"   "FORHIS08"   "FORHIS09"   "FORWH06"    "FORWH07"   
## [11] "FORWH08"    "FORWH09"    "HHSIZ1990"  "HHSIZ00"    "HHSIZ02"   
## [16] "HHSIZ05"    "HHSIZ08"    "kids2000"   "KIDS2005"   "KIDS2006"  
## [21] "KIDS2007"   "KIDS2008"   "kids2009"   "RENT2002"   "RENT2005"  
## [26] "RENT2008"   "RENTPCT02"  "RENTPCT05"  "RENTPCT08"  "PUBAST90"  
## [31] "pubast00"   "YRHOM02"    "YRHOM05"    "YRHOM08"

1. Membuat plot univariat dasar, yaitu histogram dan box plot

Histogram

Kita mulai dengan perintah histogram sederhana. Seperti di buku kerja GeoDa, kita akan menggunakan variabel kids2009.

# Histogram of kids2009 with 7 bins
ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 7)

# Same variable with 5 bins
ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 5)

# Axis labels and title, using the built-in minimal theme
ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 7) +
  labs(
    x = "Percent kids in 2009",
    y = "Frequency",
    title = "Example Histogram"
  ) +
  theme_minimal()

# Same plot with the built-in classic theme
ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 7) +
  labs(
    x = "Percent kids in 2009",
    y = "Frequency",
    title = "Example Histogram"
  ) +
  theme_classic()

# Same plot with the Tufte theme from ggthemes
ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 7) +
  labs(
    x = "Percent kids in 2009",
    y = "Frequency",
    title = "Example Histogram"
  ) +
  theme_tufte()

Menetapkan (sebagian) grafik ke suatu objek

# Keep the data, mapping and histogram layer in an object so the
# decorations can be added to it afterwards.
baseplt <- ggplot(nyc.data, aes(x = kids2009)) +
  geom_histogram(bins = 7)

# Add labels and centre the title (hjust = 0.5)
baseplt +
  labs(
    x = "Percent kids in 2009",
    y = "Frequency",
    title = "Example Histogram"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

Box plot

# Single box plot of kids2009; the empty string on x gives one unlabeled box
box.plt <- ggplot(nyc.data, aes(x = "", y = kids2009)) +
  geom_boxplot()
box.plt

# Pull out the statistics ggplot computed for the box (quartiles, whiskers, ...)
box.dta <- layer_data(box.plt)
box.dta
##     ymin    lower  middle   upper    ymax outliers notchupper notchlower x
## 1 8.6623 26.69425 33.5284 39.6773 48.1308        0    36.2944    30.7624 1
##   flipped_aes PANEL group ymin_final ymax_final  xmin  xmax order xid newx
## 1       FALSE     1     1          0    48.1308 0.625 1.375     1   1    1
##   new_width weight    colour  fill size alpha shape linetype linewidth width
## 1      0.75      1 #333333FF white  1.5    NA    19        1       0.5   0.9
box.desc <- function(box.lyr, mult = 1.5) {
  # Compute the lower and upper fences of a box plot.
  # box.lyr: box plot layer data; its `lower` and `upper` elements are read
  # mult: multiplier applied to the inter-quartile range, default = 1.5
  # Returns c(lower fence, upper fence).
  q1 <- box.lyr$lower
  q3 <- box.lyr$upper
  reach <- mult * (q3 - q1) # distance of each fence beyond its quartile
  c(q1 - reach, q3 + reach)
}

box.desc(box.dta) # lower and upper fences with the default multiplier of 1.5
## [1]  7.219675 59.151875
# Same box plot, but with the whisker multiplier raised to 3 via coef
box.plt3 <- ggplot(nyc.data, aes(x = "", y = kids2009)) +
  geom_boxplot(coef = 3)
box.plt3

# Statistics of the coef = 3 box, for comparison with box.dta
box.dta3 <- layer_data(box.plt3)
box.dta3
##   ymin    lower  middle   upper    ymax outliers notchupper notchlower x
## 1    0 26.69425 33.5284 39.6773 48.1308             36.2944    30.7624 1
##   flipped_aes PANEL group ymin_final ymax_final  xmin  xmax order xid newx
## 1       FALSE     1     1          0    48.1308 0.625 1.375     1   1    1
##   new_width weight    colour  fill size alpha shape linetype linewidth width
## 1      0.75      1 #333333FF white  1.5    NA    19        1       0.5   0.9
# Fences for the coef = 3 box plot, using the matching multiplier
box.desc(box.dta3,mult=3.0)
## [1] -12.25490  78.62645

Dengan pengali 3, pagar bawah menjadi negatif (sekitar -12,25), sehingga nilai 0 tidak lagi tergolong outlier.

# Decorated box plot. Layers are drawn in the order they are added:
# raw points, then the box, then the error-bar whiskers.
# NOTE(review): the box fill is opaque, so points falling inside the box are
# covered by it — confirm this layer order is intended.
base.plt <- ggplot(nyc.data, aes(x = "", y = kids2009))
base.plt +
  geom_point(colour = "blue", alpha = 0.5) +
  geom_boxplot(colour = "black", fill = "purple", outlier.colour = "red") +
  stat_boxplot(geom = "errorbar") + # whiskers drawn as error bars
  labs(x = "", title = "Example Box Plot") +
  theme(plot.title = element_text(hjust = 0.5))

2. Membuat diagram sebaran (Scatter Plot)

# Basic scatter plot: kids2000 on x, pubast00 on y
ggplot(nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_point()

3. Menerapkan berbagai metode penghalusan dalam diagram sebaran (linier, loess, dan lowess)

Penghalus linier

# Scatter plot with a linear (lm) fit; the confidence band is left on
ggplot(nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Linear Smoother") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

Loess smoother

# Loess fit with the default span; confidence band switched off
ggplot(nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_point() +
  geom_smooth(method = "loess", colour = "blue", se = FALSE) +
  labs(title = "Loess Smoother") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

# Span 0.4; the curve is added first, so the points are drawn over it
ggplot(nyc.data, aes(x = kids2000, y = pubast00)) +
  stat_smooth(method = "loess", span = 0.4, colour = "blue", se = FALSE) +
  geom_point() +
  labs(title = "Loess Smoother - Span=0.4") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

# Span 0.2
ggplot(nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_smooth(method = "loess", span = 0.2, colour = "blue", se = FALSE) +
  geom_point() +
  labs(title = "Loess Smoother - Span=0.2") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'

LOWESS smoother

# LOWESS curves computed with stats::lowess().
# geom_smooth() with `method` left unset falls back to loess on a data set
# this small, so it would only repeat the loess plots under a LOWESS title.
# lowess() returns the fitted curve as list(x, y) sorted by x; it is turned
# into a tibble whose columns carry the plot's own variable names so the
# geom_line() layer can inherit the x/y mapping.

# lowess() does not accept missing values, so drop incomplete pairs first
nyc.xy <- nyc.data %>% drop_na(kids2000, pubast00)

# Default smoother span (f = 2/3)
lowess.default <- lowess(nyc.xy$kids2000, nyc.xy$pubast00) %>%
  as_tibble() %>%
  rename(kids2000 = x, pubast00 = y)

ggplot(data = nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_line(data = lowess.default, color = "blue", linewidth = 1) +
  geom_point() +
  ggtitle("LOWESS Smoother") +
  theme(plot.title = element_text(hjust = 0.5))

# Narrower span: f is the fraction of points used for each local fit
lowess.f04 <- lowess(nyc.xy$kids2000, nyc.xy$pubast00, f = 0.4) %>%
  as_tibble() %>%
  rename(kids2000 = x, pubast00 = y)

ggplot(data = nyc.data, aes(x = kids2000, y = pubast00)) +
  geom_line(data = lowess.f04, color = "blue", linewidth = 1) +
  geom_point() +
  ggtitle("LOWESS Smoother - Span = 0.4") +
  theme(plot.title = element_text(hjust = 0.5))

Menyatukan semuanya

# Overlay three smoothers on one scatter plot; the constant colour aesthetics
# ("Lowess", "Linear", "Loess") become the legend entries.
# The Lowess curve comes from stats::lowess(): a geom_smooth(method = "loess")
# layer labelled "Lowess" would just duplicate the Loess curve.

# lowess() does not accept missing values, so drop incomplete pairs first
nyc.xy <- nyc.data %>% drop_na(kids2000, pubast00)

# Fitted curve as a tibble with the plot's own variable names, so the
# geom_line() layer inherits the x/y mapping
lowess.fit <- lowess(nyc.xy$kids2000, nyc.xy$pubast00) %>%
  as_tibble() %>%
  rename(kids2000 = x, pubast00 = y)

ggplot(data = nyc.data, aes(x = kids2000, y = pubast00)) +

  # Lowess curve (default smoother span f = 2/3)
  geom_line(data = lowess.fit, aes(color = "Lowess"), linewidth = 1) +

  # Linear regression
  geom_smooth(aes(color = "Linear"), method = "lm", se = FALSE) +

  # Loess, dashed to tell it apart from the Lowess curve
  geom_smooth(aes(color = "Loess"), method = "loess", se = FALSE, linetype = "dashed") +

  # Scatter points
  geom_point() +

  ggtitle("Comparison of Smoothing Methods") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(color = "Method")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

5. Menguji kekonstanan kemiringan regresi (uji Chow)

# Split the sub-boroughs into the "Select" and "Rest" groups and fit the same
# regression of pubast00 on kids2000 in each.
#
# The filters need a `manbronx` column, which the CSV does not contain, so it
# is created here when it is missing.
# NOTE(review): reconstructed rule — assumes the hundreds digit of bor_subb is
# the borough code, with 1 = Bronx and 3 = Manhattan forming the "Select"
# group. It should reproduce the 20/35 split reported by dim(); confirm
# against the data dictionary.
if (!"manbronx" %in% names(nyc.data)) {
  nyc.data <- nyc.data %>%
    mutate(
      manbronx = if_else((bor_subb %/% 100) %in% c(1, 3), "Select", "Rest")
    )
}

nyc.select <- nyc.data %>% filter(manbronx == "Select")
nyc.rest <- nyc.data %>% filter(manbronx == "Rest")
dim(nyc.select)
## [1] 20 35
dim(nyc.rest)
## [1] 35 35

# Separate simple regressions for the two groups
reg.select <- lm(pubast00 ~ kids2000, data = nyc.select)
reg.rest <- lm(pubast00 ~ kids2000, data = nyc.rest)
summary(reg.select)
## 
## Call:
## lm(formula = pubast00 ~ kids2000, data = nyc.select)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0486 -1.4829  0.3248  2.0625  4.7156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.74763    1.84198  -2.577    0.019 *  
## kids2000     0.47225    0.05071   9.313 2.64e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.406 on 18 degrees of freedom
## Multiple R-squared:  0.8281, Adjusted R-squared:  0.8186 
## F-statistic: 86.73 on 1 and 18 DF,  p-value: 2.639e-08
# Regression summary for the "Rest" group, to compare with the "Select" group
summary(reg.rest)
## 
## Call:
## lm(formula = pubast00 ~ kids2000, data = nyc.rest)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4415 -2.8231 -0.3905  1.9686  8.2359 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.01398    3.33123  -2.106 0.042939 *  
## kids2000     0.37260    0.08648   4.308 0.000139 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.957 on 33 degrees of freedom
## Multiple R-squared:   0.36,  Adjusted R-squared:  0.3406 
## F-statistic: 18.56 on 1 and 33 DF,  p-value: 0.0001391
# Chow test (gap package): response and regressor of the "Select" group
# first, then those of the "Rest" group.
chow <- chow.test(
  y1 = nyc.select$pubast00, x1 = nyc.select$kids2000,
  y2 = nyc.rest$pubast00, x2 = nyc.rest$kids2000
)
chow
##      F value        d.f.1        d.f.2      P value 
## 1.534013e+01 2.000000e+00 5.100000e+01 6.082099e-06

Reference:

https://spatialanalysis.github.io/handsonspatialdata/exploratory-data-analysis-1.html