Tugas STA1232-Analisis Eksplorasi Data, IPB University

1 Input data

1.1 Susenas KP 4.3

library(readr)
df43 <- read_csv("C:/Users/Muhammad Hafiz F/Downloads/2023 Maret JABAR - SUSENAS KP BP 4.3.csv")
## New names:
## Rows: 25890 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (20): ...1, URUT, R101, R102, R105, R301, FOOD, NONFOOD, EXPEND, KAPITA,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(df43)
## # A tibble: 6 × 20
##    ...1   URUT  R101  R102  R105  R301     FOOD NONFOOD EXPEND KAPITA KALORI_KAP
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>   <dbl>  <dbl>  <dbl>      <dbl>
## 1     0 500001    32     7     2     4 2660400   2.30e6 4.96e6 1.24e6      2365.
## 2     1 500002    32    72     1     2 1108714.  5.25e5 1.63e6 8.17e5      2612.
## 3     2 500003    32     6     2     3 2413886.  1.40e6 3.81e6 1.27e6      2527.
## 4     3 500004    32    72     1     7 7770000   4.31e6 1.21e7 1.73e6      3656.
## 5     4 500005    32    77     1     3 4932557.  4.62e7 5.12e7 1.71e7      2331.
## 6     5 500006    32    77     1     2 3272143.  1.14e7 1.47e7 7.34e6      2331.
## # ℹ 9 more variables: PROTE_KAP <dbl>, LEMAK_KAP <dbl>, KARBO_KAP <dbl>,
## #   WERT <dbl>, WEIND <dbl>, PSU <dbl>, SSU <dbl>, WI1 <dbl>, WI2 <dbl>

1.2 Susenas KP 4.2

df42 <- read_csv("C:/Users/Muhammad Hafiz F/Downloads/2023 Maret JABAR - SUSENAS KP BP 4.2.csv")
## New names:
## Rows: 1051815 Columns: 24
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): COICOP dbl (23): ...1, URUT, R101, R102, R105, R301, KODE, KLP, B42K3,
## B42K4, B42K5...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(df42)
## # A tibble: 6 × 24
##    ...1   URUT  R101  R102  R105  R301  KODE   KLP COICOP   B42K3  B42K4  B42K5
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>    <dbl>  <dbl>  <dbl>
## 1     0 500001    32     7     2     4   198     0 <NA>        0  965000 664000
## 2     1 500001    32     7     2     4   199     2 <NA>        1       0      0
## 3     2 500001    32     7     2     4   200   198 04221000    0  758000      0
## 4     3 500001    32     7     2     4   204   198 04310000    0       0 474000
## 5     4 500001    32     7     2     4   205     1 04510000  116.      0      0
## 6     5 500001    32     7     2     4   206   198 04510000    0   75000      0
## # ℹ 12 more variables: B42K3A <dbl>, B42K3B <dbl>, B42K3C <dbl>, B42K3D <dbl>,
## #   B42K3E <dbl>, SEBULAN <dbl>, WERT <dbl>, WEIND <dbl>, PSU <dbl>, SSU <dbl>,
## #   WI1 <dbl>, WI2 <dbl>

1.3 Susenas KOR

dfKor <- read_csv("C:/Users/Muhammad Hafiz F/Downloads/2023 Maret JABAR - SUSENAS KOR Rumah Tangga.csv")
## New names:
## Rows: 25890 Columns: 199
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): R2002_A, R2002_B, R2002_C, R2002_D, R2204C_A, R2204C_B, R2204C_C,... dbl
## (180): ...1, URUT, PSU, SSU, WI1, WI2, R101, R102, R105, NUINFORT, R1701... lgl
## (1): R2204C_F
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(dfKor)
## # A tibble: 6 × 199
##    ...1   URUT   PSU    SSU   WI1    WI2  R101  R102  R105 NUINFORT R1701 R1702
##   <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>
## 1     0 500001 12448 123442 12435 123427    32     7     2        2     5     5
## 2     1 500002 31373 311039 31360 311024    32    72     1        1     1     1
## 3     2 500003 12092 119908 12079 119893    32     6     2        2     5     5
## 4     3 500004 31135 308689 31122 308674    32    72     1        2     5     5
## 5     4 500005 33988 336798 33975 336783    32    77     1        1     5     5
## 6     5 500006 34062 337531 34049 337516    32    77     1        1     5     5
## # ℹ 187 more variables: R1703 <dbl>, R1704 <dbl>, R1705 <dbl>, R1706 <dbl>,
## #   R1707 <dbl>, R1708 <dbl>, NUINFORT1 <dbl>, R1801 <dbl>, R1802 <dbl>,
## #   R1803 <dbl>, R1804 <dbl>, R1805 <dbl>, R1806 <dbl>, R1807 <dbl>,
## #   R1808 <dbl>, R1809A <dbl>, R1809B <dbl>, R1809C <dbl>, R1809D <dbl>,
## #   R1809E <dbl>, R1810A <dbl>, R1810B <dbl>, R1811A <dbl>, R1811B <dbl>,
## #   R1812 <dbl>, R1813A <dbl>, R1813B <dbl>, R1813C <dbl>, R1813D <dbl>,
## #   R1813E <dbl>, R1814A <dbl>, R1814B <dbl>, R1815A <dbl>, R1815B <dbl>, …

2 Data cleaning and formatting

2.1 Susenas KP 4.3

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df43 <- df43 %>% 
  filter(R101 == 32, R102 == 71) %>% 
  select(URUT, R101, R102, FOOD, NONFOOD, EXPEND)

head(df43)
## # A tibble: 6 × 6
##     URUT  R101  R102     FOOD   NONFOOD    EXPEND
##    <dbl> <dbl> <dbl>    <dbl>     <dbl>     <dbl>
## 1 500039    32    71 2770286.  2305333.  5075619.
## 2 500064    32    71  262714.   467083.   729798.
## 3 500165    32    71 9556029. 19716167. 29272195.
## 4 500198    32    71 1819971.  1340583.  3160555.
## 5 500202    32    71 1496143.   958667.  2454810.
## 6 500237    32    71 2315571.  4581667.  6897238.
df43 <- df43 %>% mutate(FOODPerc = FOOD / EXPEND)

head(df43)
## # A tibble: 6 × 7
##     URUT  R101  R102     FOOD   NONFOOD    EXPEND FOODPerc
##    <dbl> <dbl> <dbl>    <dbl>     <dbl>     <dbl>    <dbl>
## 1 500039    32    71 2770286.  2305333.  5075619.    0.546
## 2 500064    32    71  262714.   467083.   729798.    0.360
## 3 500165    32    71 9556029. 19716167. 29272195.    0.326
## 4 500198    32    71 1819971.  1340583.  3160555.    0.576
## 5 500202    32    71 1496143.   958667.  2454810.    0.609
## 6 500237    32    71 2315571.  4581667.  6897238.    0.336

2.2 Susenas KP 4.2

df42 <- df42 %>% 
  filter(R101 == 32, R102 == 71) %>% 
  select(URUT, R101, R102, B42K4)

head(df42)
## # A tibble: 6 × 4
##     URUT  R101  R102   B42K4
##    <dbl> <dbl> <dbl>   <dbl>
## 1 500039    32    71 1594000
## 2 500039    32    71       0
## 3 500039    32    71  731000
## 4 500039    32    71       0
## 5 500039    32    71  147000
## 6 500039    32    71       0
df42 <- df42 %>%
  group_by(URUT) %>%
  mutate(Sum_B42K4 = sum(B42K4, na.rm = TRUE)) %>%
  ungroup() %>%
  select(-B42K4) %>% 
  distinct()

head(df42)
## # A tibble: 6 × 4
##     URUT  R101  R102 Sum_B42K4
##    <dbl> <dbl> <dbl>     <dbl>
## 1 500039    32    71   3498000
## 2 500064    32    71    842000
## 3 500165    32    71  16912000
## 4 500198    32    71   2264000
## 5 500202    32    71   1488000
## 6 500237    32    71   5564000

2.3 Susenas KOR

dfKor <- dfKor %>% 
  filter(R101 == 32, R102 == 71) %>% 
  select(URUT, R101, R102, R2001K)

head(dfKor)
## # A tibble: 6 × 4
##     URUT  R101  R102 R2001K
##    <dbl> <dbl> <dbl>  <dbl>
## 1 500039    32    71      5
## 2 500064    32    71      5
## 3 500165    32    71      1
## 4 500198    32    71      5
## 5 500202    32    71      5
## 6 500237    32    71      1

3 Data merging

df <- df43 %>% 
  select(URUT, FOODPerc) %>% 
  left_join(df42 %>% select(URUT, Sum_B42K4), by = "URUT") %>% 
  left_join(dfKor %>% select(URUT, R2001K), by = "URUT")

head(df)
## # A tibble: 6 × 4
##     URUT FOODPerc Sum_B42K4 R2001K
##    <dbl>    <dbl>     <dbl>  <dbl>
## 1 500039    0.546   3498000      5
## 2 500064    0.360    842000      5
## 3 500165    0.326  16912000      1
## 4 500198    0.576   2264000      5
## 5 500202    0.609   1488000      5
## 6 500237    0.336   5564000      1

4 Data summary

4.1 Function to find mean, stdev, skewness, and kurtosis

library(moments)

stat_summary <- function(data, column) {
  if (!(column %in% colnames(data))) {
    stop("Column not found in dataframe")
  }
  
  col_data <- data[[column]]
  
  result <- list(
    Mean = mean(col_data, na.rm = TRUE),
    Standard_Deviation = sd(col_data, na.rm = TRUE),
    Skewness = skewness(col_data, na.rm = TRUE),
    Kurtosis = kurtosis(col_data, na.rm = TRUE)
  )
  
  return(result)
}

4.2 Statistical summary of total (nonfood) household expense within the last month

stat_summary(df, "Sum_B42K4")
## $Mean
## [1] 8169824
## 
## $Standard_Deviation
## [1] 20935096
## 
## $Skewness
## [1] 14.69717
## 
## $Kurtosis
## [1] 305.7992

4.3 Statistical summary of the propotion of household expense used for food

stat_summary(df, "FOODPerc")
## $Mean
## [1] 0.4859775
## 
## $Standard_Deviation
## [1] 0.1473592
## 
## $Skewness
## [1] -0.5708026
## 
## $Kurtosis
## [1] 2.914345

4.4 Statistical summary of total (nonfood) household expense within the last month, separated between car owners and non-car owners

4.4.1 Car owners

stat_summary(df %>% filter(R2001K == 1), "Sum_B42K4")
## $Mean
## [1] 27291186
## 
## $Standard_Deviation
## [1] 43264527
## 
## $Skewness
## [1] 7.852352
## 
## $Kurtosis
## [1] 78.93799

4.4.2 Non-car owners

stat_summary(df %>% filter(R2001K == 5), "Sum_B42K4")
## $Mean
## [1] 3974540
## 
## $Standard_Deviation
## [1] 5302518
## 
## $Skewness
## [1] 10.30801
## 
## $Kurtosis
## [1] 164.7758