Tugas Pertemuan 3: Studi Kasus Preprocessing Data
Dataset: Indonesia Reading Interest 2020-2023 (TGM 2020-2023)
Link: https://www.kaggle.com/datasets/imaditia/indonesia-reading-interest-2020-2023
Pada tugas ini, saya menggunakan dataset Indonesia Reading Interest 2020-2023 yang diambil dari survei Tingkat Kegemaran Membaca (TGM) oleh Perpustakaan Nasional Indonesia. Dataset ini berisi informasi tentang kebiasaan membaca masyarakat di berbagai provinsi di Indonesia dalam kurun waktu 2020-2023.
Dataset ini memiliki beberapa variabel utama, di antaranya: Reading Frequency per week (seberapa sering seseorang membaca dalam seminggu), Number of Readings per Quarter (jumlah bacaan dalam satu kuartal), Daily Reading Duration (in minutes) (durasi membaca harian dalam menit), Internet Access Frequency per Week (seberapa sering seseorang mengakses internet dalam seminggu), Daily Internet Duration (in minutes) (durasi penggunaan internet harian dalam menit), Tingkat Kegemaran Membaca (Reading Interest) (skor tingkat kegemaran membaca), dan Category (kategori berdasarkan skor kegemaran membaca).
Langkah 1: Install dan Import Library
# Import library
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(rlang)
## Warning: package 'rlang' was built under R version 4.3.2
library(janitor)
## Warning: package 'janitor' was built under R version 4.3.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Langkah 2: Load Dataset dengan Penyesuaian
# Read the semicolon-delimited CSV; decimal commas are declared via the locale
data <- read_csv2(
  file = "D:/Dataset/TGM 2020-2023_eng.csv",
  locale = locale(decimal_mark = ",")
)
## Rows: 140 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (4): Provinsi, Internet Access Frequency per Week, Daily Internet Durati...
## dbl (5): Year, Reading Frequency per week, Number of Readings per Quarter, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows (head() is called without n, so it prints 6 rows)
head(data)
## # A tibble: 6 × 9
## Provinsi Year `Reading Frequency per week` `Number of Readings per Quarter`
## <chr> <dbl> <dbl> <dbl>
## 1 Aceh 2020 4 2
## 2 Aceh 2021 5.5 4.5
## 3 Aceh 2022 5 5.5
## 4 Aceh 2023 5 5.5
## 5 Bali 2020 4 2.5
## 6 Bali 2021 5.5 4.5
## # ℹ 5 more variables: `Daily Reading Duration (in minutes)` <dbl>,
## # `Internet Access Frequency per Week` <chr>,
## # `Daily Internet Duration (in minutes)` <chr>,
## # `Tingkat Kegemaran Membaca (Reading Interest)` <dbl>, Category <chr>
# Show per-column summary statistics before imputation
summary(data)
## Provinsi Year Reading Frequency per week
## Length:140 Min. :2020 Min. :3.000
## Class :character 1st Qu.:2021 1st Qu.:4.500
## Mode :character Median :2022 Median :4.750
## Mean :2022 Mean :4.664
## 3rd Qu.:2022 3rd Qu.:5.000
## Max. :2023 Max. :6.000
## Number of Readings per Quarter Daily Reading Duration (in minutes)
## Min. :1.500 Min. : 60.00
## 1st Qu.:2.875 1st Qu.: 90.90
## Median :4.500 Median : 97.20
## Mean :4.221 Mean : 98.22
## 3rd Qu.:5.500 3rd Qu.:106.00
## Max. :7.000 Max. :129.00
## Internet Access Frequency per Week Daily Internet Duration (in minutes)
## Length:140 Length:140
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## Tingkat Kegemaran Membaca (Reading Interest) Category
## Min. :36.01 Length:140
## 1st Qu.:56.40 Class :character
## Median :62.20 Mode :character
## Mean :61.20
## 3rd Qu.:65.75
## Max. :73.27
# Recode the literal string "N/A" as a real missing value (NA).
# Only character columns can hold that string, so only they are touched.
data <- data %>%
  mutate(across(where(is.character), ~ na_if(.x, "N/A")))
# Count missing values per column after the recode
data %>%
  is.na() %>%
  colSums()
## Provinsi
## 0
## Year
## 0
## Reading Frequency per week
## 0
## Number of Readings per Quarter
## 0
## Daily Reading Duration (in minutes)
## 0
## Internet Access Frequency per Week
## 35
## Daily Internet Duration (in minutes)
## 35
## Tingkat Kegemaran Membaca (Reading Interest)
## 0
## Category
## 0
# Normalise column names so they contain no spaces or special characters
data <- clean_names(data)
# Print the cleaned column names to confirm the result
names(data)
## [1] "provinsi"
## [2] "year"
## [3] "reading_frequency_per_week"
## [4] "number_of_readings_per_quarter"
## [5] "daily_reading_duration_in_minutes"
## [6] "internet_access_frequency_per_week"
## [7] "daily_internet_duration_in_minutes"
## [8] "tingkat_kegemaran_membaca_reading_interest"
## [9] "category"
# Inspect column types; the two internet_* columns are still character here
# because their values use a decimal comma (e.g. "5,5")
str(data)
## spc_tbl_ [140 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ provinsi : chr [1:140] "Aceh" "Aceh" "Aceh" "Aceh" ...
## $ year : num [1:140] 2020 2021 2022 2023 2020 ...
## $ reading_frequency_per_week : num [1:140] 4 5.5 5 5 4 5.5 5 5 4.5 5 ...
## $ number_of_readings_per_quarter : num [1:140] 2 4.5 5.5 5.5 2.5 4.5 5.5 5.5 4.5 5.5 ...
## $ daily_reading_duration_in_minutes : num [1:140] 95 103 94.3 95 91 ...
## $ internet_access_frequency_per_week : chr [1:140] NA "5" "5,5" "5,5" ...
## $ daily_internet_duration_in_minutes : chr [1:140] NA "83,9" "123,4" "104" ...
## $ tingkat_kegemaran_membaca_reading_interest: num [1:140] 54.7 64.1 65.8 66.6 57 ...
## $ category : chr [1:140] "Moderate" "High" "High" "Moderate" ...
## - attr(*, "spec")=
## .. cols(
## .. Provinsi = col_character(),
## .. Year = col_double(),
## .. `Reading Frequency per week` = col_double(),
## .. `Number of Readings per Quarter` = col_double(),
## .. `Daily Reading Duration (in minutes)` = col_double(),
## .. `Internet Access Frequency per Week` = col_character(),
## .. `Daily Internet Duration (in minutes)` = col_character(),
## .. `Tingkat Kegemaran Membaca (Reading Interest)` = col_double(),
## .. Category = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Replace the decimal comma with a decimal point in both internet columns,
# then convert them from character to numeric (NA stays NA)
data <- data %>%
  mutate(
    across(
      c(
        internet_access_frequency_per_week,
        daily_internet_duration_in_minutes
      ),
      ~ as.numeric(gsub(",", ".", .x, fixed = TRUE))
    )
  )
# Re-check the column types after conversion
str(data)
## spc_tbl_ [140 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ provinsi : chr [1:140] "Aceh" "Aceh" "Aceh" "Aceh" ...
## $ year : num [1:140] 2020 2021 2022 2023 2020 ...
## $ reading_frequency_per_week : num [1:140] 4 5.5 5 5 4 5.5 5 5 4.5 5 ...
## $ number_of_readings_per_quarter : num [1:140] 2 4.5 5.5 5.5 2.5 4.5 5.5 5.5 4.5 5.5 ...
## $ daily_reading_duration_in_minutes : num [1:140] 95 103 94.3 95 91 ...
## $ internet_access_frequency_per_week : num [1:140] NA 5 5.5 5.5 NA 5.5 5.5 5.5 5.5 5.5 ...
## $ daily_internet_duration_in_minutes : num [1:140] NA 83.9 123.4 104 NA ...
## $ tingkat_kegemaran_membaca_reading_interest: num [1:140] 54.7 64.1 65.8 66.6 57 ...
## $ category : chr [1:140] "Moderate" "High" "High" "Moderate" ...
## - attr(*, "spec")=
## .. cols(
## .. Provinsi = col_character(),
## .. Year = col_double(),
## .. `Reading Frequency per week` = col_double(),
## .. `Number of Readings per Quarter` = col_double(),
## .. `Daily Reading Duration (in minutes)` = col_double(),
## .. `Internet Access Frequency per Week` = col_character(),
## .. `Daily Internet Duration (in minutes)` = col_character(),
## .. `Tingkat Kegemaran Membaca (Reading Interest)` = col_double(),
## .. Category = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Langkah 3: Imputasi menggunakan MICE
# Tabulate the missing-data pattern across all columns before imputing
md.pattern(data)
## provinsi year reading_frequency_per_week number_of_readings_per_quarter
## 105 1 1 1 1
## 35 1 1 1 1
## 0 0 0 0
## daily_reading_duration_in_minutes
## 105 1
## 35 1
## 0
## tingkat_kegemaran_membaca_reading_interest category
## 105 1 1
## 35 1 1
## 0 0
## internet_access_frequency_per_week daily_internet_duration_in_minutes
## 105 1 1 0
## 35 0 0 2
## 35 35 70
# Multiple imputation with predictive mean matching ("pmm"):
# m = 5 imputed datasets, fixed seed so the run is reproducible.
# NOTE(review): the run ends with "Number of logged events: 2"; presumably
# mice adjusted some predictors - inspect imputed_data$loggedEvents to confirm.
imputed_data <- mice(
  data,
  method = "pmm",
  m = 5,
  seed = 777
)
##
## iter imp variable
## 1 1 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 1 2 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 1 3 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 1 4 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 1 5 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 2 1 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 2 2 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 2 3 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 2 4 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 2 5 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 3 1 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 3 2 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 3 3 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 3 4 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 3 5 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 4 1 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 4 2 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 4 3 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 4 4 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 4 5 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 5 1 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 5 2 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 5 3 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 5 4 internet_access_frequency_per_week daily_internet_duration_in_minutes
## 5 5 internet_access_frequency_per_week daily_internet_duration_in_minutes
## Warning: Number of logged events: 2
# Extract one completed dataset from the mids object. action = 1L is the
# default of mice::complete(), i.e. only the first of the 5 imputations is used.
completed_data <- complete(imputed_data, action = 1L)
# Verify that no missing values remain after imputation
completed_data %>%
  is.na() %>%
  colSums()
## provinsi
## 0
## year
## 0
## reading_frequency_per_week
## 0
## number_of_readings_per_quarter
## 0
## daily_reading_duration_in_minutes
## 0
## internet_access_frequency_per_week
## 0
## daily_internet_duration_in_minutes
## 0
## tingkat_kegemaran_membaca_reading_interest
## 0
## category
## 0
# Check whether any missing values remain after imputation
# NOTE(review): this repeats the identical check run right after complete()
colSums(is.na(completed_data))
## provinsi
## 0
## year
## 0
## reading_frequency_per_week
## 0
## number_of_readings_per_quarter
## 0
## daily_reading_duration_in_minutes
## 0
## internet_access_frequency_per_week
## 0
## daily_internet_duration_in_minutes
## 0
## tingkat_kegemaran_membaca_reading_interest
## 0
## category
## 0
Langkah 4: Menangani Outliers dengan Interquartile Range (IQR)
# Winsorize daily reading duration with the 1.5 * IQR rule.
# The treatment is applied to completed_data (the imputed dataset that is
# saved at the end of the script); applying it to `data`, as before, meant the
# outlier handling never reached the saved output.
duration <- completed_data$daily_reading_duration_in_minutes

# Quartiles of the observed durations (names dropped so bounds are plain numbers)
Q1 <- quantile(duration, 0.25, na.rm = TRUE, names = FALSE)
Q3 <- quantile(duration, 0.75, na.rm = TRUE, names = FALSE)
# Named iqr_value rather than IQR so stats::IQR() is not shadowed
iqr_value <- Q3 - Q1

# Lower and upper fences
lower_bound <- Q1 - 1.5 * iqr_value
upper_bound <- Q3 + 1.5 * iqr_value

# Clamp values outside the fences to the nearest fence (vectorised)
completed_data$daily_reading_duration_in_minutes <-
  pmin(pmax(duration, lower_bound), upper_bound)
Langkah 5: Encoding Data Kategorikal
# Encode the categorical column as a factor on completed_data, the imputed
# dataset that is carried forward and saved; encoding `data` had no effect on it.
completed_data$category <- as.factor(completed_data$category)
Langkah 6: Simpan dataset yang sudah cleaned
# Write the imputed dataset to CSV without the row-name column
write.csv(
  x = completed_data,
  file = "D:/Dataset/TGM_2020-2023_imputed.csv",
  row.names = FALSE
)