# ----- Load Packages -----
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(ggplot2)
library(ggcorrplot)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(treemap)
library(flexdashboard)
# Silence every warning for the rest of the session.
# NOTE(review): warn = -1 also hides warnings that could point at data
# problems -- confirm that this is intended.
options(warn = -1)
# ----- Read Dataset -----
# Load the CSV; cells that contain exactly "?" are read as NA.
adult <- read.csv(file = "data_input/adult_full.csv", na.strings = "?")
# Print the structure (column names, types, first values) of the data frame.
str(object = adult)
## 'data.frame': 48842 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: chr "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
# Check whether any value in the data frame is missing.
anyNA(adult)
## [1] TRUE
# ----- Data Preparation -----
# The recorded str() output shows four income levels, including both "<=50K"
# and "<=50K.", i.e. the same label spelled with and without a trailing
# period. Converting the raw column directly therefore splits each income
# group in two in every plot that uses `income`. Strip the trailing period
# first so that each group maps to a single factor level.
# (The str() output recorded after this block was captured before this
# normalisation.)
adult <- adult %>%
  mutate(
    income = sub("\\.$", "", income),
    # Convert the categorical columns used by the plots below to factors.
    across(
      c(
        income, occupation, education, marital.status, race, sex,
        native.country
      ),
      as.factor
    )
  )
str(adult)
## 'data.frame': 48842 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education.num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital.gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital.loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours.per.week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 4 levels "<=50K","<=50K.",..: 1 1 1 1 1 1 1 3 3 3 ...
# ----- **1. Income Demographics Analysis** -----
## ----- Bar Chart: Income Comparison -----
# Count the rows in each income category; each bar is filled by its category.
plot1 <- ggplot(data = adult, mapping = aes(x = income, fill = income)) +
  geom_bar() +
  labs(
    title = "Distribusi Pendapatan",
    x = "Pendapatan ($)",
    y = "Jumlah Orang"
  ) +
  theme_minimal()
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = plot1)
## ----- Age Histogram -----
# Histogram of age using 5-year-wide bins.
plot2 <- ggplot(data = adult, mapping = aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    fill = "steelblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Distribusi Usia",
    x = "Usia (tahun)",
    y = "Jumlah Orang"
  ) +
  theme_minimal()
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = plot2)
Insight Distribusi Pendapatan:
- Lebih banyak orang (24,720 orang) yang memiliki pendapatan kurang dari atau sama dengan $50,000 (<=50K)
- Hanya sekitar 3,846 orang yang memiliki pendapatan lebih dari $50,000 (>50K)
Insight Distribusi Usia:
- Kelompok usia yang paling mendominasi adalah sekitar 35 tahun (histogram memakai rentang 5 tahun per batang)
- Kelompok usia yang paling sedikit jumlahnya adalah sekitar 85 tahun
# ----- **2. Factors Affecting Income** -----
## ----- Correlation Heatmap -----
# Keep only the numeric columns. `select(where(...))` replaces the superseded
# scoped verb `select_if()` and selects the same columns.
num_vars <- adult %>% select(where(is.numeric))
# Correlation matrix computed on the rows that have no missing value in any
# of the numeric columns.
corr_matrix <- cor(num_vars, use = "complete.obs")
# Lower-triangle circle plot with the coefficients printed as labels.
plot3 <- ggcorrplot(corr_matrix, method = "circle", type = "lower", lab = TRUE)
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(plot3)
## ----- Box Plot: Working Hours by Occupation -----
# One box per occupation and income category, showing weekly working hours.
plot4 <- ggplot(adult, aes(x = occupation, y = hours.per.week, fill = income)) +
  geom_boxplot() +
  # Rotate only the x-axis labels (the occupation names) so they do not
  # overlap. The previous `axis.text` setting also tilted the numeric
  # y-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi Jam Kerja berdasarkan Pekerjaan")
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(plot4)
# ----- **Interactive Dashboard** -----
## ----- Treemap: Share of Each Occupation -----
# Drop rows without fnlwgt, then add its base-10 logarithm as the column
# that sizes the treemap tiles.
adult <- adult %>%
  drop_na(fnlwgt) %>%
  mutate(fnlwgt_scaled = log10(fnlwgt))
# One tile per occupation, sized by the summed fnlwgt_scaled of its rows.
plot5 <- treemap(
  dtf = adult,
  index = "occupation",
  vSize = "fnlwgt_scaled",
  title = "Proporsi Jenis Pekerjaan",
  palette = "Set3"
)
# Print the list returned by treemap() (tile table and layout settings).
plot5
## $tm
## occupation vSize vColor stdErr vColorValue level x0
## 1 Adm-clerical 29253.48925 5611 29253.48925 NA 1 0.2667648
## 2 Armed-Forces 79.68566 15 79.68566 NA 1 0.9929390
## 3 Craft-repair 31845.38472 6112 31845.38472 NA 1 0.0000000
## 4 Exec-managerial 31630.18366 6086 31630.18366 NA 1 0.2667648
## 5 Farming-fishing 7629.24218 1490 7629.24218 NA 1 0.8938265
## 6 Handlers-cleaners 10847.65832 2072 10847.65832 NA 1 0.7428635
## 7 Machine-op-inspct 15800.40356 3022 15800.40356 NA 1 0.5208747
## 8 Other-service 25564.62312 4923 25564.62312 NA 1 0.7741842
## 9 Priv-house-serv 1263.93406 242 1263.93406 NA 1 0.8809412
## 10 Prof-specialty 32070.33203 6172 32070.33203 NA 1 0.0000000
## 11 Protective-serv 5148.58453 983 5148.58453 NA 1 0.8809412
## 12 Sales 28677.17651 5504 28677.17651 NA 1 0.5208747
## 13 Tech-support 7529.29055 1446 7529.29055 NA 1 0.7428635
## 14 Transport-moving 12255.81188 2355 12255.81188 NA 1 0.5208747
## y0 w h color
## 1 0.00000000 0.254109934 0.48048168 #8DD3C7
## 2 0.00000000 0.007060988 0.04710158 #FFFFB3
## 3 0.00000000 0.266764763 0.49824028 #BEBADA
## 4 0.48048168 0.254109934 0.51951832 #FB8072
## 5 0.22758902 0.106173489 0.29990666 #80B1D3
## 6 0.22758902 0.150963058 0.29990666 #FDB462
## 7 0.23042623 0.221988756 0.29706946 #B3DE69
## 8 0.52749568 0.225815845 0.47250432 #FCCDE5
## 9 0.00000000 0.111997858 0.04710158 #D9D9D9
## 10 0.49824028 0.266764763 0.50175972 #BC80BD
## 11 0.04710158 0.119058845 0.18048744 #CCEBC5
## 12 0.52749568 0.253309459 0.47250432 #FFED6F
## 13 0.00000000 0.138077701 0.22758902 #8DD3C7
## 14 0.00000000 0.221988756 0.23042623 #FFFFB3
##
## $type
## [1] "index"
##
## $vSize
## [1] "fnlwgt_scaled"
##
## $vColor
## [1] NA
##
## $stdErr
## [1] "fnlwgt_scaled"
##
## $algorithm
## [1] "pivotSize"
##
## $vpCoorX
## [1] 0.02812148 0.97187852
##
## $vpCoorY
## [1] 0.01968504 0.91031496
##
## $aspRatio
## [1] 1.483512
##
## $range
## [1] NA
##
## $mapping
## [1] NA NA NA
##
## $draw
## [1] TRUE
## ----- Scatter Plot: Capital Gain, Working Hours, and Income -----
# One semi-transparent point per row, coloured by income category.
plot6 <- ggplot(
  data = adult,
  mapping = aes(x = capital.gain, y = hours.per.week, color = income)
) +
  geom_point(alpha = 0.6) +
  labs(title = "Hubungan Capital Gain, Jam Kerja, dan Pendapatan") +
  theme_minimal()
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = plot6)
# ----- **Working Hours and Income Analysis** -----
# Remove rows where hours.per.week or income is missing.
adult <- drop_na(adult, hours.per.week, income)
## ----- Line Chart: Working Hours Trend by Income -----
# Density curve of weekly hours per income category, scaled to counts
# through after_stat(count).
plot7 <- ggplot(
  data = adult,
  mapping = aes(x = hours.per.week, y = after_stat(count), color = income)
) +
  geom_density() +
  labs(title = "Tren Jam Kerja terhadap Pendapatan")
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = plot7)
## ----- Violin Plot: Working Hours by Income -----
# One violin per income category showing the spread of weekly working hours.
plot8 <- ggplot(
  data = adult,
  mapping = aes(x = income, y = hours.per.week, fill = income)
) +
  geom_violin() +
  labs(title = "Distribusi Jam Kerja berdasarkan Pendapatan") +
  theme_minimal()
# Turn the static ggplot into an interactive plotly widget.
plotly::ggplotly(p = plot8)