# ----- Load Packages -----
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(ggplot2)
library(ggcorrplot)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(treemap)
library(flexdashboard)
# Silence every warning for the rest of the session (warn = -1).
# NOTE(review): this also hides genuine data-quality warnings -- confirm that
# a global switch is intended.
options(warn = -1)
# ----- Read Dataset -----
# Load the raw CSV, treating the literal "?" as a missing value.
adult <- read.csv(
  file = "data_input/adult_full.csv",
  na.strings = "?"
)
# Show each column's type together with a preview of its first values.
str(adult)
## 'data.frame':    48842 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : chr  "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: chr  "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
##  $ occupation    : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship  : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race          : chr  "White" "White" "White" "Black" ...
##  $ sex           : chr  "Male" "Male" "Male" "Male" ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: chr  "United-States" "United-States" "United-States" "United-States" ...
##  $ income        : chr  "<=50K" "<=50K" "<=50K" "<=50K" ...
# Check whether the data frame contains any missing value at all.
anyNA(adult)
## [1] TRUE
# ----- Data Preparation -----
# Convert the categorical columns used by the charts to factors.
# The raw income labels appear both with and without a trailing period
# (e.g. "<=50K" and "<=50K."), which would otherwise give four factor levels
# for two income classes. The trailing period is stripped before conversion so
# each class maps to a single level.
adult <- adult %>% mutate(
  income = as.factor(sub("\\.$", "", income)),
  occupation = as.factor(occupation),
  education = as.factor(education),
  marital.status = as.factor(marital.status),
  race = as.factor(race),
  sex = as.factor(sex),
  native.country = as.factor(native.country)
)
# Re-inspect the structure to confirm the new column types.
str(adult)
## 'data.frame':    48842 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : chr  "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 4 levels "<=50K","<=50K.",..: 1 1 1 1 1 1 1 3 3 3 ...
# ----- **1. Income Demographics Analysis** -----

## ----- Bar Chart: Income Comparison -----
# One bar per income class (bar height = number of rows), filled by class,
# then converted to an interactive plotly widget.
plot1 <- ggplot(data = adult, mapping = aes(x = income, fill = income)) +
  geom_bar() +
  ggtitle("Distribusi Pendapatan") +
  xlab("Pendapatan ($)") +
  ylab("Jumlah Orang") +
  theme_minimal()
plotly::ggplotly(plot1)
## ----- Age Histogram -----
# Age distribution in 5-year bins, rendered as an interactive plotly widget.
plot2 <- ggplot(data = adult, mapping = aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.7,
    color = "black",
    fill = "steelblue"
  ) +
  ggtitle("Distribusi Usia") +
  xlab("Usia (tahun)") +
  ylab("Jumlah Orang") +
  theme_minimal()
plotly::ggplotly(plot2)

Insight Distribusi Pendapatan:
- Lebih banyak orang (24,720 orang) memiliki pendapatan tidak lebih dari $50,000.
- Hanya sekitar 3,846 orang yang memiliki pendapatan lebih dari $50,000.

Insight Distribusi Usia:
- Kelompok usia yang paling mendominasi adalah sekitar 35 tahun.
- Kelompok usia dengan jumlah paling sedikit adalah sekitar 85 tahun.

# ----- **2. Factors Influencing Income** -----

## ----- Correlation Heatmap -----
# Keep only the numeric columns. select(where(...)) is the current tidyselect
# idiom and replaces the superseded select_if().
num_vars <- adult %>% select(where(is.numeric))
# Pairwise correlation matrix; "complete.obs" drops rows with any missing value
# before computing the coefficients.
corr_matrix <- cor(num_vars, use = "complete.obs")
# Lower-triangle circle plot with the coefficient printed in each cell.
plot3 <- ggcorrplot(corr_matrix, method = "circle", type = "lower", lab = TRUE)
plotly::ggplotly(plot3)
## ----- Box Plot: Working Hours by Occupation -----
# Weekly working hours per occupation, with one box per income class inside
# each occupation, rendered as an interactive plotly widget.
plot4 <- ggplot(adult, aes(x = occupation, y = hours.per.week, fill = income)) +
  geom_boxplot() +
  # Rotate only the x-axis (occupation) labels so they do not overlap;
  # `axis.text` would also tilt the numeric y-axis labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Distribusi Jam Kerja berdasarkan Pekerjaan")
plotly::ggplotly(plot4)
# ----- **Interactive Dashboard** -----
## ----- Treemap: Share of Each Occupation -----
# Drop rows with a missing fnlwgt, then add its base-10 logarithm as the
# variable that sizes the treemap tiles.
adult <- adult %>%
  drop_na(fnlwgt) %>%
  mutate(fnlwgt_scaled = log10(fnlwgt))

# treemap() draws the chart as a side effect of the call. Its return value is
# a list of layout metadata (tile coordinates, colours, settings), so it is
# kept in `plot5` for later use but deliberately not printed: printing it would
# dump that list into the output instead of showing a chart.
plot5 <- treemap(
  adult,
  index = "occupation",
  vSize = "fnlwgt_scaled",
  title = "Proporsi Jenis Pekerjaan",
  palette = "Set3"
)
## $tm
##           occupation       vSize vColor      stdErr vColorValue level        x0
## 1       Adm-clerical 29253.48925   5611 29253.48925          NA     1 0.2667648
## 2       Armed-Forces    79.68566     15    79.68566          NA     1 0.9929390
## 3       Craft-repair 31845.38472   6112 31845.38472          NA     1 0.0000000
## 4    Exec-managerial 31630.18366   6086 31630.18366          NA     1 0.2667648
## 5    Farming-fishing  7629.24218   1490  7629.24218          NA     1 0.8938265
## 6  Handlers-cleaners 10847.65832   2072 10847.65832          NA     1 0.7428635
## 7  Machine-op-inspct 15800.40356   3022 15800.40356          NA     1 0.5208747
## 8      Other-service 25564.62312   4923 25564.62312          NA     1 0.7741842
## 9    Priv-house-serv  1263.93406    242  1263.93406          NA     1 0.8809412
## 10    Prof-specialty 32070.33203   6172 32070.33203          NA     1 0.0000000
## 11   Protective-serv  5148.58453    983  5148.58453          NA     1 0.8809412
## 12             Sales 28677.17651   5504 28677.17651          NA     1 0.5208747
## 13      Tech-support  7529.29055   1446  7529.29055          NA     1 0.7428635
## 14  Transport-moving 12255.81188   2355 12255.81188          NA     1 0.5208747
##            y0           w          h   color
## 1  0.00000000 0.254109934 0.48048168 #8DD3C7
## 2  0.00000000 0.007060988 0.04710158 #FFFFB3
## 3  0.00000000 0.266764763 0.49824028 #BEBADA
## 4  0.48048168 0.254109934 0.51951832 #FB8072
## 5  0.22758902 0.106173489 0.29990666 #80B1D3
## 6  0.22758902 0.150963058 0.29990666 #FDB462
## 7  0.23042623 0.221988756 0.29706946 #B3DE69
## 8  0.52749568 0.225815845 0.47250432 #FCCDE5
## 9  0.00000000 0.111997858 0.04710158 #D9D9D9
## 10 0.49824028 0.266764763 0.50175972 #BC80BD
## 11 0.04710158 0.119058845 0.18048744 #CCEBC5
## 12 0.52749568 0.253309459 0.47250432 #FFED6F
## 13 0.00000000 0.138077701 0.22758902 #8DD3C7
## 14 0.00000000 0.221988756 0.23042623 #FFFFB3
## 
## $type
## [1] "index"
## 
## $vSize
## [1] "fnlwgt_scaled"
## 
## $vColor
## [1] NA
## 
## $stdErr
## [1] "fnlwgt_scaled"
## 
## $algorithm
## [1] "pivotSize"
## 
## $vpCoorX
## [1] 0.02812148 0.97187852
## 
## $vpCoorY
## [1] 0.01968504 0.91031496
## 
## $aspRatio
## [1] 1.483512
## 
## $range
## [1] NA
## 
## $mapping
## [1] NA NA NA
## 
## $draw
## [1] TRUE
## ----- Scatter Plot: Capital Gain, Working Hours, and Income -----
# One semi-transparent point per row, coloured by income class, rendered as an
# interactive plotly widget.
plot6 <- ggplot(
  data = adult,
  mapping = aes(x = capital.gain, y = hours.per.week, color = income)
) +
  geom_point(alpha = 0.6) +
  ggtitle("Hubungan Capital Gain, Jam Kerja, dan Pendapatan") +
  theme_minimal()
plotly::ggplotly(plot6)
# ----- **Working Hours and Income Analysis** -----
# Remove rows where either the weekly hours or the income class is missing.
adult <- drop_na(adult, hours.per.week, income)

## ----- Line Chart: Working-Hours Trend by Income -----
# Density curve of weekly hours per income class, scaled to counts
# (after_stat(count)) rather than to unit area.
plot7 <- ggplot(
  data = adult,
  mapping = aes(x = hours.per.week, y = after_stat(count), color = income)
) +
  geom_density() +
  ggtitle("Tren Jam Kerja terhadap Pendapatan")
plotly::ggplotly(plot7)
## ----- Violin Plot: Working Hours by Income -----
# One violin per income class showing the spread of weekly working hours,
# rendered as an interactive plotly widget.
plot8 <- ggplot(
  data = adult,
  mapping = aes(x = income, y = hours.per.week, fill = income)
) +
  geom_violin() +
  ggtitle("Distribusi Jam Kerja berdasarkan Pendapatan") +
  theme_minimal()
plotly::ggplotly(plot8)