Tugas Visdat Praktikum 10

Library

library(ggplot2)            
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("ggplot2")
library("sf")

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library("rnaturalearth")
library("rnaturalearthdata")

## 
## Attaching package: 'rnaturalearthdata'
## 
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(sf)
library(ggspatial)
library(dplyr)
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(ggcorrplot)
library(ggforce)
library(readxl)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Data

data <- read_excel("/Users/user/Downloads/Documents/Anreg /Tugas Akhir/DATASET ANREG2.xlsx")
data[] <- lapply(data, as.numeric)

## Warning in lapply(data, as.numeric): NAs introduced by coercion

data <- subset(data, select = -c(Wilayah))
data

## # A tibble: 27 × 11
##     IPKM melek miskin rumah_sakit kepadatan  lama pengangguran     upah  PDRB
##    <dbl> <dbl>  <dbl>       <dbl>     <dbl> <dbl>        <dbl>    <dbl> <dbl>
##  1  79.5  98.4   475.          30      1861  8.34        10.6  4217206  30173
##  2  79.3  99.5   186.           9       674  7.11         7.77 3125445. 17953
##  3  77.8  99.2   247.           5       700  7.2          8.41 2699814. 13590
##  4  83.1  99.5   259.          15      2136  9.08         6.98 3241930. 23782
##  5  79.8  99.6   277.           7       847  7.83         7.6  1975221. 15991
##  6  76.8  99.0   194.           2       705  7.73         4.17 2326772. 13829
##  7  80.5  98.6    94            6       782  8            3.75 1897867. 19169
##  8  83.1  98.0   140.          12      1003  7.88         9.81 1908102. 15426
##  9  80.7  94.8   266.          12      2150  7.4          8.11 2279983. 15342
## 10  78.1  98.2   147.           5      1004  7.49         4.16 2027619. 18197
## # ℹ 17 more rows
## # ℹ 2 more variables: sekolah <dbl>, sampah <dbl>

data2 <- read_csv("/Users/user/Downloads/Documents/Visdat 📊/BBCA.JK-2.csv")

## Rows: 61 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (6): Open, High, Low, Close, Adj Close, Volume
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(data2)

## spc_tbl_ [61 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date     : Date[1:61], format: "2019-05-01" "2019-06-01" ...
##  $ Open     : num [1:61] 5750 5820 5995 6190 6100 ...
##  $ High     : num [1:61] 5895 6190 6290 6270 6105 ...
##  $ Low      : num [1:61] 5140 5790 5865 5765 5780 ...
##  $ Close    : num [1:61] 5820 5995 6190 6100 6070 ...
##  $ Adj Close: num [1:61] 5229 5386 5562 5481 5454 ...
##  $ Volume   : num [1:61] 1.70e+09 1.18e+09 1.05e+09 1.53e+09 1.26e+09 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_date(format = ""),
##   ..   Open = col_double(),
##   ..   High = col_double(),
##   ..   Low = col_double(),
##   ..   Close = col_double(),
##   ..   `Adj Close` = col_double(),
##   ..   Volume = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

dataset <- read_excel("/Users/user/Downloads/Kurs Transaksi USD  .xlsx")
str(dataset)

## tibble [4,996 × 5] (S3: tbl_df/tbl/data.frame)
##  $ NO       : num [1:4996] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Nilai    : num [1:4996] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Kurs Jual: num [1:4996] 16283 16357 16330 16303 16289 ...
##  $ Kurs Beli: num [1:4996] 16121 16195 16168 16141 16127 ...
##  $ Tanggal  : chr [1:4996] "5/3/2024 12:00:00 AM" "5/2/2024 12:00:00 AM" "4/30/2024 12:00:00 AM" "4/29/2024 12:00:00 AM" ...

Data Peubah Numerik

Korelasi

plot_X1 <- ggplot(data, aes(x = melek, y = IPKM)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x = "Angka Melek Huruf", y = "IPKM")

plot_X2 <- ggplot(data, aes(x = miskin, y = IPKM)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x = "Jumlah Penduduk Miskin", y = "IPKM")

plot_X3 <- ggplot(data, aes(x = rumah_sakit, y = IPKM)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x = "Jumlah Rumah Sakit", y = "IPKM")

plot_X4 <- ggplot(data, aes(x = kepadatan, y = IPKM)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  labs(x = "Kepadatan Penduduk", y = "IPKM")

# Menggabungkan scatter plot ke dalam satu layout
grid.arrange(plot_X1, plot_X2, plot_X3, plot_X4, nrow = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Terlihat bahwa angka melek huruf, kepadatan penduduk, dan jumlah rumah sakit menunjukkan hubungan linear positif yang menandakan bahwa semakin tinggi angka melek huruf dan kepadatan penduduk, serta semakin banyak jumlah rumah sakit, maka nilai IPKM cenderung meningkat. Sebaliknya, jumlah penduduk miskin menunjukkan hubungan linear negatif yang menandakan bahwa semakin sedikit jumlah penduduk miskin, maka nilai IPKM cenderung menurun.

data$kategori <- ifelse(data$kepadatan >= 2500, "padat penduduk", "tidak padat penduduk")

ggplot(data, aes(x = sampah, y = IPKM, color = kategori)) +
  geom_point() +
  labs(title = "Scatter Plot Tingkatan IPKM vs. jumlah sampah ditangani", x = "Jumlah Sampah Ditangani", y = "IPKM", color = "Kategori") +
  theme_minimal()

Scatter plot tersebut menggambarkan bahwa wilayah-wilayah dengan kepadatan penduduk tinggi cenderung memiliki nilai IPKM yang lebih tinggi dan menangani jumlah sampah yang lebih besar dibandingkan dengan wilayah yang memiliki kepadatan penduduk rendah. Dari gambar tersebut juga bisa diindikasikan bahwa jumlah sampah ditangani memiliki hubungan linear positif dengan IPKM, yang berarti semakin banyak sampah yang ditangani maka nilai IPKM juga akan meningkat.

Matrix Plot

data_numerik <- select_if(data, is.numeric)
str(data_numerik)

## tibble [27 × 11] (S3: tbl_df/tbl/data.frame)
##  $ IPKM        : num [1:27] 79.5 79.3 77.8 83.1 79.8 ...
##  $ melek       : num [1:27] 98.4 99.5 99.2 99.5 99.6 ...
##  $ miskin      : num [1:27] 475 186 247 259 277 ...
##  $ rumah_sakit : num [1:27] 30 9 5 15 7 2 6 12 12 5 ...
##  $ kepadatan   : num [1:27] 1861 674 700 2136 847 ...
##  $ lama        : num [1:27] 8.34 7.11 7.2 9.08 7.83 7.73 8 7.88 7.4 7.49 ...
##  $ pengangguran: num [1:27] 10.64 7.77 8.41 6.98 7.6 ...
##  $ upah        : num [1:27] 4217206 3125445 2699814 3241930 1975221 ...
##  $ PDRB        : num [1:27] 30173 17953 13590 23782 15991 ...
##  $ sekolah     : num [1:27] 6513 5012 4398 422 3983 ...
##  $ sampah      : num [1:27] 397 352 457 560 330 ...

data_melt <- cor(data_numerik[sapply(data_numerik,is.numeric)])

data_melt <- melt(data_melt) 

ggplot(data_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  labs(title = "Correlation Heatmap",
       x = "Variable 1",
       y = "Variable 2")

ggplot(data_melt, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-1,1), name="Korelasi") +
  labs(title = "Corellogram") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

data_corrl <- round(cor(data_numerik), 1)

data_corrl <- cor(data_corrl)
ggcorrplot(data_corrl)

ggcorrplot(data_corrl, method = "circle")

Dari gambar-gambar di atas, dapat disimpulkan bahwa terdapat indikasi multikolinieritas antara beberapa variabel X, seperti antara kepadatan penduduk dan rata-rata lama sekolah. Korelasi yang cukup besar antara kedua variabel ini, yang ditunjukkan dengan warna yang lebih gelap atau lebih terang dalam plot, menandakan adanya hubungan yang signifikan di antara keduanya. Oleh karena itu, dapat disimpulkan bahwa kepadatan penduduk dan rata-rata lama sekolah saling mempengaruhi karena adanya korelasi yang signifikan.

Piecewise Constant

mod_tangga = lm(Sepal.Length ~ cut(Petal.Length,3),data=iris)
summary(mod_tangga)

## 
## Call:
## lm(formula = Sepal.Length ~ cut(Petal.Length, 3), data = iris)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0111 -0.3111 -0.0060  0.2889  1.2261 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.00600    0.06913  72.409   <2e-16 ***
## cut(Petal.Length, 3)(2.97,4.93]  0.90511    0.09594   9.434   <2e-16 ***
## cut(Petal.Length, 3)(4.93,6.91]  1.66791    0.09987  16.700   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4889 on 147 degrees of freedom
## Multiple R-squared:  0.6562, Adjusted R-squared:  0.6515 
## F-statistic: 140.3 on 2 and 147 DF,  p-value: < 2.2e-16

ggplot(iris,aes(x=Petal.Length, y=Sepal.Length)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,3), 
               lty = 1, col = "red",se = F)+
  theme_bw()

Gambar tersebut menunjukkan scatter plot yang menggambarkan hubungan antara petal length dan sepal length. Garis merah yang tergambar mewakili fungsi berpotongan atau fungsi langkah untuk memodelkan atau menginterpretasi distribusi titik-titik data. Terdapat tiga kelompok titik abu-abu yang berbeda, menunjukkan adanya tiga kelompok bunga dengan karakteristik yang berbeda berdasarkan panjang kelopak dan kelopak daunnya.Teridentifikasi juga bahwa petal length dan sepal length memiliki hubungan linear positif yang berarti semakin panjang kelopak bunga maka semakin panjang pula kelompak daun.

ggplot(iris, aes(x = Petal.Length, y = Sepal.Length)) +
  geom_point(color = "blue", size = 3, alpha = 0.6) +
  geom_smooth(method = "loess", color = "red", linetype = "dashed", size = 1.5) +
  labs(
    x = "Petal Length", 
    y = "Sepal Length", 
    title = "LOESS Visualization of Petal Length vs. Sepal Length",
    subtitle = "Smoothed scatterplot with LOESS curve",
    caption = "Data Iris"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

Gambar tersebut menampilkan scatter plot yang telah disesuaikan dengan kurva LOESS, yang menggambarkan hubungan antara kedua variabel dengan lebih halus. Kurva ini mengidentifikasi korelasi positif antar kedua variabel. Hal ini mengindikasikan hubungan yang sejalan antara dua variabel tersebut, yang diperkuat oleh hasil smoothing menggunakan metode LOESS, yang menyajikan pola hubungan dengan lebih halus dan kontinu.

Data Time Series

Scatter Plot Time Series

data2 <- head(data2, -1)
ggplot(data2, aes(x =Date, y = Open)) +
  geom_point() +
  labs(title = "Scatter Plot of Time Series Data (BBCA)",
       x = "Date",
       y = "Value (open)")

ggplot(data2, aes(x =Date, y = Open)) +
  geom_point() +
  geom_line() + 
  labs(title = "Scatter Plot of Time Series Data (BBCA)",
       x = "Date",
       y = "Value (open)")

ggplot(data2, aes(x =Date, y = Open)) +
  geom_line() +
  labs(title = "Scatter Plot of Time Series Data (BBCA)",
       x = "Date",
       y = "Value (open)")

calculate_moving_average <- function(data2, window_size) {
  ma_values <- zoo::rollmean(data2$Open, k = window_size, align = "right", fill = NA)
  ma_values_padded <- c(rep(NA, window_size - 1), ma_values)
  data2$ma <- ma_values_padded[1:nrow(data2)]  # Extract only the necessary number of rows
  return(data2)
}

window_size <- 3
data2 <- calculate_moving_average(data2, window_size)

# Create the plot
ggplot(data2, aes(x = Date)) +
  geom_line(aes(y = Open), color = "white", size = 1) +  
  geom_line(aes(y = ma), color = "red", linetype = "dashed", size = 1) +  # emulusan
  geom_ribbon(aes(ymin = -Inf, ymax = ma), fill = "red", alpha = 0.2) +  # Area under curve
  labs(title = paste("BBCA data with Moving Average (Window Size:", window_size, ")"),
       x = "Date",
       y = "Value (open)") +
  theme_minimal()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

data_open <- data2$Open

data.ts <- ts(data_open)

plot(data.ts, xlab ="Waktu", ylab = "Data Open BBCA", col="red", main = "Plot Data Open BBCA")
points(data.ts)

Grafik-grafik tersebut menggambarkan pola kenaikan dan penurunan harga saham BBCA seiring waktu. Terlihat bahwa saham BBCA mengalami fluktuasi harga, namun secara keseluruhan mengalami tren kenaikan yang signifikan. Titik-titik puncak pada grafik menunjukkan periode-periode di mana harga saham mencapai level tertinggi, yang terlihat menurun pada akhir tahun 2020, 2021, dan mendekati pertengahan tahun 2022. Di sisi lain, terlihat bahwa saham BBCA mengalami peningkatan yang cukup konsisten pada pertengahan tahun 2021, awal hingga pertengahan tahun 2022, serta akhir tahun 2023 hingga saat ini. Hal ini mencerminkan adanya pola tren kenaikan yang berkelanjutan dalam jangka waktu tersebut.

Plot Perbandingan Time Series

data1 <- read_csv("/Users/user/Downloads/Documents/Visdat 📊/BBCA.JK-2.csv")

## Rows: 61 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (6): Open, High, Low, Close, Adj Close, Volume
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data2 <- read_csv("/Users/user/Downloads/Documents/Visdat 📊/BBNI.JK-2.csv")

## Rows: 61 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (6): Open, High, Low, Close, Adj Close, Volume
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data1 <- head(data1, -1)
data2 <- head(data2, -1)
# Memplot kedua seri waktu
ggplot() +
  geom_line(data = data1, aes(x = Date, y = Open, color = "BBCA")) +
  geom_line(data = data2, aes(x = Date, y = Open, color = "BBNI")) +
  labs(title = "Perbandingan Dua Data Time Series (BBCA vs. BBNI)",
       x = "Date",
       y = "Value",
       color = "Series") +
  scale_color_manual(values = c("BBCA" = "blue", "BBNI" = "red")) +
  theme_minimal()

Grafik di atas mengilustrasikan perbandingan harga saham pembukaan (open) antara BBCA dan BBNI. Meskipun pola grafik keduanya hampir serupa, terlihat bahwa harga saham BBCA selalu berada di atas harga saham BBNI sepanjang rentang waktu dari awal tahun 2019 hingga awal tahun 2024. Hal ini menunjukkan dominasi BBCA atas BBNI dalam hal nilai harga saham pembukaannya selama periode yang diamati.

Data Spasial

world <- ne_countries(scale = "medium", returnclass = "sf")
ggplot(data = world) +
    geom_sf()

ggplot(data = world) +
    geom_sf(aes(fill = pop_est)) +
    scale_fill_viridis_c(option = "volcano")

## Warning in viridisLite::viridis(n, alpha, begin, end, direction, option):
## Option 'volcano' does not exist. Defaulting to 'viridis'.

spasial <- read.csv("/Users/user/Downloads/world_population.csv")
world_map <- map_data("world")
colnames(spasial)[colnames(spasial) == "Country.Territory"] <- "region"
merged_data <- left_join(world_map, spasial, by = c("region" = "region"))
ggplot(merged_data, aes(x = long, y = lat, group = group, fill = `X2022.Population`)) +
  geom_polygon(color = "black") +
  scale_fill_gradient(name = "Population (2022)", low = "pink", high = "red", guide = "legend") +
  theme_void() +
  labs(title = "World Population by Country/Territory (2022)")

Dari kumpulan map chart yang disajikan di atas, terlihat distribusi populasi global pada tahun 2022. Warna kuning dan merah menunjukkan negara-negara dengan populasi yang tinggi, sementara warna biru dan pink menandakan populasi yang lebih rendah. Dari peta tersebut, terlihat dengan jelas bahwa China dan India memiliki populasi yang sangat besar. Namun, pada gambar ketiga, terdapat negara dengan warna abu-abu yang menandakan adanya ketidaksesuaian dalam format nama negara dengan data yang seharusnya.