This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Example chunk: print a per-column summary of the `cars` data set.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Load the retail workbook and preview its first six rows.
online_retail <- read_xlsx(path = "online_retail.xlsx")
head(online_retail, n = 6L)
## # A tibble: 6 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEAR… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEARTS… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLAG… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE … 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NES… 2 2010-12-01 08:26:00 7.65
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
library(readxl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the raw workbook into `data_raw`.
data_raw <- "online_retail.xlsx" %>%
  read_xlsx()
# Report how many rows were loaded before any cleaning.
cat("Data berhasil dimuat. jumlah baris awal:", dim(data_raw)[1], "\n")
## Data berhasil dimuat. jumlah baris awal: 541909
# Build the cleaned dataset from the raw rows:
#   * add TotalPrice = Quantity * UnitPrice,
#   * keep rows that have a CustomerID,
#   * drop rows whose InvoiceNo starts with "C"
#     (presumably cancellations -- confirm against the data dictionary),
#   * keep strictly positive Quantity and UnitPrice,
#   * keep rows whose TotalPrice is below 1000.
online_retail_clean <- data_raw %>%
  mutate(TotalPrice = Quantity * UnitPrice) %>%
  filter(
    !is.na(CustomerID),
    !grepl("^C", InvoiceNo),
    Quantity > 0,
    UnitPrice > 0,
    TotalPrice < 1000
  )
# Report the number of rows that survived the cleaning.
cat(
  "Data setelah cleaning dan engineering. Jumlah baris akhir:",
  nrow(online_retail_clean),
  "\n"
)
## Data setelah cleaning dan engineering. Jumlah baris akhir: 397568
# (Re)derive the line total so this step works whether or not the column is
# already present on `online_retail_clean`.
online_retail_clean <- mutate(
  online_retail_clean,
  TotalPrice = Quantity * UnitPrice
)
# Keep only the two modelling variables: X = Quantity, Y = TotalPrice.
data_model <- select(online_retail_clean, X = Quantity, Y = TotalPrice)
cat("\nVariabel X (Independen): Quantity\n")
##
## Variabel X (Independen): Quantity
cat("Variabel Y (Dependen): TotalPrice\n")
## Variabel Y (Dependen): TotalPrice
# Fit the linear model of Y on X over the modelling data.
model_regresi <- lm(formula = Y ~ X, data = data_model)
cat("\n--- RINGKASAN MODEL REGRESI LINEAR ---\n")
##
## --- RINGKASAN MODEL REGRESI LINEAR ---
# Print the model summary (residuals, coefficients, fit statistics).
model_regresi %>%
  summary() %>%
  print()
##
## Call:
## lm(formula = Y ~ X, data = data_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2225.85 -10.08 -5.13 2.10 896.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.064902 0.057192 193.5 <2e-16 ***
## X 0.755162 0.001566 482.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.02 on 397566 degrees of freedom
## Multiple R-squared: 0.369, Adjusted R-squared: 0.369
## F-statistic: 2.325e+05 on 1 and 397566 DF, p-value: < 2.2e-16
# Scatter plot of Y against X with the fitted straight line (method = "lm")
# and its standard-error band (se = TRUE).
plot_regresi <- ggplot(data = data_model, mapping = aes(x = X, y = Y)) +
  geom_point(colour = "#007BFF", alpha = 0.3) +
  geom_smooth(method = "lm", colour = "#DC3545", se = TRUE) +
  labs(
    title = "Regresi Linear: Total Harga vs. Kuantitas Barang",
    x = "Kuantitas Barang (Quantity) - Variabel X",
    y = "Total Harga (TotalPrice) - Variabel Y",
    caption = "Data dibatasi TotalPrice < 1000 GBP untuk visualisasi"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))
# Render the plot explicitly.
print(plot_regresi)
## `geom_smooth()` using formula = 'y ~ x'
# Extract the fitted coefficients (named "(Intercept)" and "X").
koefisien <- coef(model_regresi)
cat("\nPersamaan Regresi (Y = intercept + slope * X):\n")
##
## Persamaan Regresi (Y = intercept + slope * X):
# Print the fitted equation with both coefficients shown to two decimals.
cat(
  sprintf(
    "TotalPrice = %.2f + %.2f * Quantity\n",
    koefisien[["(Intercept)"]],
    koefisien[["X"]]
  )
)
## TotalPrice = 11.06 + 0.76 * Quantity
# Action 1: restart from the raw data and keep only rows with a CustomerID.
online_retail_clean <- filter(data_raw, !is.na(CustomerID))
# Rows removed = raw row count minus remaining row count.
cat(
  " - Aksi 1: Hapus NA di CustomerID. Baris terhapus:",
  nrow(data_raw) - nrow(online_retail_clean),
  "\n"
)
## - Aksi 1: Hapus NA di CustomerID. Baris terhapus: 135080
cat(" - Sisa baris setelah Aksi 1:", nrow(online_retail_clean), "\n\n")
## - Sisa baris setelah Aksi 1: 406829
cat("3. Perbaikan Struktur Data (Tipe Data):\n")
## 3. Perbaikan Struktur Data (Tipe Data):
# Report the current class of the two columns.
# NOTE(review): the messages say the types were "changed", but no conversion
# is performed in the visible code before this point -- confirm whether a
# conversion step is missing or the wording should be adjusted.
cat(
  " - InvoiceDate diubah menjadi Tipe Data: ",
  class(online_retail_clean[["InvoiceDate"]]),
  "\n"
)
## - InvoiceDate diubah menjadi Tipe Data: POSIXct POSIXt
cat(
  " - CustomerID diubah menjadi Tipe Data: ",
  class(online_retail_clean[["CustomerID"]]),
  "\n\n"
)
## - CustomerID diubah menjadi Tipe Data: numeric
# Action 4: keep only rows where both Quantity and UnitPrice are positive.
online_retail_clean <- filter(
  online_retail_clean,
  Quantity > 0 & UnitPrice > 0
)
cat(
  " - Aksi 4: Hapus Quantity <= 0 dan UnitPrice <= 0. Sisa baris:",
  nrow(online_retail_clean),
  "\n\n"
)
## - Aksi 4: Hapus Quantity <= 0 dan UnitPrice <= 0. Sisa baris: 397884
# Final dataset: the cleaned rows plus the line total
# (TotalPrice = Quantity * UnitPrice).
online_retail_final <- online_retail_clean %>%
  mutate(TotalPrice = Quantity * UnitPrice)
cat("5. Ringkasan Proses Cleaning:\n")
## 5. Ringkasan Proses Cleaning:
cat(" - Total baris awal:", nrow(data_raw), "\n")
## - Total baris awal: 541909
cat(" - Total baris akhir:", nrow(online_retail_final), "\n")
## - Total baris akhir: 397884
cat(" - Struktur Data Akhir:\n")
## - Struktur Data Akhir:
# Show the structure of the *final* dataset (previously the raw
# `online_retail` table was shown here). glimpse() prints by itself and
# returns its input invisibly, so wrapping it in print() would additionally
# dump the whole tibble; call it bare instead.
glimpse(online_retail_final)
## Rows: 541,909
## Columns: 8
## $ InvoiceNo <chr> "536365", "536365", "536365", "536365", "536365", "536365"…
## $ StockCode <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752", …
## $ Description <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTERN…
## $ Quantity <dbl> 6, 6, 8, 6, 6, 2, 6, 6, 6, 32, 6, 6, 8, 6, 6, 3, 2, 3, 3, …
## $ InvoiceDate <dttm> 2010-12-01 08:26:00, 2010-12-01 08:26:00, 2010-12-01 08:2…
## $ UnitPrice <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 1.69…
## $ CustomerID <dbl> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17…
## $ Country <chr> "United Kingdom", "United Kingdom", "United Kingdom", "Uni…
## # A tibble: 541,909 × 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEA… 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTE… 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEART… 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLA… 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE… 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NE… 2 2010-12-01 08:26:00 7.65
## 7 536365 21730 GLASS STAR FROSTE… 6 2010-12-01 08:26:00 4.25
## 8 536366 22633 HAND WARMER UNION… 6 2010-12-01 08:28:00 1.85
## 9 536366 22632 HAND WARMER RED P… 6 2010-12-01 08:28:00 1.85
## 10 536367 84879 ASSORTED COLOUR B… 32 2010-12-01 08:34:00 1.69
## # ℹ 541,899 more rows
## # ℹ 2 more variables: CustomerID <dbl>, Country <chr>
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## Warning: package 'psych' was built under R version 4.5.2
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
# Read the workbook for the descriptive-statistics section.
data <- read_excel(path = "online_retail.xlsx")
library(dplyr)
library(corrplot)
library(psych)
# Per-column summary of every numeric column.
data %>%
  select(where(is.numeric)) %>%
  summary()
## Quantity UnitPrice CustomerID
## Min. :-80995.000 Min. :-11062.060 Min. :12346
## 1st Qu.: 1.000 1st Qu.: 1.250 1st Qu.:13953
## Median : 3.000 Median : 2.080 Median :15152
## Mean : 9.552 Mean : 4.611 Mean :15288
## 3rd Qu.: 10.000 3rd Qu.: 4.130 3rd Qu.:16791
## Max. : 80995.000 Max. : 38970.000 Max. :18287
## NA's :135080
# Numeric columns only.
# NOTE(review): this selection includes CustomerID, which looks like an
# identifier rather than a measurement -- confirm it belongs in the
# correlation matrix.
numeric_data <- data %>%
  select(where(is.numeric))
# Pairwise correlations computed on rows with no missing value in any column.
cor_matrix <- cor(x = numeric_data, use = "complete.obs")
# Upper-triangle correlation plot drawn with circles.
corrplot(corr = cor_matrix, method = "circle", type = "upper", tl.cex = 0.8)
# Example chunk: print a per-column summary of the `cars` data set.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# Copy the `pressure` data set under a new name and draw its default plot.
pressure2 <- pressure
plot(pressure2)
You can also embed plots, for example:
library(tidyverse)
library(readxl)
# 1. Read the Excel file
online_retail <- read_excel(path = "online_retail.xlsx")

# 2. Add the TotalSales column (Quantity * UnitPrice)
online_retail <- mutate(online_retail, TotalSales = Quantity * UnitPrice)

# 3. Sum TotalSales per StockCode and keep the five largest totals
produk_terlaris <- online_retail %>%
  group_by(StockCode) %>%
  summarise(TotalSales = sum(TotalSales), .groups = "drop") %>%
  arrange(desc(TotalSales)) %>%
  head(n = 5L)

# 4. Horizontal bar chart, bars ordered by total sales
ggplot(
  data = produk_terlaris,
  mapping = aes(x = reorder(StockCode, TotalSales), y = TotalSales)
) +
  geom_col(fill = "maroon") +
  coord_flip() +
  labs(
    title = "5 Produk Terlaris Berdasarkan StockCode",
    x = "StockCode",
    y = "Total Penjualan"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

# List the columns now present (including the added TotalSales).
names(online_retail)
## [1] "InvoiceNo" "StockCode" "Description" "Quantity" "InvoiceDate"
## [6] "UnitPrice" "CustomerID" "Country" "TotalSales"
library(readxl)
library(ggplot2)
# Read the Excel file
online_retail <- read_excel("online_retail.xlsx")

# Month label for every row. The label is looked up in `month.abb` from the
# numeric month (0-11) instead of using format(x, "%b"): "%b" produces month
# names in the current locale, which would not match the English `month.abb`
# levels used below and would turn every non-matching month into NA.
online_retail$bulan <-
  month.abb[as.POSIXlt(online_retail$InvoiceDate)$mon + 1L]

# Count rows per month
tab <- as.data.frame(table(online_retail$bulan))
names(tab) <- c("bulan", "realisasi")

# Order the months Jan-Dec
tab$bulan <- factor(tab$bulan, levels = month.abb)
tab <- tab[order(tab$bulan), ]

# Add target (110% of the mean monthly count), difference and status
tab$target <- round(mean(tab$realisasi) * 1.10)
tab$selisih <- tab$realisasi - tab$target
tab$status <- ifelse(
  tab$realisasi >= tab$target,
  "Tercapai",
  "Belum Tercapai"
)

# Chart: target as bars, actual counts as a line with status-coloured points
ggplot(tab, aes(x = bulan)) +
  geom_col(aes(y = target), fill = "lightblue", alpha = 0.6) +
  geom_line(
    aes(y = realisasi, group = 1),
    color = "darkblue",
    linewidth = 1
  ) +
  geom_point(aes(y = realisasi, color = status), size = 3) +
  scale_color_manual(
    values = c("Tercapai" = "forestgreen", "Belum Tercapai" = "red")
  ) +
  theme_minimal()