Cheat Sheet

Cek tipe data

x <- "woi"
str(data)
## function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"), 
##     envir = .GlobalEnv, overwrite = TRUE)
typeof(x)
## [1] "character"
class(x)
## [1] "character"

Eksplorasi & Pembersihan Data : Data Preparation

# Contoh baca data
data <- read.csv("data_uts.csv", header = TRUE)

#Cek dan Ringkas Data
head(data)
##    nama usia kelas nilai tahun2022 tahun2023 kategori
## 1  Andi   19     A    80        75        85    Sains
## 2  Budi   20     B    70        65        72   Sosial
## 3 Citra   21     A    90        88        92    Sains
## 4  Dedi   22     B    65        60        67   Bisnis
## 5   Evi   20     C    85        80        88    Sains
## 6 Farah   19     C    75        73        77   Sosial
summary(data)
##      nama                usia          kelas               nilai      
##  Length:10          Min.   :19.00   Length:10          Min.   :60.00  
##  Class :character   1st Qu.:20.00   Class :character   1st Qu.:70.00  
##  Mode  :character   Median :20.50   Mode  :character   Median :77.50  
##                     Mean   :20.70                      Mean   :77.80  
##                     3rd Qu.:21.75                      3rd Qu.:87.25  
##                     Max.   :23.00                      Max.   :95.00  
##    tahun2022       tahun2023       kategori        
##  Min.   :58.00   Min.   :63.00   Length:10         
##  1st Qu.:65.75   1st Qu.:72.25   Class :character  
##  Median :74.00   Median :81.00   Mode  :character  
##  Mean   :74.30   Mean   :80.40                     
##  3rd Qu.:83.00   3rd Qu.:89.50                     
##  Max.   :92.00   Max.   :97.00
dim(data)
## [1] 10  7
#Menangani Missing Values
is.na(data)
##        nama  usia kelas nilai tahun2022 tahun2023 kategori
##  [1,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [2,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [3,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [4,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [5,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [6,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [7,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [8,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
##  [9,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
## [10,] FALSE FALSE FALSE FALSE     FALSE     FALSE    FALSE
sum(is.na(data))
## [1] 0
data <- na.omit(data)

#Mengganti Nilai
data$nama[data$nama == 'Andi' ] <- 'budi'

#Mengubah Tipe Data
# data$kategori <- as.numeric(data$kategori)

Eksplorasi & Pembersihan Data : Data Wrangling di R

#Pakai paket dplyr dan tidyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Filter, Select, Arrange
data %>% filter(usia > 20)
##    nama usia kelas nilai tahun2022 tahun2023 kategori
## 1 Citra   21     A    90        88        92    Sains
## 2  Dedi   22     B    65        60        67   Bisnis
## 3 Galih   23     A    60        58        63   Bisnis
## 4  Hana   22     B    95        92        97    Sains
## 5 Indra   21     C    70        68        73   Sosial
data %>% select(nama, usia)
##     nama usia
## 1   budi   19
## 2   Budi   20
## 3  Citra   21
## 4   Dedi   22
## 5    Evi   20
## 6  Farah   19
## 7  Galih   23
## 8   Hana   22
## 9  Indra   21
## 10  Joko   20
data %>% arrange(desc(usia))
##     nama usia kelas nilai tahun2022 tahun2023 kategori
## 1  Galih   23     A    60        58        63   Bisnis
## 2   Dedi   22     B    65        60        67   Bisnis
## 3   Hana   22     B    95        92        97    Sains
## 4  Citra   21     A    90        88        92    Sains
## 5  Indra   21     C    70        68        73   Sosial
## 6   Budi   20     B    70        65        72   Sosial
## 7    Evi   20     C    85        80        88    Sains
## 8   Joko   20     A    88        84        90    Sains
## 9   budi   19     A    80        75        85    Sains
## 10 Farah   19     C    75        73        77   Sosial
#Mutate & Summarize
data %>% mutate(usia2 = usia^2)
##     nama usia kelas nilai tahun2022 tahun2023 kategori usia2
## 1   budi   19     A    80        75        85    Sains   361
## 2   Budi   20     B    70        65        72   Sosial   400
## 3  Citra   21     A    90        88        92    Sains   441
## 4   Dedi   22     B    65        60        67   Bisnis   484
## 5    Evi   20     C    85        80        88    Sains   400
## 6  Farah   19     C    75        73        77   Sosial   361
## 7  Galih   23     A    60        58        63   Bisnis   529
## 8   Hana   22     B    95        92        97    Sains   484
## 9  Indra   21     C    70        68        73   Sosial   441
## 10  Joko   20     A    88        84        90    Sains   400
data %>% group_by(kelas) %>% summarize(rata = mean(nilai))
## # A tibble: 3 × 2
##   kelas  rata
##   <chr> <dbl>
## 1 A      79.5
## 2 B      76.7
## 3 C      76.7
#Reshape Data
# pivot_longer(data, cols = starts_with("tahun"), names_to = "Tahun", values_to = "Nilai")
# pivot_wider(data, names_from = kategori, values_from = nilai)

Visualisasi Data Dasar

library(ggplot2)

# Scatter plot
ggplot(data, aes(x=usia, y=nilai)) + geom_point()

# Histogram
ggplot(data, aes(x=nilai)) + geom_histogram(binwidth=5, fill="blue")

# Bar chart
ggplot(data, aes(x=kelas, fill=kelas)) + geom_bar()

# Boxplot
ggplot(data, aes(x=kelas, y=nilai)) + geom_boxplot()

Pengenalan Machine Learning

#Split Data
set.seed(123)
index <- sample(1:nrow(iris), 0.8*nrow(iris))
train <- iris[index, ]
test <- iris[-index, ]

model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data=train)
summary(model)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82577 -0.21712  0.02843  0.18999  0.85864 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.89882    0.28429   6.679 8.74e-10 ***
## Sepal.Width   0.63637    0.07606   8.367 1.52e-13 ***
## Petal.Length  0.69826    0.06208  11.247  < 2e-16 ***
## Petal.Width  -0.53303    0.13964  -3.817 0.000218 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3258 on 116 degrees of freedom
## Multiple R-squared:  0.856,  Adjusted R-squared:  0.8523 
## F-statistic: 229.9 on 3 and 116 DF,  p-value: < 2.2e-16
#Predict
pred <- predict(model, newdata=test)

#Evaluasi
mse <- mean((test$Sepal.Length - pred)^2)
print(mse)
## [1] 0.07225017
#Klasifikasi, Decicion Tree
library(rpart)
tree <- rpart(Species ~ ., data=train, method="class")
pred <- predict(tree, test, type="class")

Latihan data preparation, data wrangling, dan visualisasi menggunakan data ‘International football results from 1872 to 2025’

Link datasets: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017 Ini adalah latihan project mandiri untuk persiapan UTS program pendidikan pengantar sains data angkatan 2025 Universitas Negeri Semarang

Setup

Import library dan dataset yang dibutuhkan.

library(dplyr); library(lubridate); library(readr)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
rawdf <- read_csv("results.csv")
## Rows: 48532 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): home_team, away_team, tournament, city, country
## dbl  (2): home_score, away_score
## lgl  (1): neutral
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data preperation

Task:

  • Parse date ke tipe tanggal.
  • Buat total_goals = home_score + away_score, goal_diff = home_score - away_score.
  • Buat result ∈ {home_win,draw,away_win}.
  • Cek duplikat exact row (jika ada), dan laporkan jumlahnya.

Ubah kolom date menjadi tipe data date, buat kolom total_goal, goal_diff, dan result

df <- rawdf |>
  mutate(
    date = ymd(date),
    total_goals = home_score + away_score,
    goal_diff = home_score - away_score,
    result = case_when(
      home_score > away_score ~ "home_win",
      away_score > home_score ~ "away_win",
      TRUE ~ "draw"
    )
  )
head(df, n = 5)
## # A tibble: 5 × 12
##   date       home_team away_team home_score away_score tournament city   country
##   <date>     <chr>     <chr>          <dbl>      <dbl> <chr>      <chr>  <chr>  
## 1 1872-11-30 Scotland  England            0          0 Friendly   Glasg… Scotla…
## 2 1873-03-08 England   Scotland           4          2 Friendly   London England
## 3 1874-03-07 Scotland  England            2          1 Friendly   Glasg… Scotla…
## 4 1875-03-06 England   Scotland           2          2 Friendly   London England
## 5 1876-03-04 Scotland  England            3          0 Friendly   Glasg… Scotla…
## # ℹ 4 more variables: neutral <lgl>, total_goals <dbl>, goal_diff <dbl>,
## #   result <chr>

Outlier handling menggunakan batas atas interquartile IQR

iqr_val <- IQR(df$total_goals)
q3 <- quantile(df$total_goals, 0.75)
upper = q3 + 1.5 * iqr_val

df <- df |>
  mutate(is_outlier = total_goals > upper)
head(df, n = 5)
## # A tibble: 5 × 13
##   date       home_team away_team home_score away_score tournament city   country
##   <date>     <chr>     <chr>          <dbl>      <dbl> <chr>      <chr>  <chr>  
## 1 1872-11-30 Scotland  England            0          0 Friendly   Glasg… Scotla…
## 2 1873-03-08 England   Scotland           4          2 Friendly   London England
## 3 1874-03-07 Scotland  England            2          1 Friendly   Glasg… Scotla…
## 4 1875-03-06 England   Scotland           2          2 Friendly   London England
## 5 1876-03-04 Scotland  England            3          0 Friendly   Glasg… Scotla…
## # ℹ 5 more variables: neutral <lgl>, total_goals <dbl>, goal_diff <dbl>,
## #   result <chr>, is_outlier <lgl>

Cek outlier

normal <- df |>
  filter(is_outlier == FALSE)
nrow(normal)
## [1] 47710
outlier <- df |>
  filter(is_outlier == TRUE)
nrow(outlier)
## [1] 822

Cek duplikasi

dup_n <- df |>
  distinct() |>
  nrow() |>
  {\(x) nrow(df) - x}()
dup_n
## [1] 0

data wrangling

Task:

  • Bentuk tabel tim dengan metrik: matches, wins, draws, losses, goals_for, goals_against, goal_diff, win_rate.
  • Gabungkan kontribusi home & away secara fair.
  • Filter hanya tim dengan matches ≥ 100.
  • Tampilkan 15 tim dengan win_rate tertinggi.

Bentuk tabel tim dengan metrik: matches, wins, draws, losses, goals_for, goals_against, goal_diff, win_rate. Dan gabungkan kontribusi home & away secara fair.

home <- df |> transmute(
  nama = home_team,
  goal_for = home_score,
  goal_against = away_score,
  diff_score = goal_diff,
  result = case_when(
    home_score > away_score ~ "W",
    home_score < away_score ~ "L",
    TRUE ~ "D"
  )
)

away <- df |> transmute(
  nama = away_team,
  goal_for = away_score,
  goal_against = home_score,
  diff_score = goal_diff,
  result = case_when(
    home_score < away_score ~ "W",
    home_score > away_score ~ "L",
    TRUE ~ "D"
  )
)

#Gabungkan kontribusi home & away secara fair
semua <- bind_rows(home, away)

Filter tim yang hanya memiliki pertandingan >= 100 dan Tampilkan 15 tim dengan win_rate tertinggi.

all_time <- semua |>
  summarise(
    .by = nama,
    match = n(),
    win = sum(result == "W"), lose = sum(result == "L"), draw = sum(result == "D"),
    goal_for = sum(goal_for), goal_against = sum(goal_against)
  ) |>
  mutate(
    winrate = win/match,
    goal_diff = goal_for - goal_against
  )

all_time_match_100 <- filter(all_time, match >= 100)

show_all_time_wr <- all_time_match_100 |>
  arrange(desc(winrate)) |>
  head(n = 15)
show_all_time_wr
## # A tibble: 15 × 9
##    nama          match   win  lose  draw goal_for goal_against winrate goal_diff
##    <chr>         <int> <int> <int> <int>    <dbl>        <dbl>   <dbl>     <dbl>
##  1 Jersey          235   153    62    20      645          285   0.651       360
##  2 Brazil         1051   667   169   215     2283          947   0.635      1336
##  3 Guernsey        240   145    70    25      616          313   0.604       303
##  4 Spain           775   456   140   179     1583          699   0.588       884
##  5 Germany        1023   591   219   213     2295         1193   0.578      1102
##  6 England        1082   619   206   257     2364         1039   0.572      1325
##  7 Iran            604   345   117   142     1141          478   0.571       663
##  8 Argentina      1059   583   219   257     2000         1072   0.551       928
##  9 Tahiti          242   131    80    31      657          371   0.541       286
## 10 South Korea     999   533   214   252     1781          904   0.534       877
## 11 Italy           885   471   172   242     1551          870   0.532       681
## 12 Czech Republ…   352   187    96    69      624          360   0.531       264
## 13 Croatia         388   204    81   103      674          390   0.526       284
## 14 Netherlands     871   448   227   196     1823         1071   0.514       752
## 15 Russia          740   380   165   195     1286          728   0.514       558

Home Advantage vs Neutral Venue

Task

  • Definisikan home-at-home = country == home_team & neutral == FALSE
  • Hitung rata-rata goal_diff saat:
    1. home-at-home,
    2. non-home (home main di negara lain),
    3. venue netral (neutral == TRUE).
  • Bandingkan juga win_rate home di ketiga kondisi.
  • Visualisasikan perbandingan (bar chart).
library(ggplot2)

df2 <- df |>
  mutate(goal_diff = home_score - away_score,
         bucket = case_when(
           neutral ~ "Neutral",
           country == home_team ~ "Home-at-home",
           TRUE ~ "Home-away"
         ),
         home_win = home_score > away_score)

summ <- df2 |>
  summarise(.by = bucket,
            avg_goal_diff = mean(goal_diff),
            home_win_rate = mean(home_win))

ggplot(summ, aes(x=bucket, y=avg_goal_diff)) +
  geom_col() +
  labs(title="Rata-rata Goal Diff (Home) per Tipe Venue", x=NULL, y="Avg goal diff")