int, doublefactor,
characterTRUE, FALSEDate,
POSIXctx <- "woi"
str(data)
## function (..., list = character(), package = NULL, lib.loc = NULL, verbose = getOption("verbose"),
## envir = .GlobalEnv, overwrite = TRUE)
typeof(x)
## [1] "character"
class(x)
## [1] "character"
# Contoh baca data
data <- read.csv("data_uts.csv", header = TRUE)
#Cek dan Ringkas Data
head(data)
## nama usia kelas nilai tahun2022 tahun2023 kategori
## 1 Andi 19 A 80 75 85 Sains
## 2 Budi 20 B 70 65 72 Sosial
## 3 Citra 21 A 90 88 92 Sains
## 4 Dedi 22 B 65 60 67 Bisnis
## 5 Evi 20 C 85 80 88 Sains
## 6 Farah 19 C 75 73 77 Sosial
summary(data)
## nama usia kelas nilai
## Length:10 Min. :19.00 Length:10 Min. :60.00
## Class :character 1st Qu.:20.00 Class :character 1st Qu.:70.00
## Mode :character Median :20.50 Mode :character Median :77.50
## Mean :20.70 Mean :77.80
## 3rd Qu.:21.75 3rd Qu.:87.25
## Max. :23.00 Max. :95.00
## tahun2022 tahun2023 kategori
## Min. :58.00 Min. :63.00 Length:10
## 1st Qu.:65.75 1st Qu.:72.25 Class :character
## Median :74.00 Median :81.00 Mode :character
## Mean :74.30 Mean :80.40
## 3rd Qu.:83.00 3rd Qu.:89.50
## Max. :92.00 Max. :97.00
dim(data)
## [1] 10 7
#Menangani Missing Values
is.na(data)
## nama usia kelas nilai tahun2022 tahun2023 kategori
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [9,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [10,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
sum(is.na(data))
## [1] 0
data <- na.omit(data)
#Mengganti Nilai
data$nama[data$nama == 'Andi' ] <- 'budi'
#Mengubah Tipe Data
# data$kategori <- as.numeric(data$kategori)
#Pakai paket dplyr dan tidyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Filter, Select, Arrange
data %>% filter(usia > 20)
## nama usia kelas nilai tahun2022 tahun2023 kategori
## 1 Citra 21 A 90 88 92 Sains
## 2 Dedi 22 B 65 60 67 Bisnis
## 3 Galih 23 A 60 58 63 Bisnis
## 4 Hana 22 B 95 92 97 Sains
## 5 Indra 21 C 70 68 73 Sosial
data %>% select(nama, usia)
## nama usia
## 1 budi 19
## 2 Budi 20
## 3 Citra 21
## 4 Dedi 22
## 5 Evi 20
## 6 Farah 19
## 7 Galih 23
## 8 Hana 22
## 9 Indra 21
## 10 Joko 20
data %>% arrange(desc(usia))
## nama usia kelas nilai tahun2022 tahun2023 kategori
## 1 Galih 23 A 60 58 63 Bisnis
## 2 Dedi 22 B 65 60 67 Bisnis
## 3 Hana 22 B 95 92 97 Sains
## 4 Citra 21 A 90 88 92 Sains
## 5 Indra 21 C 70 68 73 Sosial
## 6 Budi 20 B 70 65 72 Sosial
## 7 Evi 20 C 85 80 88 Sains
## 8 Joko 20 A 88 84 90 Sains
## 9 budi 19 A 80 75 85 Sains
## 10 Farah 19 C 75 73 77 Sosial
#Mutate & Summarize
data %>% mutate(usia2 = usia^2)
## nama usia kelas nilai tahun2022 tahun2023 kategori usia2
## 1 budi 19 A 80 75 85 Sains 361
## 2 Budi 20 B 70 65 72 Sosial 400
## 3 Citra 21 A 90 88 92 Sains 441
## 4 Dedi 22 B 65 60 67 Bisnis 484
## 5 Evi 20 C 85 80 88 Sains 400
## 6 Farah 19 C 75 73 77 Sosial 361
## 7 Galih 23 A 60 58 63 Bisnis 529
## 8 Hana 22 B 95 92 97 Sains 484
## 9 Indra 21 C 70 68 73 Sosial 441
## 10 Joko 20 A 88 84 90 Sains 400
data %>% group_by(kelas) %>% summarize(rata = mean(nilai))
## # A tibble: 3 × 2
## kelas rata
## <chr> <dbl>
## 1 A 79.5
## 2 B 76.7
## 3 C 76.7
#Reshape Data
# pivot_longer(data, cols = starts_with("tahun"), names_to = "Tahun", values_to = "Nilai")
# pivot_wider(data, names_from = kategori, values_from = nilai)
library(ggplot2)
# Scatter plot
ggplot(data, aes(x=usia, y=nilai)) + geom_point()
# Histogram
ggplot(data, aes(x=nilai)) + geom_histogram(binwidth=5, fill="blue")
# Bar chart
ggplot(data, aes(x=kelas, fill=kelas)) + geom_bar()
# Boxplot
ggplot(data, aes(x=kelas, y=nilai)) + geom_boxplot()
#Split Data
set.seed(123)
index <- sample(1:nrow(iris), 0.8*nrow(iris))
train <- iris[index, ]
test <- iris[-index, ]
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data=train)
summary(model)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82577 -0.21712 0.02843 0.18999 0.85864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.89882 0.28429 6.679 8.74e-10 ***
## Sepal.Width 0.63637 0.07606 8.367 1.52e-13 ***
## Petal.Length 0.69826 0.06208 11.247 < 2e-16 ***
## Petal.Width -0.53303 0.13964 -3.817 0.000218 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3258 on 116 degrees of freedom
## Multiple R-squared: 0.856, Adjusted R-squared: 0.8523
## F-statistic: 229.9 on 3 and 116 DF, p-value: < 2.2e-16
#Predict
pred <- predict(model, newdata=test)
#Evaluasi
mse <- mean((test$Sepal.Length - pred)^2)
print(mse)
## [1] 0.07225017
#Klasifikasi, Decicion Tree
library(rpart)
tree <- rpart(Species ~ ., data=train, method="class")
pred <- predict(tree, test, type="class")
Link datasets: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017 Ini adalah latihan project mandiri untuk persiapan UTS program pendidikan pengantar sains data angkatan 2025 Universitas Negeri Semarang
library(dplyr); library(lubridate); library(readr)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
rawdf <- read_csv("results.csv")
## Rows: 48532 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): home_team, away_team, tournament, city, country
## dbl (2): home_score, away_score
## lgl (1): neutral
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
total_goals = home_score + away_score, goal_diff = home_score - away_score.df <- rawdf |>
mutate(
date = ymd(date),
total_goals = home_score + away_score,
goal_diff = home_score - away_score,
result = case_when(
home_score > away_score ~ "home_win",
away_score > home_score ~ "away_win",
TRUE ~ "draw"
)
)
head(df, n = 5)
## # A tibble: 5 × 12
## date home_team away_team home_score away_score tournament city country
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1872-11-30 Scotland England 0 0 Friendly Glasg… Scotla…
## 2 1873-03-08 England Scotland 4 2 Friendly London England
## 3 1874-03-07 Scotland England 2 1 Friendly Glasg… Scotla…
## 4 1875-03-06 England Scotland 2 2 Friendly London England
## 5 1876-03-04 Scotland England 3 0 Friendly Glasg… Scotla…
## # ℹ 4 more variables: neutral <lgl>, total_goals <dbl>, goal_diff <dbl>,
## # result <chr>
iqr_val <- IQR(df$total_goals)
q3 <- quantile(df$total_goals, 0.75)
upper = q3 + 1.5 * iqr_val
df <- df |>
mutate(is_outlier = total_goals > upper)
head(df, n = 5)
## # A tibble: 5 × 13
## date home_team away_team home_score away_score tournament city country
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1872-11-30 Scotland England 0 0 Friendly Glasg… Scotla…
## 2 1873-03-08 England Scotland 4 2 Friendly London England
## 3 1874-03-07 Scotland England 2 1 Friendly Glasg… Scotla…
## 4 1875-03-06 England Scotland 2 2 Friendly London England
## 5 1876-03-04 Scotland England 3 0 Friendly Glasg… Scotla…
## # ℹ 5 more variables: neutral <lgl>, total_goals <dbl>, goal_diff <dbl>,
## # result <chr>, is_outlier <lgl>
normal <- df |>
filter(is_outlier == FALSE)
nrow(normal)
## [1] 47710
outlier <- df |>
filter(is_outlier == TRUE)
nrow(outlier)
## [1] 822
dup_n <- df |>
distinct() |>
nrow() |>
{\(x) nrow(df) - x}()
dup_n
## [1] 0
matches, wins, draws, losses, goals_for, goals_against, goal_diff, win_rate.home & away
secara fair.matches ≥ 100.home <- df |> transmute(
nama = home_team,
goal_for = home_score,
goal_against = away_score,
diff_score = goal_diff,
result = case_when(
home_score > away_score ~ "W",
home_score < away_score ~ "L",
TRUE ~ "D"
)
)
away <- df |> transmute(
nama = away_team,
goal_for = away_score,
goal_against = home_score,
diff_score = goal_diff,
result = case_when(
home_score < away_score ~ "W",
home_score > away_score ~ "L",
TRUE ~ "D"
)
)
#Gabungkan kontribusi home & away secara fair
semua <- bind_rows(home, away)
all_time <- semua |>
summarise(
.by = nama,
match = n(),
win = sum(result == "W"), lose = sum(result == "L"), draw = sum(result == "D"),
goal_for = sum(goal_for), goal_against = sum(goal_against)
) |>
mutate(
winrate = win/match,
goal_diff = goal_for - goal_against
)
all_time_match_100 <- filter(all_time, match >= 100)
show_all_time_wr <- all_time_match_100 |>
arrange(desc(winrate)) |>
head(n = 15)
show_all_time_wr
## # A tibble: 15 × 9
## nama match win lose draw goal_for goal_against winrate goal_diff
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Jersey 235 153 62 20 645 285 0.651 360
## 2 Brazil 1051 667 169 215 2283 947 0.635 1336
## 3 Guernsey 240 145 70 25 616 313 0.604 303
## 4 Spain 775 456 140 179 1583 699 0.588 884
## 5 Germany 1023 591 219 213 2295 1193 0.578 1102
## 6 England 1082 619 206 257 2364 1039 0.572 1325
## 7 Iran 604 345 117 142 1141 478 0.571 663
## 8 Argentina 1059 583 219 257 2000 1072 0.551 928
## 9 Tahiti 242 131 80 31 657 371 0.541 286
## 10 South Korea 999 533 214 252 1781 904 0.534 877
## 11 Italy 885 471 172 242 1551 870 0.532 681
## 12 Czech Republ… 352 187 96 69 624 360 0.531 264
## 13 Croatia 388 204 81 103 674 390 0.526 284
## 14 Netherlands 871 448 227 196 1823 1071 0.514 752
## 15 Russia 740 380 165 195 1286 728 0.514 558
country == home_team & neutral == FALSEgoal_diff saat:
(neutral == TRUE).win_rate home di ketiga kondisi.library(ggplot2)
df2 <- df |>
mutate(goal_diff = home_score - away_score,
bucket = case_when(
neutral ~ "Neutral",
country == home_team ~ "Home-at-home",
TRUE ~ "Home-away"
),
home_win = home_score > away_score)
summ <- df2 |>
summarise(.by = bucket,
avg_goal_diff = mean(goal_diff),
home_win_rate = mean(home_win))
ggplot(summ, aes(x=bucket, y=avg_goal_diff)) +
geom_col() +
labs(title="Rata-rata Goal Diff (Home) per Tipe Venue", x=NULL, y="Avg goal diff")