BAB 1 #PART 1
# Deklarasi Data Frame
df <- data.frame(
student = c("Ami", "Budi", "Caca", "Doni", "Edo", "Fani", "Gita", "Hamim", "Ian", "Jacob"),
sex = c("F", "M", "F", "M", "M", "F", "F", "M", "M", "M"),
semester = c(2, 2, 6, 2, 3, 6, 7, 2, 4, 6),
household.income = c(15888, 12790, 4912, 48242, 237505, 5561, 37183, 51135, 43616, 13877),
final.score = c("88", "85", "85", "83", "83", "74", "74", "76", "76", "84"))
head(df, 3)
## student sex semester household.income final.score
## 1 Ami F 2 15888 88
## 2 Budi M 2 12790 85
## 3 Caca F 6 4912 85
tail(df)
## student sex semester household.income final.score
## 5 Edo M 3 237505 83
## 6 Fani F 6 5561 74
## 7 Gita F 7 37183 74
## 8 Hamim M 2 51135 76
## 9 Ian M 4 43616 76
## 10 Jacob M 6 13877 84
head(df, 3)
## student sex semester household.income final.score
## 1 Ami F 2 15888 88
## 2 Budi M 2 12790 85
## 3 Caca F 6 4912 85
tail(df, 8)
## student sex semester household.income final.score
## 3 Caca F 6 4912 85
## 4 Doni M 2 48242 83
## 5 Edo M 3 237505 83
## 6 Fani F 6 5561 74
## 7 Gita F 7 37183 74
## 8 Hamim M 2 51135 76
## 9 Ian M 4 43616 76
## 10 Jacob M 6 13877 84
# Pemotongan Data Frame
df$household.income # keluaran vector
## [1] 15888 12790 4912 48242 237505 5561 37183 51135 43616 13877
df[['household.income']]
## [1] 15888 12790 4912 48242 237505 5561 37183 51135 43616 13877
df['household.income'] # keluaran data.frame
## household.income
## 1 15888
## 2 12790
## 3 4912
## 4 48242
## 5 237505
## 6 5561
## 7 37183
## 8 51135
## 9 43616
## 10 13877
df[c('student', 'household.income')]
## student household.income
## 1 Ami 15888
## 2 Budi 12790
## 3 Caca 4912
## 4 Doni 48242
## 5 Edo 237505
## 6 Fani 5561
## 7 Gita 37183
## 8 Hamim 51135
## 9 Ian 43616
## 10 Jacob 13877
df[3, ] # ambil per baris
## student sex semester household.income final.score
## 3 Caca F 6 4912 85
df[c(3, 5), ]
## student sex semester household.income final.score
## 3 Caca F 6 4912 85
## 5 Edo M 3 237505 83
# Perubahan Tipe Data
df$final.score; class(df$final.score)
## [1] "88" "85" "85" "83" "83" "74" "74" "76" "76" "84"
## [1] "character"
df['final.score'] <- as.numeric(df[['final.score']]) # harus input vector
df$final.score; class(df$final.score)
## [1] 88 85 85 83 83 74 74 76 76 84
## [1] "numeric"
df$sex; class(df$sex)
## [1] "F" "M" "F" "M" "M" "F" "F" "M" "M" "M"
## [1] "character"
df['sex'] <- factor(df[['sex']], levels = c("F", "M", "NB"),
labels = c("Female", "Male", "Non-Binary"),
ordered = F)
df$sex; class(df$sex)
## [1] Female Male Female Male Male Female Female Male Male Male
## Levels: Female Male Non-Binary
## [1] "factor"
df$semester; class(df$semester)
## [1] 2 2 6 2 3 6 7 2 4 6
## [1] "numeric"
df['semester'] <- factor(df$semester, levels = 1:14, ordered = T)
df$semester; class(df$semester)
## [1] 2 2 6 2 3 6 7 2 4 6
## Levels: 1 < 2 < 3 < 4 < 5 < 6 < 7 < 8 < 9 < 10 < 11 < 12 < 13 < 14
## [1] "ordered" "factor"
# Struktur Data
str(df)
## 'data.frame': 10 obs. of 5 variables:
## $ student : chr "Ami" "Budi" "Caca" "Doni" ...
## $ sex : Factor w/ 3 levels "Female","Male",..: 1 2 1 2 2 1 1 2 2 2
## $ semester : Ord.factor w/ 14 levels "1"<"2"<"3"<"4"<..: 2 2 6 2 3 6 7 2 4 6
## $ household.income: num 15888 12790 4912 48242 237505 ...
## $ final.score : num 88 85 85 83 83 74 74 76 76 84
# Ringkasan Data
summary(df)
## student sex semester household.income final.score
## Length:10 Female :4 2 :4 Min. : 4912 Min. :74.00
## Class :character Male :6 6 :3 1st Qu.: 13062 1st Qu.:76.00
## Mode :character Non-Binary:0 3 :1 Median : 26536 Median :83.00
## 4 :1 Mean : 47071 Mean :80.80
## 7 :1 3rd Qu.: 47086 3rd Qu.:84.75
## 1 :0 Max. :237505 Max. :88.00
## (Other):0
# Fungsi Apply
length(df)
## [1] 5
sapply(df, length)
## student sex semester household.income
## 10 10 10 10
## final.score
## 10
sapply(df[c('household.income', 'final.score')], sum)
## household.income final.score
## 470709 808
sapply(df[c('household.income', 'final.score')], mean)
## household.income final.score
## 47070.9 80.8
sapply(df[c('household.income', 'final.score')], sd)
## household.income final.score
## 69235.969079 5.223877
levels(df$sex)
## [1] "Female" "Male" "Non-Binary"
levels(df$semester)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
sapply(df[c('sex', 'semester')], levels)
## $sex
## [1] "Female" "Male" "Non-Binary"
##
## $semester
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
# Contoh Akhir
q.df <- sapply(df[c('household.income', 'final.score')], quantile,
probs= c(.25, .75))
q.df
## household.income final.score
## 25% 13061.75 76.00
## 75% 47085.50 84.75
q.df["75%", ] - q.df["25%", ] # to find IQR
## household.income final.score
## 34023.75 8.75
#PART 2
library(MASS)
df.boston <- Boston
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Pipeline
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
x <- c(1, 3, 2)
x %>% mean(); mean(x)
## [1] 2
## [1] 2
x %>% quantile(c(.25, .50, .75)); quantile(x, c(.25, .50, .75))
## 25% 50% 75%
## 1.5 2.0 2.5
## 25% 50% 75%
## 1.5 2.0 2.5
head(
round(
cor(
df.boston), 2))
## crim zn indus chas nox rm age dis rad tax ptratio black
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29 -0.39
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39 0.18
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38 -0.36
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12 0.05
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19 -0.38
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36 0.13
## lstat medv
## crim 0.46 -0.39
## zn -0.41 0.36
## indus 0.60 -0.48
## chas -0.05 0.18
## nox 0.59 -0.43
## rm -0.61 0.70
df.boston %>%
cor() %>%
round(2) %>%
head()
## crim zn indus chas nox rm age dis rad tax ptratio black
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29 -0.39
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39 0.18
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38 -0.36
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12 0.05
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19 -0.38
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36 0.13
## lstat medv
## crim 0.46 -0.39
## zn -0.41 0.36
## indus 0.60 -0.48
## chas -0.05 0.18
## nox 0.59 -0.43
## rm -0.61 0.70
plot(quantile(df.boston$age, c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)),
main = "Grafik", xlab = "", ylab = "age")
library(dplyr)
df.boston$age %>%
quantile(c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)) %>%
plot(main = "Grafik", xlab = "", ylab = "age")
# Help
help(runif)
## starting httpd help server ...
## done
help("data.frame")
BAB 2 PART 1 ## Data Dataset berikut merupakan data yang akan digunakan dalam ilustrasi histogram dan boxplot
df <- read.csv2("C:/Users/user/Downloads/data.csv")
head(df)
## X PID county state area poptotal popdensity popwhite popblack
## 1 1 561 ADAMS IL 0.052 66090 1270.9615 63917 1702
## 2 2 562 ALEXANDER IL 0.014 10626 759.0000 7054 3496
## 3 3 563 BOND IL 0.022 14991 681.4091 14477 429
## 4 4 564 BOONE IL 0.017 30806 1812.1176 29344 127
## 5 5 565 BROWN IL 0.018 5836 324.2222 5264 547
## 6 6 566 BUREAU IL 0.050 35688 713.7600 35157 50
## popamerindian popasian popother percwhite percblack percamerindan percasian
## 1 98 249 124 96.71206 2.5752761 0.1482826 0.37675897
## 2 19 48 9 66.38434 32.9004329 0.1788067 0.45172219
## 3 35 16 34 96.57128 2.8617170 0.2334734 0.10673071
## 4 46 150 1139 95.25417 0.4122574 0.1493216 0.48691813
## 5 14 5 6 90.19877 9.3728581 0.2398903 0.08567512
## 6 65 195 221 98.51210 0.1401031 0.1821340 0.54640215
## percother popadults perchsd percollege percprof poppovertyknown
## 1 0.18762294 43298 75.10740 19.63139 4.355859 63628
## 2 0.08469791 6724 59.72635 11.24331 2.870315 10529
## 3 0.22680275 9669 69.33499 17.03382 4.488572 14235
## 4 3.69733169 19272 75.47219 17.27895 4.197800 30337
## 5 0.10281014 3979 68.86152 14.47600 3.367680 4815
## 6 0.61925577 23444 76.62941 18.90462 3.275891 35107
## percpovertyknown percbelowpoverty percchildbelowpovert percadultpoverty
## 1 96.27478 13.151443 18.01172 11.009776
## 2 99.08714 32.244278 45.82651 27.385647
## 3 94.95697 12.068844 14.03606 10.852090
## 4 98.47757 7.209019 11.17954 5.536013
## 5 82.50514 13.520249 13.02289 11.143211
## 6 98.37200 10.399635 14.15882 8.179287
## percelderlypoverty inmetro category
## 1 12.443812 0 AAR
## 2 25.228976 0 LHR
## 3 12.697410 0 AAR
## 4 6.217047 1 ALU
## 5 19.200000 0 AAR
## 6 11.008586 0 AAR
hist(df$percollege)
hist(df$percollege, freq = F)
hist (df$percollege,
main ='Histogram persentase berpendidikan perguruan tinggi ',
sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
ylab = 'Frekuensi',
xlab = 'Persentase berpendidikan perguruan tinggi',
col = 'light blue')
hist (df$percollege,
main ='Histogram persentase berpendidikan perguruan tinggi ',
sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
ylab = 'Frekuensi',
xlab = 'Persentase berpendidikan perguruan tinggi',
col = 'light blue',
breaks = seq(min(df$percollege), max(df$percollege), length.out = 21+1))
hist (df$percollege,
main ='Histogram persentase berpendidikan perguruan tinggi ',
sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
ylab = 'Frekuensi',
xlab = 'Persentase berpendidikan perguruan tinggi',
col = 'light blue',
breaks = seq(min(df$percollege), max(df$percollege), length.out = 21+1))
abline(v=median(df$percollege),col="coral",lwd=2)
abline(v=mean(df$percollege), col="green", lwd=5)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
ggplot(df, aes(x=percollege)) +
geom_histogram(color="black", fill="white")+
geom_vline(aes(xintercept=mean(percollege)),
color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x=percollege)) +
geom_histogram(color="dark blue", fill="light blue", linetype="dashed")+
geom_vline(aes(xintercept=mean(percollege)),
color="green", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x=percollege, color=state)) +
geom_histogram(fill="white")+
theme(legend.position="top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(plyr)
## Warning: package 'plyr' was built under R version 4.1.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
rataan<-ddply(df, "state", summarise, rata2=mean(percollege))
ggplot(df, aes(x=percollege, color=state)) +
geom_histogram(fill="white")+
geom_vline(data=rataan, aes(xintercept=rata2, color=state),
linetype="dashed")+
theme(legend.position="top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x=percollege, color=state)) +
geom_histogram(fill="white")+
geom_vline(data=rataan, aes(xintercept=rata2, color=state),
linetype="dashed")+
theme(legend.position="top")+
scale_color_grey()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df, aes(x=percollege))+
geom_histogram(color="black", fill="white")+
facet_grid(state ~ .)+
geom_vline(data=rataan, aes(xintercept=rata2, color="red"),
linetype="dashed")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggplot2)
ggplot(data = df, aes(x=percollege)) +
geom_histogram(aes(fill=state)) +
scale_fill_brewer(palette="Set2") +
facet_wrap( ~ state, ncol=1) +
xlab("Persentase berpendidikan perguruan tinggi") +
ylab("Frekuensi") +
theme_bw() +
ggtitle("Percollege by State\n")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
boxplot(df$percchildbelowpovert)
boxplot(df$percchildbelowpovert, horizontal = T,
main = "% of Child Below\nThe Poverty Line in The Midwest", col = "red")
boxplot(percchildbelowpovert ~ state, data = df)
boxplot(percchildbelowpovert ~ inmetro, data = df)
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100))
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100),
col = c("red", "blue"))
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100),
col = c("red", "blue"), xaxt = "n")
# untuk memodifikasi salah satu sumbu
axis(1, at = 1:2, labels = c("No", "Yes"))
boxplot(percchildbelowpovert ~ state, data = df)
boxplot(df$percchildbelowpovert ~
reorder(df$state, df$percchildbelowpovert, FUN = median))
state.reord <- reorder(df$state, df$percchildbelowpovert, FUN = median)
boxplot(df$percchildbelowpovert ~ state.reord)
#####membalikkan urutan kategori menggunakan tidyverse::fct_rev()
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.5 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plyr::arrange() masks dplyr::arrange()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x plyr::rename() masks dplyr::rename()
## x dplyr::select() masks MASS::select()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
boxplot(df$percchildbelowpovert ~ fct_rev(state.reord)) # reverse factor order
boxplot(df$percchildbelowpovert ~ state.reord,
col = c("darkred", "red", "white", "blue", "darkblue"))
boxplot(percchildbelowpovert ~ state.reord, data = df,
col = c("darkred", "red", "white", "blue", "darkblue"))
boxplot(df$percchildbelowpovert ~ state.reord,
col = c("darkblue", "blue", "white", "red", "darkred"),
main = "Child Poverty Rate in Midwest Counties,\nGrouped by States",
xlab = "State", ylab = "% of Child Living Under Poverty Line",
ylim = c(0, 100))
# menambahkan garis horizontal dan teks
abline(h = 50, lty = "dashed", col = "grey")
text(x = 2, y = 64.31 + 7, labels = "Menominee, WI", cex = .725)
max(df$percchildbelowpovert)
## [1] 64.30848
df$county[df$percchildbelowpovert == max(df$percchildbelowpovert)]
## [1] "MENOMINEE"
library(ggplot2)
ggplot(data = df, mapping = aes(y = percchildbelowpovert)) + geom_boxplot()
bp.child.povr <- ggplot(data = df, mapping = aes(y = percchildbelowpovert)) +
geom_boxplot()
bp.child.povr + theme_light()
bp.child.povr + theme_bw()
bp.child.povr + theme_classic()
bp.child.povr + theme_minimal()
bp.child.povr + theme_void() # akan berguna jika menggunakan peta choropleth
bp.cp.inmet <- ggplot(
data = df, mapping = aes(x = as.factor(inmetro), # faktor krn kategorik
y = percchildbelowpovert)
) + geom_boxplot()
bp.cp.inmet + theme_classic()
cp.inmet.col <- ggplot(
data = df, mapping = aes(x = as.factor(inmetro),
y = percchildbelowpovert,
fill = as.factor(inmetro))
)
cp.inmet.col + geom_boxplot(fill = "orange") + theme_classic()
cp.inmet.col + geom_boxplot(fill = "orange", alpha = .35) + theme_classic()
fill (warna isi) vs color (warna garis/border)
cp.inmet.col + geom_boxplot(fill = "orange", color = "brown") + theme_classic()
cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic()
cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic()
cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic() +
xlab("County considered in a metro area?") +
ylab("% of Child Living Under Poverty") +
scale_x_discrete(breaks = c(0, 1), labels = c("No", "Yes")) +
ylim(0, 100)
ggplot(df, aes(x = reorder(state, percchildbelowpovert, median),
y = percchildbelowpovert)) +
geom_boxplot()
# state.reord telah dideklarasikan sebelumnya
cp.state <- ggplot(df, aes(x = state.reord, y = percchildbelowpovert,
color = state.reord))
cp.state + geom_boxplot()
cp.state + geom_boxplot() + scale_x_discrete(limits = rev(levels(state.reord)))
boxplot + jitter
cp.state + geom_boxplot() + geom_jitter(alpha = .5) + scale_x_discrete() +
theme(legend.position="none") # menghilangkan legenda
BAB 3 #PP PLOT
Deteksi sebaran dari suatu data secara eksploratif selain menggunakan QQ plot juga dapat menggunakan PP plot (Probability-Probability plot). Plot ini hampir sama dengan QQ plot yaitu berupa plot pencar yang digunakan untuk melihat kecocokan dari 2 dataset atau dataset terhadap sebaran tertentu. Namun pada PP plot, yang dibandingkan adalah peluang sebaran komulatif. Kedua dataset dikatakan identik atau suatu dataset dikatakan mengikuti sebaran tertentu jika titik-titik mendekati garis lurus diagonal y=x dimana dimulai dari titik (0,0) hingga (1,1). PP plot dalam mendeteksi sebaran dari suatu data menggunakan R dapat dibangun menggunakan langkah-langkah berikut:
Simulasi deteksi sebaran menggunakan PP plotdalam R dapat mengikuti syntax berikut:
#Membangktikan data (ex: menyebar normal)
set.seed(1)
x<-rnorm(100,mean=2, sd=1)
x<-sort(x) #mengurutkan data empiris dari yang terkecil
i<-seq(1:length(x))
df<-data.frame(i,x)
m<-mean(x)
s<-sd(x)
n<-length(x)
head(df, 10)
## i x
## 1 1 -0.2146999
## 2 2 0.0106483
## 3 3 0.1950414
## 4 4 0.4764332
## 5 5 0.5292476
## 6 6 0.6229404
## 7 7 0.7234078
## 8 8 0.7463666
## 9 9 0.7753874
## 10 10 0.8706369
#Memeriksa data duplikat
library(dplyr)
uniqe_x<-distinct(df,x)
dim(uniqe_x) #semua data unik, banyak data unik sama dengan banyak data yang dibangkitkan
## [1] 100 1
#Peluang kumulatif data empiris
cdf_x<-0
sum=0
for(i in 1:n){
sum=sum+(1/n)
cdf_x[i]<-sum
}
df$cdf_x<-cdf_x
head(df, 10)
## i x cdf_x
## 1 1 -0.2146999 0.01
## 2 2 0.0106483 0.02
## 3 3 0.1950414 0.03
## 4 4 0.4764332 0.04
## 5 5 0.5292476 0.05
## 6 6 0.6229404 0.06
## 7 7 0.7234078 0.07
## 8 8 0.7463666 0.08
## 9 9 0.7753874 0.09
## 10 10 0.8706369 0.10
#Peluang kumulatif sebaran teoritis (Normal)
cdf_normal<-pnorm(df$x, mean=m, sd=s)
#atau dapat menggunakan sebaran normal baku
z_score<-(df$x-m)/s
cdf_norm<-pnorm(z_score, mean=0, sd=1)
df$cdf_norm<-cdf_norm
head(df, 10)
## i x cdf_x cdf_norm
## 1 1 -0.2146999 0.01 0.004841632
## 2 2 0.0106483 0.02 0.009744305
## 3 3 0.1950414 0.03 0.016554518
## 4 4 0.4764332 0.04 0.034572267
## 5 5 0.5292476 0.05 0.039316458
## 6 6 0.6229404 0.06 0.049026987
## 7 7 0.7234078 0.07 0.061475114
## 8 8 0.7463666 0.08 0.064640041
## 9 9 0.7753874 0.09 0.068820007
## 10 10 0.8706369 0.10 0.084010290
library(ggplot2)
histogram<-ggplot(df) +
geom_histogram(aes(x = x),fill="darkred",col="darkred") +
ggtitle("Histogram data empiris") +
ylab("Frekuensi") +
xlab("x") +
theme(plot.title = element_text(hjust = 0.5))
pp_norm<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_norm),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Normal") +
ylab("Peluang kumulatif sebaran normal") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_norm
#Peluang kumulatif sebaran teoritis (Chi-square)
cdf_chisq<-pchisq(df$x, df=m)
df$cdf_chisq<-cdf_chisq
head(df,10)
## i x cdf_x cdf_norm cdf_chisq
## 1 1 -0.2146999 0.01 0.004841632 0.000000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216
## 3 3 0.1950414 0.03 0.016554518 0.079814878
## 4 4 0.4764332 0.04 0.034572267 0.190789559
## 5 5 0.5292476 0.05 0.039316458 0.210400610
## 6 6 0.6229404 0.06 0.049026987 0.244188936
## 7 7 0.7234078 0.07 0.061475114 0.278998367
## 8 8 0.7463666 0.08 0.064640041 0.286748071
## 9 9 0.7753874 0.09 0.068820007 0.296435838
## 10 10 0.8706369 0.10 0.084010290 0.327392723
pp_chisq<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_chisq),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Chi-square") +
ylab("Peluang kumulatif sebaran chi-square") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_chisq
#Peluang kumulatif sebaran teoritis (Eksponensial)
cdf_exp<-pexp(df$x, rate=m)
df$cdf_exp<-cdf_exp
head(df,10)
## i x cdf_x cdf_norm cdf_chisq cdf_exp
## 1 1 -0.2146999 0.01 0.004841632 0.000000000 0.00000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216 0.02220581
## 3 3 0.1950414 0.03 0.016554518 0.079814878 0.33722538
## 4 4 0.4764332 0.04 0.034572267 0.190789559 0.63386163
## 5 5 0.5292476 0.05 0.039316458 0.210400610 0.67245296
## 6 6 0.6229404 0.06 0.049026987 0.244188936 0.73117945
## 7 7 0.7234078 0.07 0.061475114 0.278998367 0.78250633
## 8 8 0.7463666 0.08 0.064640041 0.286748071 0.79278598
## 9 9 0.7753874 0.09 0.068820007 0.296435838 0.80508752
## 10 10 0.8706369 0.10 0.084010290 0.327392723 0.84055802
pp_exp<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_exp),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Exponensial") +
ylab("Peluang kumulatif sebaran exponensial") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_exp
#Peluang kumulatif sebaran teoritis (Gamma)
cdf_gamma<-pgamma(df$x, shape = 1, rate = 1) #alfa=1, beta=1
df$cdf_gamma<-cdf_gamma
head(df,10)
## i x cdf_x cdf_norm cdf_chisq cdf_exp cdf_gamma
## 1 1 -0.2146999 0.01 0.004841632 0.000000000 0.00000000 0.00000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216 0.02220581 0.01059181
## 3 3 0.1950414 0.03 0.016554518 0.079814878 0.33722538 0.17719938
## 4 4 0.4764332 0.04 0.034572267 0.190789559 0.63386163 0.37900559
## 5 5 0.5292476 0.05 0.039316458 0.210400610 0.67245296 0.41095201
## 6 6 0.6229404 0.06 0.049026987 0.244188936 0.73117945 0.46363503
## 7 7 0.7234078 0.07 0.061475114 0.278998367 0.78250633 0.51490367
## 8 8 0.7463666 0.08 0.064640041 0.286748071 0.79278598 0.52591403
## 9 9 0.7753874 0.09 0.068820007 0.296435838 0.80508752 0.53947465
## 10 10 0.8706369 0.10 0.084010290 0.327392723 0.84055802 0.58131520
pp_gamma<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_gamma),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Gamma") +
ylab("Peluang kumulatif sebaran gamma") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_gamma
#menggabungkan semua plot
par(mfrow=c(2,2))
pp_norm
pp_chisq
pp_exp
pp_gamma
Berdasarkan PP-plot di atas, data paling mendekati sebaran normal. Titik-titik pada plot mendekati garis diagonal y=x. Hal ini sesuai dengan data yang memang dibangkitkan dari sebaran normal.
BAB 3 #QQ PLOT # QQ-Plot QQ Plot/Plot kuantil merupakan grafik yang membandingkan antara nilai-nilai kuantil dari sebaran suatu data terhadap kuantil-kuantil milik sebaran lain. Jika kedua sebaran berbentuk sama/hampir sama, maka tiap nilai amatannya akan berada di suatu garis lurus. Plot ini dapat digunakan untuk mencari tahu sebaran teoretis yang paling mendekati sebaran suatu data secara visual.
Pada sesi ini kita akan menggunakan data ggplot2::midwest$percollege (persentase penduduk yang berpendidikan )
library(ggplot2)
df <- as.data.frame(midwest)
var1 <- df$percollege
str(var1)
## num [1:437] 19.6 11.2 17 17.3 14.5 ...
Berikut adalah histogram dan plot densitas bagi var1 (plot densitas akan dipelajari lebih lanjut di minggu ke-4). Menggunakan QQ Plot, kita akan berusaha untuk mencari sebaran teoretis (normal, exponential, chi-square, etc.) yang paling mendekati sebaran kepekatan data yang kita miliki.
qqplot(x, y, plot.it = TRUE, xlab = deparse1(substitute(x)), ylab = deparse1(substitute(y)), …)
x: The first sample for qqplot. y: The second or only data sample.
qqline(y, datax = FALSE, distribution = qnorm, probs = c(0.25, 0.75), qtype = 7, …)
# Normal
set.seed(42)
qqnorm(var1, cex = .5)
set.seed(42)
qqline(var1, distribution = qnorm, col = "red", lty = "dashed", lwd = .1)
# Normal
set.seed(42)
qqplot(rnorm(n = length(var1), mean = mean(var1), sd = sd(var1)), var1)
set.seed(42)
qqline(distribution = function(p) qnorm(p, mean = mean(var1), sd = sd(var1)),
var1)
X ~ Lognormal -> log(X) ~ Normal
# Lognormal
set.seed(42)
qqplot(rlnorm(n = length(var1), meanlog = mean(log(var1)),
sdlog = sd(log(var1))), var1)
set.seed(42)
qqline(distribution = function(p) qlnorm(p, meanlog = mean(log(var1)),
sdlog = sd(log(var1))), var1,
col = "red")
# Chi-Squared
qqplot(rchisq(n = length(var1), df = mean(var1)), var1)
qqline(distribution = function(p) qchisq(p, df = mean(var1)), var1, col = "red")
# Exponential
set.seed(42)
qqplot(rexp(n = length(var1), rate = 1/mean(var1)), var1)
set.seed(42)
qqline(distribution = function(p) qexp(p, rate = 1/mean(var1)), var1, col = "red")
hist(var1, breaks = 20, freq = F, xlim = c(0, 50), border = "white",
col = "skyblue", main = "var1 vs Exponential")
curve(dexp(x, rate = 1/mean(var1)), from = 0, to = 50, add = T, lwd = 4)
stat_qq( mapping = NULL, data = NULL, geom = “point”, position = “identity”, …, distribution = stats::qnorm, dparams = list(), na.rm = FALSE, show.legend = NA, inherit.aes = TRUE )
stat_qq_line( mapping = NULL, data = NULL, geom = “path”, position = “identity”, …, distribution = stats::qnorm, dparams = list(), line.p = c(0.25, 0.75), fullrange = FALSE, na.rm = FALSE, show.legend = NA, inherit.aes = TRUE )
library(ggplot2)
gg.percol <- ggplot(data = df, mapping = aes(sample = percollege))
# Normal
gg.percol + stat_qq() + stat_qq_line()
# Lognormal
param.lnorm <- list(meanlog = mean(log(var1)), sdlog = sd(log(var1)))
gg.percol + stat_qq(distribution = qlnorm, dparams = param.lnorm) +
stat_qq_line(distribution = qlnorm, dparams = param.lnorm)
# Chi-Squared
param.chisq <- list(df = mean(var1))
gg.percol + stat_qq(distribution = qchisq, dparams = param.chisq) +
stat_qq_line(distribution = qchisq, dparams = param.chisq, color = "red") +
theme_classic()
# Exponential
param.exp <- list(rate = 1/mean(var1))
gg.percol + stat_qq(distribution = qexp, dparams = param.exp) +
stat_qq_line(distribution = qexp, dparams = param.exp, color = "red") +
theme_classic() + xlab("Theoretical Quantiles") + ylab("Sample Quantiles")
qqPlot(x, distribution=“norm”, groups, layout, ylim=range(x, na.rm=TRUE), ylab=deparse(substitute(x)), xlab=paste(distribution, “quantiles”), glab=deparse(substitute(groups)), main=NULL, las=par(“las”), envelope=TRUE, col=carPalette()[1], col.lines=carPalette()[2], lwd=2, pch=1, cex=par(“cex”), line=c(“quartiles”, “robust”, “none”), id=TRUE, grid=TRUE, …)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Normal
qqPlot(var1, distribution = "norm", mean = mean(var1), sd = sd(var1))
## [1] 275 378
# Lognormal
qqPlot(var1, distribution = "lnorm", meanlog = mean(log(var1)),
sdlog = sd(log(var1)))
## [1] 275 378
# Chi-Squared
qqPlot(var1, distribution = "chisq", df = mean(var1))
## [1] 275 378
# Exponential
qqPlot(var1, distribution = "exp", rate = 1/mean(var1))
## [1] 275 378
Kolmogorov-Smirnov Test : Numerik Anderson-Darling Test : Numerik Chi-Squared Test : Kategorik
In a nutshell: H0: Sebaran 1 == Sebaran 2 H1: Sebaran 1 != Sebaran 2
p-value > alpha, Terima H0 p-value < alpha, Tolak H0
ks.test(x, y, …, alternative = c(“two.sided”, “less”, “greater”), exact = NULL)
# Normal
set.seed(42)
ks.test(var1, "pnorm", mean = mean(var1), sd = sd(var1))
##
## One-sample Kolmogorov-Smirnov test
##
## data: var1
## D = 0.1289, p-value = 9.859e-07
## alternative hypothesis: two-sided
# Lognormal
set.seed(42)
ks.test(var1, "plnorm", meanlog = mean(log(var1)), sdlog = sd(log(var1)))
##
## One-sample Kolmogorov-Smirnov test
##
## data: var1
## D = 0.061947, p-value = 0.0699
## alternative hypothesis: two-sided
# Chi-Squared
set.seed(42)
ks.test(var1, "pchisq", df = mean(var1))
##
## One-sample Kolmogorov-Smirnov test
##
## data: var1
## D = 0.08466, p-value = 0.003806
## alternative hypothesis: two-sided
# Exponential
set.seed(42)
ks.test(log(var1), "pexp", rate = 1/mean(var1))
##
## One-sample Kolmogorov-Smirnov test
##
## data: log(var1)
## D = 0.81103, p-value < 2.2e-16
## alternative hypothesis: two-sided
ad.test(…, data = NULL, method = c(“asymptotic”, “simulated”, “exact”), dist = FALSE, Nsim = 10000)
library(kSamples)
## Loading required package: SuppDists
# Normal
set.seed(42)
ad.test(var1, rnorm(n = length(var1), mean = mean(var1), sd = sd(var1)))
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 2
## Sample sizes: 437, 437
## Number of ties: 0
##
## Mean of Anderson-Darling Criterion: 1
## Standard deviation of Anderson-Darling Criterion: 0.75972
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 5.962 6.531 0.0009721
## version 2: 5.980 6.555 0.0009560
# Lognormal
set.seed(42)
ad.test(var1, rlnorm(n = length(var1), meanlog = mean(log(var1)),
sdlog = sd(log(var1))))
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 2
## Sample sizes: 437, 437
## Number of ties: 0
##
## Mean of Anderson-Darling Criterion: 1
## Standard deviation of Anderson-Darling Criterion: 0.75972
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 1.052 0.06866 0.3306
## version 2: 1.060 0.07271 0.3292
# Chi-Squared
set.seed(42)
ad.test(var1, rchisq(n = length(var1), df = mean(var1)))
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 2
## Sample sizes: 437, 437
## Number of ties: 0
##
## Mean of Anderson-Darling Criterion: 1
## Standard deviation of Anderson-Darling Criterion: 0.75972
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 4.30 4.344 0.006241
## version 2: 4.31 4.361 0.006162
# Exponential
set.seed(42)
ad.test(log(var1), rexp(n = length(var1), rate = 1/mean(var1)))
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 2
## Sample sizes: 437, 437
## Number of ties: 0
##
## Mean of Anderson-Darling Criterion: 1
## Standard deviation of Anderson-Darling Criterion: 0.75972
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 220.5 288.9 1.430e-121
## version 2: 221.0 289.0 3.343e-121
chisq.test(x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)
var2 <- df$state
str(var2)
## chr [1:437] "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" ...
O : Data observasi yang ada di dataset E : Data ekspektasi andaikan mengikuti suatu sebaran teoretik tertentu
Observed <- factor(var2)
str(Observed)
## Factor w/ 5 levels "IL","IN","MI",..: 1 1 1 1 1 1 1 1 1 1 ...
table(Observed)
## Observed
## IL IN MI OH WI
## 102 92 83 88 72
# Theoretical Uniform
set.seed(42)
Expected <- sample(levels(Observed), size = length(var2), replace = T)
Expected <- factor(Expected)
str(Expected)
## Factor w/ 5 levels "IL","IN","MI",..: 1 5 1 1 2 4 2 2 1 4 ...
table(Expected)
## Expected
## IL IN MI OH WI
## 93 103 76 83 82
sample(x, size, replace = FALSE, prob = NULL)
parameter prob dapat diisi oleh vektor berisi peluang muncul suatu kategori, sehingga nanti dapat diubah menjadi bentuk sebaran teoretik yang diinginkan.
chisq.test(Observed, Expected)
##
## Pearson's Chi-squared test
##
## data: Observed and Expected
## X-squared = 12.879, df = 16, p-value = 0.6816
p-val > alpha -> sebaran df$state adalah uniform
#PP-Plot
Deteksi sebaran dari suatu data secara eksploratif selain menggunakan QQ plot juga dapat menggunakan PP plot (Probability-Probability plot). Plot ini hampir sama dengan QQ plot yaitu berupa plot pencar yang digunakan untuk melihat kecocokan dari 2 dataset atau dataset terhadap sebaran tertentu. Namun pada PP plot, yang dibandingkan adalah peluang sebaran komulatif. Kedua dataset dikatakan identik atau suatu dataset dikatakan mengikuti sebaran tertentu jika titik-titik mendekati garis lurus diagonal y=x dimana dimulai dari titik (0,0) hingga (1,1). PP plot dalam mendeteksi sebaran dari suatu data menggunakan R dapat dibangun menggunakan langkah-langkah berikut:
Simulasi deteksi sebaran menggunakan PP plotdalam R dapat mengikuti syntax berikut:
#Membangktikan data (ex: menyebar normal)
set.seed(1)
x<-rnorm(100,mean=2, sd=1)
x<-sort(x) #mengurutkan data empiris dari yang terkecil
i<-seq(1:100)
df<-data.frame(i,x)
m<-mean(x)
s<-sd(x)
n<-length(x)
head(df, 10)
## i x
## 1 1 -0.2146999
## 2 2 0.0106483
## 3 3 0.1950414
## 4 4 0.4764332
## 5 5 0.5292476
## 6 6 0.6229404
## 7 7 0.7234078
## 8 8 0.7463666
## 9 9 0.7753874
## 10 10 0.8706369
#Memeriksa data duplikat
library(dplyr)
uniqe_x<-distinct(df,x)
dim(uniqe_x) #semua data unik, banyak data unik sama dengan banyak data yang dibangkitkan
## [1] 100 1
#Peluang kumulatif data empiris
cdf_x<-0
sum=0
for(i in 1:n){
sum=sum+(1/n)
cdf_x[i]<-sum
}
df$cdf_x<-cdf_x
head(df, 10)
## i x cdf_x
## 1 1 -0.2146999 0.01
## 2 2 0.0106483 0.02
## 3 3 0.1950414 0.03
## 4 4 0.4764332 0.04
## 5 5 0.5292476 0.05
## 6 6 0.6229404 0.06
## 7 7 0.7234078 0.07
## 8 8 0.7463666 0.08
## 9 9 0.7753874 0.09
## 10 10 0.8706369 0.10
#Peluang kumulatif sebaran teoritis (Normal)
cdf_normal<-pnorm(df$x, mean=m, sd=s)
#atau dapat menggunakan sebaran normal baku
z_score<-(df$x-m)/s
cdf_norm<-pnorm(z_score, mean=0, sd=1)
df$cdf_norm<-cdf_norm
head(df, 10)
## i x cdf_x cdf_norm
## 1 1 -0.2146999 0.01 0.004841632
## 2 2 0.0106483 0.02 0.009744305
## 3 3 0.1950414 0.03 0.016554518
## 4 4 0.4764332 0.04 0.034572267
## 5 5 0.5292476 0.05 0.039316458
## 6 6 0.6229404 0.06 0.049026987
## 7 7 0.7234078 0.07 0.061475114
## 8 8 0.7463666 0.08 0.064640041
## 9 9 0.7753874 0.09 0.068820007
## 10 10 0.8706369 0.10 0.084010290
library(ggplot2)
histogram<-ggplot(df) +
geom_histogram(aes(x = x),fill="darkred",col="darkred") +
ggtitle("Histogram data empiris") +
ylab("Frekuensi") +
xlab("x") +
theme(plot.title = element_text(hjust = 0.5))
pp_norm<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_norm),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Normal") +
ylab("Peluang kumulatif sebaran normal") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_norm
#Peluang kumulatif sebaran teoritis (Chi-square)
cdf_chisq<-pchisq(df$x, df=m)
df$cdf_chisq<-cdf_chisq
head(df,10)
## i x cdf_x cdf_norm cdf_chisq
## 1 1 -0.2146999 0.01 0.004841632 0.000000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216
## 3 3 0.1950414 0.03 0.016554518 0.079814878
## 4 4 0.4764332 0.04 0.034572267 0.190789559
## 5 5 0.5292476 0.05 0.039316458 0.210400610
## 6 6 0.6229404 0.06 0.049026987 0.244188936
## 7 7 0.7234078 0.07 0.061475114 0.278998367
## 8 8 0.7463666 0.08 0.064640041 0.286748071
## 9 9 0.7753874 0.09 0.068820007 0.296435838
## 10 10 0.8706369 0.10 0.084010290 0.327392723
pp_chisq<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_chisq),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Chi-square") +
ylab("Peluang kumulatif sebaran chi-square") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_chisq
#Peluang kumulatif sebaran teoritis (Eksponensial)
cdf_exp<-pexp(df$x, rate=m)
df$cdf_exp<-cdf_exp
head(df,10)
## i x cdf_x cdf_norm cdf_chisq cdf_exp
## 1 1 -0.2146999 0.01 0.004841632 0.000000000 0.00000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216 0.02220581
## 3 3 0.1950414 0.03 0.016554518 0.079814878 0.33722538
## 4 4 0.4764332 0.04 0.034572267 0.190789559 0.63386163
## 5 5 0.5292476 0.05 0.039316458 0.210400610 0.67245296
## 6 6 0.6229404 0.06 0.049026987 0.244188936 0.73117945
## 7 7 0.7234078 0.07 0.061475114 0.278998367 0.78250633
## 8 8 0.7463666 0.08 0.064640041 0.286748071 0.79278598
## 9 9 0.7753874 0.09 0.068820007 0.296435838 0.80508752
## 10 10 0.8706369 0.10 0.084010290 0.327392723 0.84055802
pp_exp<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_exp),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Exponensial") +
ylab("Peluang kumulatif sebaran exponensial") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_exp
#Peluang kumulatif sebaran teoritis (Gamma)
cdf_gamma<-pgamma(df$x, shape = 1, rate = 1) #alfa=1, beta=1
df$cdf_gamma<-cdf_gamma
head(df,10)
## i x cdf_x cdf_norm cdf_chisq cdf_exp cdf_gamma
## 1 1 -0.2146999 0.01 0.004841632 0.000000000 0.00000000 0.00000000
## 2 2 0.0106483 0.02 0.009744305 0.003898216 0.02220581 0.01059181
## 3 3 0.1950414 0.03 0.016554518 0.079814878 0.33722538 0.17719938
## 4 4 0.4764332 0.04 0.034572267 0.190789559 0.63386163 0.37900559
## 5 5 0.5292476 0.05 0.039316458 0.210400610 0.67245296 0.41095201
## 6 6 0.6229404 0.06 0.049026987 0.244188936 0.73117945 0.46363503
## 7 7 0.7234078 0.07 0.061475114 0.278998367 0.78250633 0.51490367
## 8 8 0.7463666 0.08 0.064640041 0.286748071 0.79278598 0.52591403
## 9 9 0.7753874 0.09 0.068820007 0.296435838 0.80508752 0.53947465
## 10 10 0.8706369 0.10 0.084010290 0.327392723 0.84055802 0.58131520
pp_gamma<-ggplot(df) +
geom_point(aes(x = cdf_x,y = cdf_gamma),
color="steelblue",size=2) +
ggtitle("PP-plot Terhadap Sebaran Gamma") +
ylab("Peluang kumulatif sebaran gamma") +
xlab("Peluang kumulatif data empiris") +
theme(plot.title = element_text(hjust = 0.5))+
theme_linedraw()+
geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)
par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_gamma
par(mfrow=c(2,2))
pp_norm
pp_chisq
pp_exp
pp_gamma
BAB 4
Plot densitas adalah kurva yang menggambarkan hubungan antara x dan y = f(x) dari suatu fungsi peluang. Pada suatu sebaran peluang teoretis, setiap nilai f(x) telah terdefinisi dengan jelas. Akan tetapi, pada suatu dataset asli, maka plot densitas hanya dapat diestimasi menggunakan metode-metode pendugaan densitas kernel. Baca lebih lanjut: Pendugaan Kernel untuk Plot Densitas (RPubs Pak Bagus).
curve(df(x, df1 = 50, df2 = 20), from = 0, to = 3.5,
lwd = 4, col = "blue")
myCol <- rgb(1, 0, 0, alpha = .6)
scales::show_col(myCol)
polygon(curve(df(x, df1 = 50, df2 = 25), from = 0, to = 3.5),
col = myCol, border = myCol)
Gunakan set.seed() setiap kali ingin menggunakan r* (runif, rnorm, rchisq, dll.) supaya bisa dapat contoh yang mampu direproduksi kembali. Fungsi d* (dunif, dnorm, dchisq, dll.) tidak perlu set.seed() karena hasilnya akan selalu sama setiap waktu.
set.seed(42)
hist(rf(n = 100, df1 = 50, df2 = 20), breaks = 10)
set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
border = "white", col = "grey70")
set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
border = "white", col = "grey70")
curve(df(x, df1 = 50, df2 = 20), from = 0, to = 3.5, add = T,
lwd = 4, col = "blue")
set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
border = "white", col = "grey70")
polygon(curve(df(x, df1 = 50, df2 = 25), add = T),
col = myCol, border = myCol)
vec1 <- ggplot2::midwest$percbelowpoverty
hist(vec1)
plot(density(vec1, bw = .2), type = "l", col = "grey50", lwd = 2)
lines(density(vec1, bw = 1), col = "blue", lwd = 2)
lines(density(vec1, bw = 5), col = "red", lwd = 2)
legend("topright", legend = c("bw = 0.2", "bw = 1", "bw = 5"),
lwd = 2, col = c("grey50", "blue", "red"))
hist(vec1, freq = F, breaks = 50, border = "white", col = rgb(.95, .95, 0, .6))
lines(density(vec1, bw = .2), col = "grey50", lwd = 2)
lines(density(vec1, bw = 1), col = "blue", lwd = 2)
lines(density(vec1, bw = 5), col = "red", lwd = 2)
legend("topright", legend = c("bw = 0.2", "bw = 1", "bw = 5"),
lwd = 2, col = c("grey50", "blue", "red"))
myGrey <- rgb(.5, .5, .5, alpha = .6)
scales::show_col(myGrey)
plot(density(vec1, bw = 2), col = myGrey, type = "l")
polygon(density(vec1, bw = 2), col = myGrey, border = myGrey)
curve(dchisq(x, df = mean(vec1)), from = 0, to = 50, add = T,
lwd = 2, col = "red")
legend("topright", legend = c("Penduga Densitas Kernel", "Chi-Squared"),
col = c("black", "red"), lwd = 2)
Kita dapat mencoba uji formal untuk memastikan apakah antara kedua kurva adalah sebaran yang sama atau tidak.
set.seed(42)
kSamples::ad.test(vec1, rchisq(n = length(vec1), df = mean(vec1)))
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 2
## Sample sizes: 437, 437
## Number of ties: 0
##
## Mean of Anderson-Darling Criterion: 1
## Standard deviation of Anderson-Darling Criterion: 0.75972
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 1.918 1.209 0.1017
## version 2: 1.920 1.208 0.1018
Hasil uji Anderson-Darling: Terima H0 atau “kedua sampel berasal dari populasi yang sama” atau “kedua sebaran memiliki bentuk yang sama”. Namun, p-value tidak jauh di atas 0.05, sehingga masih perlu dicari alternatif lain untuk dicari kepastian bentuk sebarannya.
Tidak ada banyak perbedaan pada setiap metode kernel, terutama ketika jumlah sampel besar.
par(mfrow = c(2, 3))
plot(density(vec1, kernel = "gaussian", bw = .2), lwd = 2, type = "l",
main = "gaussian")
plot(density(vec1, kernel = "epanechnikov", bw = .2), lwd = 2, type = "l",
main = "epanechnikov")
plot(density(vec1, kernel = "triangular", bw = .2), lwd = 2, type = "l",
main = "triangular")
plot(density(vec1, kernel = "biweight", bw = .2), lwd = 2, type = "l",
main = "biweight")
plot(density(vec1, kernel = "cosine", bw = .2), lwd = 2, type = "l",
main = "cosine")
plot(density(vec1, kernel = "optcosine", bw = .2), lwd = 2, type = "l",
main = "optcosine")
par(mfrow = c(2, 3))
plot(density(vec1, kernel = "gaussian"), lwd = 2, type = "l",
main = "gaussian")
plot(density(vec1, kernel = "epanechnikov"), lwd = 2, type = "l",
main = "epanechnikov")
plot(density(vec1, kernel = "triangular"), lwd = 2, type = "l",
main = "triangular")
plot(density(vec1, kernel = "biweight"), lwd = 2, type = "l",
main = "biweight")
plot(density(vec1, kernel = "cosine"), lwd = 2, type = "l",
main = "cosine")
plot(density(vec1, kernel = "optcosine"), lwd = 2, type = "l",
main = "optcosine")
library(ggplot2)
set.seed(42)
ggExamp <- ggplot(mapping = aes(x = rbeta(n = 1000, shape1 = 2, shape2 = 5)))
ggExamp + geom_density() + theme_classic()
Paramater adjust: jika semakin tinggi, maka semakin mulus namun semakin tidak merepresentasikan sebaran data aslinya. Begitupun sebaliknya.
ggExamp + geom_density(adjust = .2) + theme_classic()
ggExamp + geom_density(adjust = 5) + theme_classic()
Histogram dapat digabungkan dengan kurva densitas.
ggExamp + geom_histogram(aes(y = ..density..), bins = 20,
fill = "white", col = "black") +
geom_density(adjust = 2, col = "darkorange", fill = "orange", alpha = .6) +
theme_classic()
ggExamp2 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty))
ggExamp2 + geom_histogram(aes(y = ..density..), bins = 50,
color = "grey35", fill = "white") +
geom_density(color = "orange", size = 1) +
theme_classic()
Modifikasi parameter “color =” pada aes() agar dapat dikelompokkan berdasarkan warna garis.
ggGrouped1 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty,
color = state))
ggGrouped1 + geom_density(adjust = 2) + theme_classic()
Gunakan library RColorBrewer untuk menggunakan bermacam palet warna.
library(RColorBrewer)
par(cex = .6)
display.brewer.all()
scale_color_manual() untuk menyesuaikan warna pada tiap garis kurva.
ggGrouped1 + geom_density(adjust = 2) + theme_classic() +
scale_color_manual(values = brewer.pal(n = 5, "Reds"))
Modifikasi parameter fill pada aes() agar pengelompokan dapat didasarkan pada warna isi. parameter alpha pada geom_density() untuk mengatur transparansi warna isi pada kurva denstitas.
ggGrouped2 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty,
color = state, fill = state))
ggGrouped2 + geom_density(adjust = 2, alpha = .1) + theme_classic()
scale_fill_manual() untuk menyesuaikan warna pada tiap isi kurva.
ggGrouped2 + geom_density(adjust = 2, alpha = .1) + theme_classic() +
scale_color_manual(values = brewer.pal(n = 9, "Blues")[5:9]) +
scale_fill_manual(values = brewer.pal(n = 9, "Blues")[5:9])
stat_function() dapat digunakan untuk memasukkan suatu sebaran teoritis terhadap suatu sebaran sampel.
ggExamp3 <- ggplot(midwest, aes(x = percbelowpoverty))
ggExamp3 + geom_density(size = 1, color = "blue", adjust = 2) +
theme_classic() +
stat_function(
fun = dnorm,
args = with(midwest, c(mean = mean(percbelowpoverty),
sd = sd(percbelowpoverty))),
size = 1, color = "red"
)
library(ggplot2)
data(midwest)
df<-midwest
ggplot(df, aes(state, percollege)) +
geom_violin(alpha = 0.2)
ggplot(df, aes(state, percollege)) +
geom_boxplot() +
geom_violin(alpha = 0.2)
ggplot(df,aes(state,percollege))+
geom_violin()+
geom_boxplot(width=.1,fill="black",outlier.colour=NA)+
stat_summary(fun.y=median,geom="point",fill="blue",shape=21,size=2.5)
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(df, aes(state, percollege)) +
geom_violin(draw_quantiles = 0.5) #kuantil 2
ggplot(df, aes(state, percollege)) +
geom_violin(aes(col = state), fill = NA, draw_quantiles = c(0.25,0.5,0.75)) +
# kuantil 1,2, dan 3
labs(title = "Persentase Penduduk Berendidikan Perguruan Tinggi",
subtitle = "Berdasarkan Negara Bagian",
caption = "Data obtained from the ggplot2 package")
ggplot(df, aes(state, percollege)) +
geom_violin(aes(fill = state)) +
scale_fill_viridis_d(option = "B") +
labs(title = "Persentase Penduduk Berendidikan Perguruan Tinggi",
subtitle = "Berdasarkan Negara Bagian",
caption = "Data obtained from the ggplot2 package")
ggplot(df, aes(state, percollege))+
geom_violin(fill='lightblue',alpha=0.5)+
geom_jitter(position = position_jitter(width = 0.2), col="red", size=1.5)
ggplot(df, aes(state, percollege))+
geom_violin(fill='lightblue', col="darkred",alpha=0.5, linetype="dotted")
BAB 5 SCATTER PLOT`
df <- as.data.frame(ggplot2::midwest)
df2 <- df[c("percwhite", "percblack", "percamerindan", "percasian",
"perchsd", "percollege", "percprof",
"percpovertyknown", "percbelowpoverty", "percchildbelowpovert")]
head(df2)
## percwhite percblack percamerindan percasian perchsd percollege percprof
## 1 96.71206 2.5752761 0.1482826 0.37675897 75.10740 19.63139 4.355859
## 2 66.38434 32.9004329 0.1788067 0.45172219 59.72635 11.24331 2.870315
## 3 96.57128 2.8617170 0.2334734 0.10673071 69.33499 17.03382 4.488572
## 4 95.25417 0.4122574 0.1493216 0.48691813 75.47219 17.27895 4.197800
## 5 90.19877 9.3728581 0.2398903 0.08567512 68.86152 14.47600 3.367680
## 6 98.51210 0.1401031 0.1821340 0.54640215 76.62941 18.90462 3.275891
## percpovertyknown percbelowpoverty percchildbelowpovert
## 1 96.27478 13.151443 18.01172
## 2 99.08714 32.244278 45.82651
## 3 94.95697 12.068844 14.03606
## 4 98.47757 7.209019 11.17954
## 5 82.50514 13.520249 13.02289
## 6 98.37200 10.399635 14.15882
round(cor(df2), 2)
## percwhite percblack percamerindan percasian perchsd
## percwhite 1.00 -0.76 -0.60 -0.32 -0.02
## percblack -0.76 1.00 -0.05 0.32 0.01
## percamerindan -0.60 -0.05 1.00 -0.06 -0.09
## percasian -0.32 0.32 -0.06 1.00 0.47
## perchsd -0.02 0.01 -0.09 0.47 1.00
## percollege -0.21 0.24 -0.08 0.75 0.78
## percprof -0.22 0.26 -0.09 0.79 0.63
## percpovertyknown 0.13 -0.14 0.01 -0.35 -0.12
## percbelowpoverty -0.38 0.22 0.37 -0.04 -0.59
## percchildbelowpovert -0.41 0.29 0.36 -0.15 -0.62
## percollege percprof percpovertyknown percbelowpoverty
## percwhite -0.21 -0.22 0.13 -0.38
## percblack 0.24 0.26 -0.14 0.22
## percamerindan -0.08 -0.09 0.01 0.37
## percasian 0.75 0.79 -0.35 -0.04
## perchsd 0.78 0.63 -0.12 -0.59
## percollege 1.00 0.89 -0.27 -0.28
## percprof 0.89 1.00 -0.40 -0.15
## percpovertyknown -0.27 -0.40 1.00 -0.18
## percbelowpoverty -0.28 -0.15 -0.18 1.00
## percchildbelowpovert -0.36 -0.28 -0.02 0.95
## percchildbelowpovert
## percwhite -0.41
## percblack 0.29
## percamerindan 0.36
## percasian -0.15
## perchsd -0.62
## percollege -0.36
## percprof -0.28
## percpovertyknown -0.02
## percbelowpoverty 0.95
## percchildbelowpovert 1.00
plot(df2)
plot(df$perchsd, df$percbelowpoverty)
fit1 <- lm(percbelowpoverty ~ perchsd, data = df)
coef(fit1)
## (Intercept) perchsd
## 50.7850813 -0.5174649
Model yang diperoleh: Y = 50.79 - 0.52*X
plot(df$perchsd, df$percbelowpoverty, cex = .5)
abline(fit1, col = "red", lwd = 2)
plot(df$perchsd, df$percbelowpoverty, cex = 1, pch = 16,
col = rgb(.8, 0, 0, alpha = .5))
abline(fit1, col = "black", lwd = 2)
myBlue <- rgb(0, 0, .8, .8)
myRed <- rgb(.8, 0, 0, .8)
col2 <- ifelse(df$inmetro == 1, myBlue, myRed)
plot(df$perchsd, df$percbelowpoverty, col = col2, pch = 16, cex = 1.25)
legend("topright", legend = c("In Metro Area", "Not In Metro Area"), pch = 16,
col = c("blue", "red"))
map <- data.frame(find = c("IL", "IN", "MI", "OH", "WI"),
replace = c("darkred", "red", "purple", "blue", "darkblue"))
col <- as.character(map[match(df$state, map$find), "replace"])
data.frame(df$state, col)[c(100, 150, 200, 250, 300, 350, 400), ]
## df.state col
## 100 IL darkred
## 150 IN red
## 200 MI purple
## 250 MI purple
## 300 OH blue
## 350 OH blue
## 400 WI darkblue
plot(df$perchsd, df$percbelowpoverty, col = col, pch = 16)
legend("topright", legend = map$find, col = map$replace, pch = 16)
plot(df$perchsd, df$percbelowpoverty, pch = 16,
col = densCols(df$perchsd, df$percbelowpoverty, nbin = 3))
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'
plot(df$perchsd, df$percbelowpoverty, pch = 16,
col = densCols(df$perchsd, df$percbelowpoverty, nbin = 60))
plot(df$perchsd, df$percbelowpoverty, pch = 16,
col = densCols(df$perchsd, df$percbelowpoverty))
smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 30)
smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 30, col = NULL)
smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 50, col = NULL)
smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 200, col = NULL)
smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 50,
pch = 16, col = "red", nrpoints = 30)
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.1.2
plot(hexbin(df$perchsd, df$percbelowpoverty))
plot(hexbin(df$perchsd, df$percbelowpoverty),
colramp = colorRampPalette(c("white", "blue")))
plot(hexbin(df$perchsd, df$percbelowpoverty),
colramp = colorRampPalette(c("white", "skyblue", "blue")))
plot(hexbin(df$perchsd, df$percbelowpoverty),
colramp = colorRampPalette(c("pink", "red", "black")))
Menggunakan ukuran sebagai dimensi ke-3:
plot(df$perchsd, df$percwhite, cex = df$percbelowpoverty/100*5)
# X -> % of Highschool Diploma
# Y -> % of White Population
# Size -> % of Population Below Poverty Line
Menggunakan transparansi sebagai dimensi ke-3:
col2 <- rgb(1, 0, 0, df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, col = col2, pch = 16)
Menggunakan transparansi dan ukuran sebagai dimensi ke-3:
col2 <- rgb(1, 0, 0, df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, col = col2, cex = df$percbelowpoverty/100*5,
pch = 16)
Menggunakan warna dan ukuran sebagai dimensi ke-3:
col3 <- rgb(df$percbelowpoverty/100*2,
1 - df$percbelowpoverty/100*2,
1 - df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, main = "Midwest Counties",
xlab = "% of High School Diploma", ylab = "% of White",
col = col3, cex = df$percbelowpoverty/100*5, pch = 16)
centerPoint <- c(5, 15, 25, 35, 45)
legend("bottomright", pch = 16, title = "% Below Poverty",
pt.cex = centerPoint/100*5,
col = rgb(centerPoint/50, 1 - centerPoint/50, 1 - centerPoint/50),
legend = c(" 0-10", "10-20", "20-30", "30-40", "40-50"))
library(rgl)
plot3d(df$perchsd, df$percwhite, df$percbelowpoverty)
| BAB 6 title: “Korelasi menggunakan corrplot” |
| ## Data Simulasi Data yang digunakan dalam simulasi syntax corrplot untuk menunjukkan korelasi antar peubah adalah data midwest yang tersedia dalam library ggplot2. Peubah yang akan digunakan dalam visualisasi korelasi menggunakan corrplot hanya peubah numerik. |
r library(ggplot2) data(midwest) dt<-midwest[,4:26] |
| ## Visualisasi corrplot ### Visualisasi sederhana |
r library(corrplot) |
## Warning: package 'corrplot' was built under R version 4.1.2 |
## corrplot 0.92 loaded |
r names(dt)<-paste("X", 1:23) #inisialisasi untuk mempersingkan nama peubah cordt<-cor(dt, method = "pearson") #contoh: korelasi pearson corrplot(cordt) |
| ### Modifikasi bentuk |
r corrplot(cordt, title = "Correlation Plot", method = "square" ) |
| ### Modifikasi penggerombolan |
r corrplot(cordt, method = "square", order = "hclust", addrect = 3) |
| ### Modifikasi kemunculan nilai tertentu Pada umumnya, matriks korlasi akan menampilkan seluruh nilai korelasi dari peubah-peubah yang digunakan. Modifikasi berikut dapat digunaan ketika hanya ingin menampilkan nilai korelasi tertentu. Contoh yang akan disimulasikan yaitu hanya menampilkan nilai korelasi lebih dari atau sama dengan 0.7 baik berkorelasi secara negatif atau positif. |
| ```r ind<-abs(cordt)<0.7 #fungsi abs() digunakan untuk merperhitungkan korelasi positif dan negatif #jika hanya ingin menampilkan korelasi positif saja, fungsi abs() dapat dihilangkan #ind adalah suatu objek sebagai index yang menandakan nilai korelasi kurang dari 0.7 |
| cor_baru<-cordt cor_baru[ind]<-NA #nilai korelasi pada cor_baru yang bersesuaian posisi dengan ind akan diganti dengan “NA” sehingga yang akan tampil adalah nilai korelasi lebih dari atau sama dengan 0.7 baik korelasi positif maupun negatif |
| corrplot(cor_baru, na.label.col = “white”, type = “lower”) ``` |
r #data korelasi yang bernilai NA akan diberi warna "white" #argumen type= digunakan untuk menampilkan segitiga bawah atau atas saja dari matriks korelasi |
BAB 7 title: “Tukey’s Median-Median Resistant Line”
Dataset yang akan terdiri dari dua peubah, x (persentase populasi yang hidup di bawah garis kemiskinan) dan y (persentase populasi kulit putih). Setiap amatan mewakili data pada suatu County (setingkat di atas kota dan setingkat di bawah negara bagian) di kawasan Midwest, Amerika Serikat.
df <- ggplot2::midwest
x <- df$percwhite # % of white population
y <- df$percbelowpoverty # % of population below poverty line
head(cbind(x, y))
## x y
## [1,] 96.71206 13.151443
## [2,] 66.38434 32.244278
## [3,] 96.57128 12.068844
## [4,] 95.25417 7.209019
## [5,] 90.19877 13.520249
## [6,] 98.51210 10.399635
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
main = "Midwest Dataset",
xlab = "% of white population", ylab = "% below poverty line")
OLS (Ordinary Least Squares) atau MKT (Metode Kuadrat Terkecil) adalah metode regresi parametrik yang paling umum digunakan dalam statistika untuk mencari besaran hubungan linear antar peubah. Berikut adalah koefisien persamaan yang dihasilkan oleh regresi OLS. Di bawahnya adalah grafik yang menggambarkan persamaan tersebut.
linReg <- lm(y ~ x)
coef(linReg)
## (Intercept) x
## 38.6604624 -0.2736541
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
main = "Midwest Dataset",
xlab = "% of white population", ylab = "% below poverty line")
abline(linReg, lwd = 2)
text(40, 35, "OLS: y = 38.66 - 0.27X", cex = .8)
STEP 1: Urutkan setiap data sesuai dengan peubah x-nya.
y <- y[order(x)] # y dirutkan (sesuai dengan urutan dari x)
x <- sort(x) # x diurutkan
head(cbind(x, y)) # sehingga hasil akhir tetap berpasangan
## x y
## [1,] 10.69409 48.69110
## [2,] 57.39520 20.06530
## [3,] 62.77972 14.19830
## [4,] 66.38434 32.24428
## [5,] 66.88821 30.18411
## [6,] 70.27065 13.80515
STEP 2: Data akan dibagi menjadi 3 kelompok, maka carilah panjang yang sesuai untuk tiap kelompok
n <- round(length(x)/3)
group1 <- 1:n
group2 <- (n + 1):(length(x) - n)
group3 <- (length(x) - n + 1):length(x)
## min max length
## All 1 437 437
## Group1 1 146 146
## Group2 147 291 145
## Group3 292 437 146
STEP 3: Carilah median di setiap x & y per kelompok.
x1 <- median(x[group1])
y1 <- median(y[group1])
x2 <- median(x[group2])
y2 <- median(y[group2])
x3 <- median(x[group3])
y3 <- median(y[group3])
## x y
## Group 1 92.79061 13.13309
## Group 2 98.03274 11.10143
## Group 3 99.27902 11.51767
plot(x, y, col = rgb(0, 0, 0, .5), pch = 16, cex = .5,
main = "Garis Resisten MML Akan Berusaha
Menghubungkan Ketiga Titik Median (warna merah)")
points(c(x1, x2, x3), c(y1, y2, y3), col = "red", pch = 16)
STEP 4: Carilah rataan untuk setiap median-median x dan y.
xbar <- mean(c(x1, x2, x3))
ybar <- mean(c(y1, y2, y3))
STEP 5: Carilah intercept dan slope dari garis MML.
b1 <- (y3 - y1)/(x3 - x1)
b0 <- ybar - b1*xbar
## Intercept Slope
## OLS 38.6605 -0.2737
## MML 35.9930 -0.2490
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
main = "Midwest Dataset",
xlab = "% of white population", ylab = "% below poverty line")
abline(linReg, lwd = 2) # visualisasi OLS
abline(b0, b1, col = "blue", lwd = 2) # visualisasi MML
legend("bottomleft", legend = c("OLS", "MML"), col = c("black", "blue"),
lty = 1)
text(40, 31.8, "y = 38.66 - 0.27X", cex = .8)
text(30, 25, "y = 35.99 - 0.25X", cex = .8, col = "blue")
geom_point() untuk scatter plot.
geom_smooth() untuk OLS.
geom_abline() untuk MML.
scale_color_manual() untuk meanmpilkan legenda.
theme_bw() untuk mengubah tema.
library(ggplot2)
ggplot(data = df, mapping = aes(x = percwhite, y = percbelowpoverty)) +
geom_point(colour = "red", alpha = .5, size = 1) +
geom_smooth(method = "lm", formula = y ~ x, mapping = aes(col = "black"),
size = 1.5, se = F) +
geom_abline(mapping = aes(intercept = b0, slope = b1, col = "blue"),
size = 1.5, show.legend = F) +
scale_colour_manual(name = "",
labels = c("OLS", "MML"),
values = c("black", "blue")) +
theme_bw()