BAB 1 #PART 1

# Deklarasi Data Frame
df <- data.frame(
  student = c("Ami", "Budi", "Caca", "Doni", "Edo", "Fani", "Gita", "Hamim", "Ian", "Jacob"),
  sex = c("F", "M", "F", "M", "M", "F", "F", "M", "M", "M"),
  semester = c(2, 2, 6, 2, 3, 6, 7, 2, 4, 6),
  household.income = c(15888, 12790, 4912, 48242, 237505, 5561, 37183, 51135, 43616, 13877),
  final.score = c("88", "85", "85", "83", "83", "74", "74", "76", "76", "84"))

head(df, 3)
##   student sex semester household.income final.score
## 1     Ami   F        2            15888          88
## 2    Budi   M        2            12790          85
## 3    Caca   F        6             4912          85
tail(df)
##    student sex semester household.income final.score
## 5      Edo   M        3           237505          83
## 6     Fani   F        6             5561          74
## 7     Gita   F        7            37183          74
## 8    Hamim   M        2            51135          76
## 9      Ian   M        4            43616          76
## 10   Jacob   M        6            13877          84
head(df, 3)
##   student sex semester household.income final.score
## 1     Ami   F        2            15888          88
## 2    Budi   M        2            12790          85
## 3    Caca   F        6             4912          85
tail(df, 8)
##    student sex semester household.income final.score
## 3     Caca   F        6             4912          85
## 4     Doni   M        2            48242          83
## 5      Edo   M        3           237505          83
## 6     Fani   F        6             5561          74
## 7     Gita   F        7            37183          74
## 8    Hamim   M        2            51135          76
## 9      Ian   M        4            43616          76
## 10   Jacob   M        6            13877          84
# Pemotongan Data Frame
df$household.income  # keluaran vector
##  [1]  15888  12790   4912  48242 237505   5561  37183  51135  43616  13877
df[['household.income']]
##  [1]  15888  12790   4912  48242 237505   5561  37183  51135  43616  13877
df['household.income']  # keluaran data.frame
##    household.income
## 1             15888
## 2             12790
## 3              4912
## 4             48242
## 5            237505
## 6              5561
## 7             37183
## 8             51135
## 9             43616
## 10            13877
df[c('student', 'household.income')]
##    student household.income
## 1      Ami            15888
## 2     Budi            12790
## 3     Caca             4912
## 4     Doni            48242
## 5      Edo           237505
## 6     Fani             5561
## 7     Gita            37183
## 8    Hamim            51135
## 9      Ian            43616
## 10   Jacob            13877
df[3, ]  # ambil per baris
##   student sex semester household.income final.score
## 3    Caca   F        6             4912          85
df[c(3, 5), ]
##   student sex semester household.income final.score
## 3    Caca   F        6             4912          85
## 5     Edo   M        3           237505          83
# Perubahan Tipe Data
df$final.score; class(df$final.score)
##  [1] "88" "85" "85" "83" "83" "74" "74" "76" "76" "84"
## [1] "character"
df['final.score'] <- as.numeric(df[['final.score']])  # harus input vector
df$final.score; class(df$final.score)
##  [1] 88 85 85 83 83 74 74 76 76 84
## [1] "numeric"
df$sex; class(df$sex)
##  [1] "F" "M" "F" "M" "M" "F" "F" "M" "M" "M"
## [1] "character"
df['sex'] <- factor(df[['sex']], levels = c("F", "M", "NB"),
                    labels = c("Female", "Male", "Non-Binary"),
                    ordered = F)
df$sex; class(df$sex)
##  [1] Female Male   Female Male   Male   Female Female Male   Male   Male  
## Levels: Female Male Non-Binary
## [1] "factor"
df$semester; class(df$semester)
##  [1] 2 2 6 2 3 6 7 2 4 6
## [1] "numeric"
df['semester'] <- factor(df$semester, levels = 1:14, ordered = T)
df$semester; class(df$semester)
##  [1] 2 2 6 2 3 6 7 2 4 6
## Levels: 1 < 2 < 3 < 4 < 5 < 6 < 7 < 8 < 9 < 10 < 11 < 12 < 13 < 14
## [1] "ordered" "factor"
# Struktur Data
str(df)
## 'data.frame':    10 obs. of  5 variables:
##  $ student         : chr  "Ami" "Budi" "Caca" "Doni" ...
##  $ sex             : Factor w/ 3 levels "Female","Male",..: 1 2 1 2 2 1 1 2 2 2
##  $ semester        : Ord.factor w/ 14 levels "1"<"2"<"3"<"4"<..: 2 2 6 2 3 6 7 2 4 6
##  $ household.income: num  15888 12790 4912 48242 237505 ...
##  $ final.score     : num  88 85 85 83 83 74 74 76 76 84
# Ringkasan Data
summary(df)
##    student                  sex       semester household.income  final.score   
##  Length:10          Female    :4   2      :4   Min.   :  4912   Min.   :74.00  
##  Class :character   Male      :6   6      :3   1st Qu.: 13062   1st Qu.:76.00  
##  Mode  :character   Non-Binary:0   3      :1   Median : 26536   Median :83.00  
##                                    4      :1   Mean   : 47071   Mean   :80.80  
##                                    7      :1   3rd Qu.: 47086   3rd Qu.:84.75  
##                                    1      :0   Max.   :237505   Max.   :88.00  
##                                    (Other):0
# Fungsi Apply
length(df)
## [1] 5
sapply(df, length)
##          student              sex         semester household.income 
##               10               10               10               10 
##      final.score 
##               10
sapply(df[c('household.income', 'final.score')], sum)
## household.income      final.score 
##           470709              808
sapply(df[c('household.income', 'final.score')], mean)
## household.income      final.score 
##          47070.9             80.8
sapply(df[c('household.income', 'final.score')], sd)
## household.income      final.score 
##     69235.969079         5.223877
levels(df$sex)
## [1] "Female"     "Male"       "Non-Binary"
levels(df$semester)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
sapply(df[c('sex', 'semester')], levels)
## $sex
## [1] "Female"     "Male"       "Non-Binary"
## 
## $semester
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
# Contoh Akhir
q.df <- sapply(df[c('household.income', 'final.score')], quantile,
               probs= c(.25, .75))
q.df
##     household.income final.score
## 25%         13061.75       76.00
## 75%         47085.50       84.75
q.df["75%", ] - q.df["25%", ]  # to find IQR
## household.income      final.score 
##         34023.75             8.75

#PART 2

library(MASS)
df.boston <- Boston
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Pipeline
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
x <- c(1, 3, 2)
x %>% mean();    mean(x)
## [1] 2
## [1] 2
x %>% quantile(c(.25, .50, .75));    quantile(x, c(.25, .50, .75))
## 25% 50% 75% 
## 1.5 2.0 2.5
## 25% 50% 75% 
## 1.5 2.0 2.5
head(
    round(
    cor(
        df.boston), 2))
##        crim    zn indus  chas   nox    rm   age   dis   rad   tax ptratio black
## crim   1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58    0.29 -0.39
## zn    -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31   -0.39  0.18
## indus  0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72    0.38 -0.36
## chas  -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04   -0.12  0.05
## nox    0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67    0.19 -0.38
## rm    -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29   -0.36  0.13
##       lstat  medv
## crim   0.46 -0.39
## zn    -0.41  0.36
## indus  0.60 -0.48
## chas  -0.05  0.18
## nox    0.59 -0.43
## rm    -0.61  0.70
df.boston %>%
    cor() %>%
    round(2) %>%
    head()
##        crim    zn indus  chas   nox    rm   age   dis   rad   tax ptratio black
## crim   1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58    0.29 -0.39
## zn    -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31   -0.39  0.18
## indus  0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72    0.38 -0.36
## chas  -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04   -0.12  0.05
## nox    0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67    0.19 -0.38
## rm    -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29   -0.36  0.13
##       lstat  medv
## crim   0.46 -0.39
## zn    -0.41  0.36
## indus  0.60 -0.48
## chas  -0.05  0.18
## nox    0.59 -0.43
## rm    -0.61  0.70
plot(quantile(df.boston$age, c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)),
     main = "Grafik", xlab = "", ylab = "age")

library(dplyr)
df.boston$age %>%
  quantile(c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1)) %>%
  plot(main = "Grafik", xlab = "", ylab = "age")

# Help
help(runif)
## starting httpd help server ...
##  done
help("data.frame")

BAB 2 PART 1 ## Data Dataset berikut merupakan data yang akan digunakan dalam ilustrasi histogram dan boxplot

df <- read.csv2("C:/Users/user/Downloads/data.csv")
head(df)
##   X PID    county state  area poptotal popdensity popwhite popblack
## 1 1 561     ADAMS    IL 0.052    66090  1270.9615    63917     1702
## 2 2 562 ALEXANDER    IL 0.014    10626   759.0000     7054     3496
## 3 3 563      BOND    IL 0.022    14991   681.4091    14477      429
## 4 4 564     BOONE    IL 0.017    30806  1812.1176    29344      127
## 5 5 565     BROWN    IL 0.018     5836   324.2222     5264      547
## 6 6 566    BUREAU    IL 0.050    35688   713.7600    35157       50
##   popamerindian popasian popother percwhite  percblack percamerindan  percasian
## 1            98      249      124  96.71206  2.5752761     0.1482826 0.37675897
## 2            19       48        9  66.38434 32.9004329     0.1788067 0.45172219
## 3            35       16       34  96.57128  2.8617170     0.2334734 0.10673071
## 4            46      150     1139  95.25417  0.4122574     0.1493216 0.48691813
## 5            14        5        6  90.19877  9.3728581     0.2398903 0.08567512
## 6            65      195      221  98.51210  0.1401031     0.1821340 0.54640215
##    percother popadults  perchsd percollege percprof poppovertyknown
## 1 0.18762294     43298 75.10740   19.63139 4.355859           63628
## 2 0.08469791      6724 59.72635   11.24331 2.870315           10529
## 3 0.22680275      9669 69.33499   17.03382 4.488572           14235
## 4 3.69733169     19272 75.47219   17.27895 4.197800           30337
## 5 0.10281014      3979 68.86152   14.47600 3.367680            4815
## 6 0.61925577     23444 76.62941   18.90462 3.275891           35107
##   percpovertyknown percbelowpoverty percchildbelowpovert percadultpoverty
## 1         96.27478        13.151443             18.01172        11.009776
## 2         99.08714        32.244278             45.82651        27.385647
## 3         94.95697        12.068844             14.03606        10.852090
## 4         98.47757         7.209019             11.17954         5.536013
## 5         82.50514        13.520249             13.02289        11.143211
## 6         98.37200        10.399635             14.15882         8.179287
##   percelderlypoverty inmetro category
## 1          12.443812       0      AAR
## 2          25.228976       0      LHR
## 3          12.697410       0      AAR
## 4           6.217047       1      ALU
## 5          19.200000       0      AAR
## 6          11.008586       0      AAR

Histogram

Histogram sederhana

hist(df$percollege)

hist(df$percollege, freq = F)

Histogram modifikasi

hist (df$percollege,
      main ='Histogram persentase berpendidikan perguruan tinggi ',
      sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
      ylab = 'Frekuensi',
      xlab = 'Persentase berpendidikan perguruan tinggi',
      col = 'light blue')

hist (df$percollege,
      main ='Histogram persentase berpendidikan perguruan tinggi ',
      sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
      ylab = 'Frekuensi',
      xlab = 'Persentase berpendidikan perguruan tinggi',
      col = 'light blue',
      breaks = seq(min(df$percollege), max(df$percollege), length.out = 21+1))

hist (df$percollege,
      main ='Histogram persentase berpendidikan perguruan tinggi ',
      sub = 'Negara bagian Illinois, Indiana, Michigan, Ohio, Wisconsin',
      ylab = 'Frekuensi',
      xlab = 'Persentase berpendidikan perguruan tinggi',
      col = 'light blue',
      breaks = seq(min(df$percollege), max(df$percollege), length.out = 21+1))
abline(v=median(df$percollege),col="coral",lwd=2)
abline(v=mean(df$percollege), col="green", lwd=5)

histogram modifikasi: package ggplot

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
ggplot(df, aes(x=percollege)) + 
  geom_histogram(color="black", fill="white")+
  geom_vline(aes(xintercept=mean(percollege)),
            color="blue", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x=percollege)) + 
  geom_histogram(color="dark blue", fill="light blue", linetype="dashed")+
  geom_vline(aes(xintercept=mean(percollege)),
            color="green", linetype="dashed", size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x=percollege, color=state)) +
  geom_histogram(fill="white")+
  theme(legend.position="top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(plyr)
## Warning: package 'plyr' was built under R version 4.1.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
rataan<-ddply(df, "state", summarise, rata2=mean(percollege))
ggplot(df, aes(x=percollege, color=state)) +
  geom_histogram(fill="white")+
  geom_vline(data=rataan, aes(xintercept=rata2, color=state),
             linetype="dashed")+
  theme(legend.position="top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x=percollege, color=state)) +
  geom_histogram(fill="white")+
  geom_vline(data=rataan, aes(xintercept=rata2, color=state),
             linetype="dashed")+
  theme(legend.position="top")+
  scale_color_grey() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(x=percollege))+
  geom_histogram(color="black", fill="white")+
  facet_grid(state ~ .)+
  geom_vline(data=rataan, aes(xintercept=rata2, color="red"),
             linetype="dashed")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(ggplot2)
ggplot(data = df, aes(x=percollege)) +
  geom_histogram(aes(fill=state)) +
  scale_fill_brewer(palette="Set2") +
  facet_wrap( ~ state, ncol=1) +
  xlab("Persentase berpendidikan perguruan tinggi") +
  ylab("Frekuensi") +
  theme_bw() +
  ggtitle("Percollege by State\n")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplot

Base

boxplot(df$percchildbelowpovert)

Merubah judul dan warna boxplot

boxplot(df$percchildbelowpovert, horizontal = T,
        main = "% of Child Below\nThe Poverty Line in The Midwest", col = "red")

Base: Grouping

Pengelompokan dgn x = kategorik dan y = numerik
boxplot(percchildbelowpovert ~ state, data = df)

boxplot(percchildbelowpovert ~ inmetro, data = df)

Parameter xlim/ylim untuk mengubah batasan skala
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100))

Masukkan vektor ke dalam parameter col untuk menggunakan banyak warna
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100),
        col = c("red", "blue"))

Base: Axis Modification

xaxt untuk mengubah tipe sumbu-x, xaxt = ‘n’ untuk menghilangkan isi sumbu
boxplot(percchildbelowpovert ~ inmetro, data = df, ylim = c(0, 100),
        col = c("red", "blue"), xaxt = "n")
# untuk memodifikasi salah satu sumbu
axis(1, at = 1:2, labels = c("No", "Yes"))

Base: Category Sorting

boxplot(percchildbelowpovert ~ state, data = df)

Reorder() untuk mengurutkan suatu kategori berdasarkan suatu statistiknya
Dalam kasus ini diurutkan berdasarkan mediannya
boxplot(df$percchildbelowpovert ~
                reorder(df$state, df$percchildbelowpovert, FUN = median))

reorder() dapat disimpan ke dalam suatu variabel terlebih dahulu sehingga kode bisa menjdai lebih rapih
state.reord <- reorder(df$state, df$percchildbelowpovert, FUN = median)
boxplot(df$percchildbelowpovert ~ state.reord)

#####membalikkan urutan kategori menggunakan tidyverse::fct_rev()

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.5     v purrr   0.3.4
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plyr::arrange()   masks dplyr::arrange()
## x purrr::compact()  masks plyr::compact()
## x plyr::count()     masks dplyr::count()
## x plyr::failwith()  masks dplyr::failwith()
## x dplyr::filter()   masks stats::filter()
## x plyr::id()        masks dplyr::id()
## x dplyr::lag()      masks stats::lag()
## x plyr::mutate()    masks dplyr::mutate()
## x plyr::rename()    masks dplyr::rename()
## x dplyr::select()   masks MASS::select()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
boxplot(df$percchildbelowpovert ~ fct_rev(state.reord)) # reverse factor order

Penggunaan parameter data pada fungsi plot
boxplot(df$percchildbelowpovert ~ state.reord,
        col = c("darkred", "red", "white", "blue", "darkblue"))

boxplot(percchildbelowpovert ~ state.reord, data = df,
        col = c("darkred", "red", "white", "blue", "darkblue"))

modifikasi warna, judul, dan label
boxplot(df$percchildbelowpovert ~ state.reord,
        col = c("darkblue", "blue", "white", "red", "darkred"),
        main = "Child Poverty Rate in Midwest Counties,\nGrouped by States",
        xlab = "State", ylab = "% of Child Living Under Poverty Line",
        ylim = c(0, 100))
# menambahkan garis horizontal dan teks
abline(h = 50, lty = "dashed", col = "grey")
text(x = 2, y = 64.31 + 7, labels = "Menominee, WI", cex = .725)

max(df$percchildbelowpovert)
## [1] 64.30848
df$county[df$percchildbelowpovert == max(df$percchildbelowpovert)]
## [1] "MENOMINEE"

GGPlot

library(ggplot2)
ggplot(data = df, mapping = aes(y = percchildbelowpovert)) + geom_boxplot()

Objek ggplot dapat disimpan ke dalam suatu variabel
bp.child.povr <- ggplot(data = df, mapping = aes(y = percchildbelowpovert)) +
        geom_boxplot()

modifikasi tema di ggplot

bp.child.povr + theme_light()

bp.child.povr + theme_bw()

bp.child.povr + theme_classic()

bp.child.povr + theme_minimal()

bp.child.povr + theme_void()  # akan berguna jika menggunakan peta choropleth

GGPlot: Grouping

bp.cp.inmet <- ggplot(
        data = df, mapping = aes(x = as.factor(inmetro),  # faktor krn kategorik
                                 y = percchildbelowpovert)
) + geom_boxplot()

bp.cp.inmet + theme_classic()

GGPlot: Group Coloring

cp.inmet.col <- ggplot(
        data = df, mapping = aes(x = as.factor(inmetro),
                                 y = percchildbelowpovert,
                                 fill = as.factor(inmetro))
)
Modifikasi warna dan transparansi
cp.inmet.col + geom_boxplot(fill = "orange") + theme_classic()

cp.inmet.col + geom_boxplot(fill = "orange", alpha = .35) + theme_classic()

fill (warna isi) vs color (warna garis/border)

cp.inmet.col + geom_boxplot(fill = "orange", color = "brown") + theme_classic()

cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic()

GGPlot: Axis Modification

cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic()

Modifikasi label dan ticks pada sumbu-x dan y
cp.inmet.col + geom_boxplot(fill = c("red", "blue")) + theme_classic() +
        xlab("County considered in a metro area?") +
        ylab("% of Child Living Under Poverty") +
        scale_x_discrete(breaks = c(0, 1), labels = c("No", "Yes")) +
        ylim(0, 100)

GGPlot: Category Sorting

ggplot(df, aes(x = reorder(state, percchildbelowpovert, median),
               y = percchildbelowpovert)) +
        geom_boxplot()

# state.reord telah dideklarasikan sebelumnya
cp.state <- ggplot(df, aes(x = state.reord, y = percchildbelowpovert,
                           color = state.reord))

cp.state + geom_boxplot()

Membalikkan urutan kategori
cp.state + geom_boxplot() + scale_x_discrete(limits = rev(levels(state.reord)))

boxplot + jitter

cp.state + geom_boxplot() + geom_jitter(alpha = .5) + scale_x_discrete() +
        theme(legend.position="none")  # menghilangkan legenda

BAB 3 #PP PLOT

Deteksi sebaran dari suatu data secara eksploratif selain menggunakan QQ plot juga dapat menggunakan PP plot (Probability-Probability plot). Plot ini hampir sama dengan QQ plot yaitu berupa plot pencar yang digunakan untuk melihat kecocokan dari 2 dataset atau dataset terhadap sebaran tertentu. Namun pada PP plot, yang dibandingkan adalah peluang sebaran komulatif. Kedua dataset dikatakan identik atau suatu dataset dikatakan mengikuti sebaran tertentu jika titik-titik mendekati garis lurus diagonal y=x dimana dimulai dari titik (0,0) hingga (1,1). PP plot dalam mendeteksi sebaran dari suatu data menggunakan R dapat dibangun menggunakan langkah-langkah berikut:

  1. Mengurutkan data dari yang terkecil hingga terbesar
  2. Memeriksa data duplikat. Jika terdapat data duplikat, perlu dilakukan manajemen data untuk mengambil data unik
  3. menghitung peluang kumulatif dari masing-masing data unik sebagai peluang kumulatif dari data empiris
  4. menghitung peluang kumulatif dari data unif berdasarkan sebaran teoritis tertentu
  5. plot antara peluang kumulatif data empiris dengan peluang kumulatif sebara teoritis

Simulasi deteksi sebaran menggunakan PP plotdalam R dapat mengikuti syntax berikut:

#Membangktikan data (ex: menyebar normal)
set.seed(1)
x<-rnorm(100,mean=2, sd=1)
x<-sort(x) #mengurutkan data empiris dari yang terkecil
i<-seq(1:length(x))
df<-data.frame(i,x)
m<-mean(x)
s<-sd(x)
n<-length(x)
head(df, 10)
##     i          x
## 1   1 -0.2146999
## 2   2  0.0106483
## 3   3  0.1950414
## 4   4  0.4764332
## 5   5  0.5292476
## 6   6  0.6229404
## 7   7  0.7234078
## 8   8  0.7463666
## 9   9  0.7753874
## 10 10  0.8706369
#Memeriksa data duplikat
library(dplyr)
uniqe_x<-distinct(df,x)
dim(uniqe_x) #semua data unik, banyak data unik sama dengan banyak data yang dibangkitkan
## [1] 100   1
#Peluang kumulatif data empiris
cdf_x<-0
sum=0
for(i in 1:n){
  sum=sum+(1/n)
  cdf_x[i]<-sum
}
df$cdf_x<-cdf_x
head(df, 10)
##     i          x cdf_x
## 1   1 -0.2146999  0.01
## 2   2  0.0106483  0.02
## 3   3  0.1950414  0.03
## 4   4  0.4764332  0.04
## 5   5  0.5292476  0.05
## 6   6  0.6229404  0.06
## 7   7  0.7234078  0.07
## 8   8  0.7463666  0.08
## 9   9  0.7753874  0.09
## 10 10  0.8706369  0.10
#Peluang kumulatif sebaran teoritis (Normal)
cdf_normal<-pnorm(df$x, mean=m, sd=s)
#atau dapat menggunakan sebaran normal baku
z_score<-(df$x-m)/s
cdf_norm<-pnorm(z_score, mean=0, sd=1)

df$cdf_norm<-cdf_norm
head(df, 10)
##     i          x cdf_x    cdf_norm
## 1   1 -0.2146999  0.01 0.004841632
## 2   2  0.0106483  0.02 0.009744305
## 3   3  0.1950414  0.03 0.016554518
## 4   4  0.4764332  0.04 0.034572267
## 5   5  0.5292476  0.05 0.039316458
## 6   6  0.6229404  0.06 0.049026987
## 7   7  0.7234078  0.07 0.061475114
## 8   8  0.7463666  0.08 0.064640041
## 9   9  0.7753874  0.09 0.068820007
## 10 10  0.8706369  0.10 0.084010290
library(ggplot2)
histogram<-ggplot(df) +
  geom_histogram(aes(x = x),fill="darkred",col="darkred") +
  ggtitle("Histogram data empiris") +
  ylab("Frekuensi") +
  xlab("x") + 
  theme(plot.title = element_text(hjust = 0.5))

pp_norm<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_norm),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Normal") +
  ylab("Peluang kumulatif sebaran normal") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_norm

#Peluang kumulatif sebaran teoritis (Chi-square)
cdf_chisq<-pchisq(df$x, df=m)
df$cdf_chisq<-cdf_chisq
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq
## 1   1 -0.2146999  0.01 0.004841632 0.000000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216
## 3   3  0.1950414  0.03 0.016554518 0.079814878
## 4   4  0.4764332  0.04 0.034572267 0.190789559
## 5   5  0.5292476  0.05 0.039316458 0.210400610
## 6   6  0.6229404  0.06 0.049026987 0.244188936
## 7   7  0.7234078  0.07 0.061475114 0.278998367
## 8   8  0.7463666  0.08 0.064640041 0.286748071
## 9   9  0.7753874  0.09 0.068820007 0.296435838
## 10 10  0.8706369  0.10 0.084010290 0.327392723
pp_chisq<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_chisq),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Chi-square") +
  ylab("Peluang kumulatif sebaran chi-square") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_chisq

#Peluang kumulatif sebaran teoritis (Eksponensial)
cdf_exp<-pexp(df$x, rate=m)
df$cdf_exp<-cdf_exp
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq    cdf_exp
## 1   1 -0.2146999  0.01 0.004841632 0.000000000 0.00000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216 0.02220581
## 3   3  0.1950414  0.03 0.016554518 0.079814878 0.33722538
## 4   4  0.4764332  0.04 0.034572267 0.190789559 0.63386163
## 5   5  0.5292476  0.05 0.039316458 0.210400610 0.67245296
## 6   6  0.6229404  0.06 0.049026987 0.244188936 0.73117945
## 7   7  0.7234078  0.07 0.061475114 0.278998367 0.78250633
## 8   8  0.7463666  0.08 0.064640041 0.286748071 0.79278598
## 9   9  0.7753874  0.09 0.068820007 0.296435838 0.80508752
## 10 10  0.8706369  0.10 0.084010290 0.327392723 0.84055802
pp_exp<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_exp),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Exponensial") +
  ylab("Peluang kumulatif sebaran exponensial") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_exp

#Peluang kumulatif sebaran teoritis (Gamma)
cdf_gamma<-pgamma(df$x, shape = 1, rate = 1) #alfa=1, beta=1
df$cdf_gamma<-cdf_gamma
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq    cdf_exp  cdf_gamma
## 1   1 -0.2146999  0.01 0.004841632 0.000000000 0.00000000 0.00000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216 0.02220581 0.01059181
## 3   3  0.1950414  0.03 0.016554518 0.079814878 0.33722538 0.17719938
## 4   4  0.4764332  0.04 0.034572267 0.190789559 0.63386163 0.37900559
## 5   5  0.5292476  0.05 0.039316458 0.210400610 0.67245296 0.41095201
## 6   6  0.6229404  0.06 0.049026987 0.244188936 0.73117945 0.46363503
## 7   7  0.7234078  0.07 0.061475114 0.278998367 0.78250633 0.51490367
## 8   8  0.7463666  0.08 0.064640041 0.286748071 0.79278598 0.52591403
## 9   9  0.7753874  0.09 0.068820007 0.296435838 0.80508752 0.53947465
## 10 10  0.8706369  0.10 0.084010290 0.327392723 0.84055802 0.58131520
pp_gamma<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_gamma),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Gamma") +
  ylab("Peluang kumulatif sebaran gamma") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_gamma

#menggabungkan semua plot

par(mfrow=c(2,2))
pp_norm

pp_chisq

pp_exp

pp_gamma

Berdasarkan PP-plot di atas, data paling mendekati sebaran normal. Titik-titik pada plot mendekati garis diagonal y=x. Hal ini sesuai dengan data yang memang dibangkitkan dari sebaran normal.

BAB 3 #QQ PLOT # QQ-Plot QQ Plot/Plot kuantil merupakan grafik yang membandingkan antara nilai-nilai kuantil dari sebaran suatu data terhadap kuantil-kuantil milik sebaran lain. Jika kedua sebaran berbentuk sama/hampir sama, maka tiap nilai amatannya akan berada di suatu garis lurus. Plot ini dapat digunakan untuk mencari tahu sebaran teoretis yang paling mendekati sebaran suatu data secara visual.

Pada sesi ini kita akan menggunakan data ggplot2::midwest$percollege (persentase penduduk yang berpendidikan )

library(ggplot2)
df <- as.data.frame(midwest)
var1 <- df$percollege
str(var1)
##  num [1:437] 19.6 11.2 17 17.3 14.5 ...

Berikut adalah histogram dan plot densitas bagi var1 (plot densitas akan dipelajari lebih lanjut di minggu ke-4). Menggunakan QQ Plot, kita akan berusaha untuk mencari sebaran teoretis (normal, exponential, chi-square, etc.) yang paling mendekati sebaran kepekatan data yang kita miliki.

Base (qqplot)

qqplot(x, y, plot.it = TRUE, xlab = deparse1(substitute(x)), ylab = deparse1(substitute(y)), …)

x: The first sample for qqplot. y: The second or only data sample.

qqline(y, datax = FALSE, distribution = qnorm, probs = c(0.25, 0.75), qtype = 7, …)

# Normal
set.seed(42)
qqnorm(var1, cex = .5)
set.seed(42)
qqline(var1, distribution = qnorm, col = "red", lty = "dashed", lwd = .1)

# Normal
set.seed(42)
qqplot(rnorm(n = length(var1), mean = mean(var1), sd = sd(var1)), var1)
set.seed(42)
qqline(distribution = function(p) qnorm(p, mean = mean(var1), sd = sd(var1)),
       var1)

X ~ Lognormal -> log(X) ~ Normal

# Lognormal
set.seed(42)
qqplot(rlnorm(n = length(var1), meanlog = mean(log(var1)),
              sdlog = sd(log(var1))), var1)
set.seed(42)
qqline(distribution = function(p) qlnorm(p, meanlog = mean(log(var1)),
                                         sdlog = sd(log(var1))), var1,
       col = "red")

# Chi-Squared
qqplot(rchisq(n = length(var1), df = mean(var1)), var1)
qqline(distribution = function(p) qchisq(p, df = mean(var1)), var1, col = "red")

# Exponential
set.seed(42)
qqplot(rexp(n = length(var1), rate = 1/mean(var1)), var1)
set.seed(42)
qqline(distribution = function(p) qexp(p, rate = 1/mean(var1)), var1, col = "red")

hist(var1, breaks = 20, freq = F, xlim = c(0, 50), border = "white",
     col = "skyblue", main = "var1 vs Exponential")
curve(dexp(x, rate = 1/mean(var1)), from = 0, to = 50, add = T, lwd = 4)

GGPlot (stat_qq)

stat_qq( mapping = NULL, data = NULL, geom = “point”, position = “identity”, …, distribution = stats::qnorm, dparams = list(), na.rm = FALSE, show.legend = NA, inherit.aes = TRUE )

stat_qq_line( mapping = NULL, data = NULL, geom = “path”, position = “identity”, …, distribution = stats::qnorm, dparams = list(), line.p = c(0.25, 0.75), fullrange = FALSE, na.rm = FALSE, show.legend = NA, inherit.aes = TRUE )

library(ggplot2)
gg.percol <- ggplot(data = df, mapping = aes(sample = percollege))
# Normal
gg.percol + stat_qq() + stat_qq_line()

# Lognormal
param.lnorm <- list(meanlog = mean(log(var1)), sdlog = sd(log(var1)))
gg.percol + stat_qq(distribution = qlnorm, dparams = param.lnorm) +
    stat_qq_line(distribution = qlnorm, dparams = param.lnorm)

# Chi-Squared
param.chisq <- list(df = mean(var1))
gg.percol + stat_qq(distribution = qchisq, dparams = param.chisq) +
    stat_qq_line(distribution = qchisq, dparams = param.chisq, color = "red") +
    theme_classic()

# Exponential
param.exp <- list(rate = 1/mean(var1))
gg.percol + stat_qq(distribution = qexp, dparams = param.exp) +
    stat_qq_line(distribution = qexp, dparams = param.exp, color = "red") +
    theme_classic() + xlab("Theoretical Quantiles") + ylab("Sample Quantiles")

CAR (qqPlot)

qqPlot(x, distribution=“norm”, groups, layout, ylim=range(x, na.rm=TRUE), ylab=deparse(substitute(x)), xlab=paste(distribution, “quantiles”), glab=deparse(substitute(groups)), main=NULL, las=par(“las”), envelope=TRUE, col=carPalette()[1], col.lines=carPalette()[2], lwd=2, pch=1, cex=par(“cex”), line=c(“quartiles”, “robust”, “none”), id=TRUE, grid=TRUE, …)

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
# Normal
qqPlot(var1, distribution = "norm", mean = mean(var1), sd = sd(var1))

## [1] 275 378
# Lognormal
qqPlot(var1, distribution = "lnorm", meanlog = mean(log(var1)),
       sdlog = sd(log(var1)))

## [1] 275 378
# Chi-Squared
qqPlot(var1, distribution = "chisq", df = mean(var1))

## [1] 275 378
# Exponential
qqPlot(var1, distribution = "exp", rate = 1/mean(var1))

## [1] 275 378

Formal Sample Distribution Test

Kolmogorov-Smirnov Test : Numerik Anderson-Darling Test : Numerik Chi-Squared Test : Kategorik

In a nutshell: H0: Sebaran 1 == Sebaran 2 H1: Sebaran 1 != Sebaran 2

p-value > alpha, Terima H0 p-value < alpha, Tolak H0

Kolmogorov-Smirnov Test

ks.test(x, y, …, alternative = c(“two.sided”, “less”, “greater”), exact = NULL)

# Normal
set.seed(42)
ks.test(var1, "pnorm", mean = mean(var1), sd = sd(var1))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  var1
## D = 0.1289, p-value = 9.859e-07
## alternative hypothesis: two-sided
# Lognormal
set.seed(42)
ks.test(var1, "plnorm", meanlog = mean(log(var1)), sdlog = sd(log(var1)))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  var1
## D = 0.061947, p-value = 0.0699
## alternative hypothesis: two-sided
# Chi-Squared
set.seed(42)
ks.test(var1, "pchisq", df = mean(var1))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  var1
## D = 0.08466, p-value = 0.003806
## alternative hypothesis: two-sided
# Exponential
set.seed(42)
ks.test(log(var1), "pexp", rate = 1/mean(var1))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  log(var1)
## D = 0.81103, p-value < 2.2e-16
## alternative hypothesis: two-sided

Anderson-Darling Test

ad.test(…, data = NULL, method = c(“asymptotic”, “simulated”, “exact”), dist = FALSE, Nsim = 10000)

library(kSamples)
## Loading required package: SuppDists
# Normal
set.seed(42)
ad.test(var1, rnorm(n = length(var1), mean = mean(var1), sd = sd(var1)))
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  2
## Sample sizes:  437, 437
## Number of ties: 0
## 
## Mean of  Anderson-Darling  Criterion: 1
## Standard deviation of  Anderson-Darling  Criterion: 0.75972
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##               AD  T.AD  asympt. P-value
## version 1: 5.962 6.531        0.0009721
## version 2: 5.980 6.555        0.0009560
# Lognormal
set.seed(42)
ad.test(var1, rlnorm(n = length(var1), meanlog = mean(log(var1)),
                     sdlog = sd(log(var1))))
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  2
## Sample sizes:  437, 437
## Number of ties: 0
## 
## Mean of  Anderson-Darling  Criterion: 1
## Standard deviation of  Anderson-Darling  Criterion: 0.75972
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##               AD    T.AD  asympt. P-value
## version 1: 1.052 0.06866           0.3306
## version 2: 1.060 0.07271           0.3292
# Chi-Squared
set.seed(42)
ad.test(var1, rchisq(n = length(var1), df = mean(var1)))
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  2
## Sample sizes:  437, 437
## Number of ties: 0
## 
## Mean of  Anderson-Darling  Criterion: 1
## Standard deviation of  Anderson-Darling  Criterion: 0.75972
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##              AD  T.AD  asympt. P-value
## version 1: 4.30 4.344         0.006241
## version 2: 4.31 4.361         0.006162
# Exponential
set.seed(42)
ad.test(log(var1), rexp(n = length(var1), rate = 1/mean(var1)))
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  2
## Sample sizes:  437, 437
## Number of ties: 0
## 
## Mean of  Anderson-Darling  Criterion: 1
## Standard deviation of  Anderson-Darling  Criterion: 0.75972
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##               AD  T.AD  asympt. P-value
## version 1: 220.5 288.9       1.430e-121
## version 2: 221.0 289.0       3.343e-121

Chi-Square Test

chisq.test(x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000)

var2 <- df$state
str(var2)
##  chr [1:437] "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" "IL" ...

O : Data observasi yang ada di dataset E : Data ekspektasi andaikan mengikuti suatu sebaran teoretik tertentu

Observed <- factor(var2)
str(Observed)
##  Factor w/ 5 levels "IL","IN","MI",..: 1 1 1 1 1 1 1 1 1 1 ...
table(Observed)
## Observed
##  IL  IN  MI  OH  WI 
## 102  92  83  88  72
# Theoretical Uniform
set.seed(42)
Expected <- sample(levels(Observed), size = length(var2), replace = T)
Expected <- factor(Expected)
str(Expected)
##  Factor w/ 5 levels "IL","IN","MI",..: 1 5 1 1 2 4 2 2 1 4 ...
table(Expected)
## Expected
##  IL  IN  MI  OH  WI 
##  93 103  76  83  82

sample(x, size, replace = FALSE, prob = NULL)

parameter prob dapat diisi oleh vektor berisi peluang muncul suatu kategori, sehingga nanti dapat diubah menjadi bentuk sebaran teoretik yang diinginkan.

chisq.test(Observed, Expected)
## 
##  Pearson's Chi-squared test
## 
## data:  Observed and Expected
## X-squared = 12.879, df = 16, p-value = 0.6816

p-val > alpha -> sebaran df$state adalah uniform

#PP-Plot

Deteksi sebaran dari suatu data secara eksploratif selain menggunakan QQ plot juga dapat menggunakan PP plot (Probability-Probability plot). Plot ini hampir sama dengan QQ plot yaitu berupa plot pencar yang digunakan untuk melihat kecocokan dari 2 dataset atau dataset terhadap sebaran tertentu. Namun pada PP plot, yang dibandingkan adalah peluang sebaran komulatif. Kedua dataset dikatakan identik atau suatu dataset dikatakan mengikuti sebaran tertentu jika titik-titik mendekati garis lurus diagonal y=x dimana dimulai dari titik (0,0) hingga (1,1). PP plot dalam mendeteksi sebaran dari suatu data menggunakan R dapat dibangun menggunakan langkah-langkah berikut:

  1. Mengurutkan data dari yang terkecil hingga terbesar
  2. Memeriksa data duplikat. Jika terdapat data duplikat, perlu dilakukan manajemen data untuk mengambil data unik
  3. menghitung peluang kumulatif dari masing-masing data unik sebagai peluang kumulatif dari data empiris
  4. menghitung peluang kumulatif dari data unif berdasarkan sebaran teoritis tertentu
  5. plot antara peluang kumulatif data empiris dengan peluang kumulatif sebara teoritis

Simulasi deteksi sebaran menggunakan PP plotdalam R dapat mengikuti syntax berikut:

#Membangktikan data (ex: menyebar normal)
set.seed(1)
x<-rnorm(100,mean=2, sd=1)
x<-sort(x) #mengurutkan data empiris dari yang terkecil
i<-seq(1:100)
df<-data.frame(i,x)
m<-mean(x)
s<-sd(x)
n<-length(x)
head(df, 10)
##     i          x
## 1   1 -0.2146999
## 2   2  0.0106483
## 3   3  0.1950414
## 4   4  0.4764332
## 5   5  0.5292476
## 6   6  0.6229404
## 7   7  0.7234078
## 8   8  0.7463666
## 9   9  0.7753874
## 10 10  0.8706369
#Memeriksa data duplikat
library(dplyr)
uniqe_x<-distinct(df,x)
dim(uniqe_x) #semua data unik, banyak data unik sama dengan banyak data yang dibangkitkan
## [1] 100   1
#Peluang kumulatif data empiris
cdf_x<-0
sum=0
for(i in 1:n){
  sum=sum+(1/n)
  cdf_x[i]<-sum
}
df$cdf_x<-cdf_x
head(df, 10)
##     i          x cdf_x
## 1   1 -0.2146999  0.01
## 2   2  0.0106483  0.02
## 3   3  0.1950414  0.03
## 4   4  0.4764332  0.04
## 5   5  0.5292476  0.05
## 6   6  0.6229404  0.06
## 7   7  0.7234078  0.07
## 8   8  0.7463666  0.08
## 9   9  0.7753874  0.09
## 10 10  0.8706369  0.10
#Peluang kumulatif sebaran teoritis (Normal)
cdf_normal<-pnorm(df$x, mean=m, sd=s)
#atau dapat menggunakan sebaran normal baku
z_score<-(df$x-m)/s
cdf_norm<-pnorm(z_score, mean=0, sd=1)

df$cdf_norm<-cdf_norm
head(df, 10)
##     i          x cdf_x    cdf_norm
## 1   1 -0.2146999  0.01 0.004841632
## 2   2  0.0106483  0.02 0.009744305
## 3   3  0.1950414  0.03 0.016554518
## 4   4  0.4764332  0.04 0.034572267
## 5   5  0.5292476  0.05 0.039316458
## 6   6  0.6229404  0.06 0.049026987
## 7   7  0.7234078  0.07 0.061475114
## 8   8  0.7463666  0.08 0.064640041
## 9   9  0.7753874  0.09 0.068820007
## 10 10  0.8706369  0.10 0.084010290
library(ggplot2)
histogram<-ggplot(df) +
  geom_histogram(aes(x = x),fill="darkred",col="darkred") +
  ggtitle("Histogram data empiris") +
  ylab("Frekuensi") +
  xlab("x") + 
  theme(plot.title = element_text(hjust = 0.5))

pp_norm<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_norm),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Normal") +
  ylab("Peluang kumulatif sebaran normal") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_norm

#Peluang kumulatif sebaran teoritis (Chi-square)
cdf_chisq<-pchisq(df$x, df=m)
df$cdf_chisq<-cdf_chisq
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq
## 1   1 -0.2146999  0.01 0.004841632 0.000000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216
## 3   3  0.1950414  0.03 0.016554518 0.079814878
## 4   4  0.4764332  0.04 0.034572267 0.190789559
## 5   5  0.5292476  0.05 0.039316458 0.210400610
## 6   6  0.6229404  0.06 0.049026987 0.244188936
## 7   7  0.7234078  0.07 0.061475114 0.278998367
## 8   8  0.7463666  0.08 0.064640041 0.286748071
## 9   9  0.7753874  0.09 0.068820007 0.296435838
## 10 10  0.8706369  0.10 0.084010290 0.327392723
pp_chisq<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_chisq),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Chi-square") +
  ylab("Peluang kumulatif sebaran chi-square") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_chisq

#Peluang kumulatif sebaran teoritis (Eksponensial)
cdf_exp<-pexp(df$x, rate=m)
df$cdf_exp<-cdf_exp
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq    cdf_exp
## 1   1 -0.2146999  0.01 0.004841632 0.000000000 0.00000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216 0.02220581
## 3   3  0.1950414  0.03 0.016554518 0.079814878 0.33722538
## 4   4  0.4764332  0.04 0.034572267 0.190789559 0.63386163
## 5   5  0.5292476  0.05 0.039316458 0.210400610 0.67245296
## 6   6  0.6229404  0.06 0.049026987 0.244188936 0.73117945
## 7   7  0.7234078  0.07 0.061475114 0.278998367 0.78250633
## 8   8  0.7463666  0.08 0.064640041 0.286748071 0.79278598
## 9   9  0.7753874  0.09 0.068820007 0.296435838 0.80508752
## 10 10  0.8706369  0.10 0.084010290 0.327392723 0.84055802
pp_exp<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_exp),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Exponensial") +
  ylab("Peluang kumulatif sebaran exponensial") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_exp

#Peluang kumulatif sebaran teoritis (Gamma)
cdf_gamma<-pgamma(df$x, shape = 1, rate = 1) #alfa=1, beta=1
df$cdf_gamma<-cdf_gamma
head(df,10)
##     i          x cdf_x    cdf_norm   cdf_chisq    cdf_exp  cdf_gamma
## 1   1 -0.2146999  0.01 0.004841632 0.000000000 0.00000000 0.00000000
## 2   2  0.0106483  0.02 0.009744305 0.003898216 0.02220581 0.01059181
## 3   3  0.1950414  0.03 0.016554518 0.079814878 0.33722538 0.17719938
## 4   4  0.4764332  0.04 0.034572267 0.190789559 0.63386163 0.37900559
## 5   5  0.5292476  0.05 0.039316458 0.210400610 0.67245296 0.41095201
## 6   6  0.6229404  0.06 0.049026987 0.244188936 0.73117945 0.46363503
## 7   7  0.7234078  0.07 0.061475114 0.278998367 0.78250633 0.51490367
## 8   8  0.7463666  0.08 0.064640041 0.286748071 0.79278598 0.52591403
## 9   9  0.7753874  0.09 0.068820007 0.296435838 0.80508752 0.53947465
## 10 10  0.8706369  0.10 0.084010290 0.327392723 0.84055802 0.58131520
pp_gamma<-ggplot(df) +
  geom_point(aes(x = cdf_x,y = cdf_gamma),
             color="steelblue",size=2) +
  ggtitle("PP-plot Terhadap Sebaran Gamma") +
  ylab("Peluang kumulatif sebaran gamma") +
  xlab("Peluang kumulatif data empiris") + 
  theme(plot.title = element_text(hjust = 0.5))+
  theme_linedraw()+
  geom_abline(intercept = 0, slope = 1, col="darkred", lwd=1)

par(mfrow=c(1,2))
histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pp_gamma

par(mfrow=c(2,2))
pp_norm

pp_chisq

pp_exp

pp_gamma

BAB 4

Plot densitas adalah kurva yang menggambarkan hubungan antara x dan y = f(x) dari suatu fungsi peluang. Pada suatu sebaran peluang teoretis, setiap nilai f(x) telah terdefinisi dengan jelas. Akan tetapi, pada suatu dataset asli, maka plot densitas hanya dapat diestimasi menggunakan metode-metode pendugaan densitas kernel. Baca lebih lanjut: Pendugaan Kernel untuk Plot Densitas (RPubs Pak Bagus).

Base

Sebaran Teoretis

curve(df(x, df1 = 50, df2 = 20), from = 0, to = 3.5,
      lwd = 4, col = "blue")

myCol <- rgb(1, 0, 0, alpha = .6)
scales::show_col(myCol)

polygon(curve(df(x, df1 = 50, df2 = 25), from = 0, to = 3.5),
        col = myCol, border = myCol)

Gunakan set.seed() setiap kali ingin menggunakan r* (runif, rnorm, rchisq, dll.) supaya bisa dapat contoh yang mampu direproduksi kembali. Fungsi d* (dunif, dnorm, dchisq, dll.) tidak perlu set.seed() karena hasilnya akan selalu sama setiap waktu.

set.seed(42)
hist(rf(n = 100, df1 = 50, df2 = 20), breaks = 10)

set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
     border = "white", col = "grey70")

set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
     border = "white", col = "grey70")
curve(df(x, df1 = 50, df2 = 20), from = 0, to = 3.5, add = T,
      lwd = 4, col = "blue")

set.seed(42)
hist(rf(n = 1000, df1 = 50, df2 = 25), breaks = 20, freq = F,
     border = "white", col = "grey70")
polygon(curve(df(x, df1 = 50, df2 = 25), add = T),
        col = myCol, border = myCol)

Sebaran Data Sampel

vec1 <- ggplot2::midwest$percbelowpoverty
hist(vec1)

plot(density(vec1, bw = .2), type = "l", col = "grey50", lwd = 2)
lines(density(vec1, bw = 1), col = "blue", lwd = 2)
lines(density(vec1, bw = 5), col = "red", lwd = 2)

legend("topright", legend = c("bw = 0.2", "bw = 1", "bw =  5"),
       lwd = 2, col = c("grey50", "blue", "red"))

hist(vec1, freq = F, breaks = 50, border = "white", col = rgb(.95, .95, 0, .6))

lines(density(vec1, bw = .2), col = "grey50", lwd = 2)
lines(density(vec1, bw = 1), col = "blue", lwd = 2)
lines(density(vec1, bw = 5), col = "red", lwd = 2)

legend("topright", legend = c("bw = 0.2", "bw =  1", "bw =  5"),
       lwd = 2, col = c("grey50", "blue", "red"))

Data Sampel vs Sebaran Teoretis

myGrey <- rgb(.5, .5, .5, alpha = .6)
scales::show_col(myGrey)

plot(density(vec1, bw = 2), col = myGrey, type = "l")
polygon(density(vec1, bw = 2), col = myGrey, border = myGrey)

curve(dchisq(x, df = mean(vec1)), from = 0, to = 50, add = T,
      lwd = 2, col = "red")

legend("topright", legend = c("Penduga Densitas Kernel", "Chi-Squared"),
       col = c("black", "red"), lwd = 2)

Kita dapat mencoba uji formal untuk memastikan apakah antara kedua kurva adalah sebaran yang sama atau tidak.

set.seed(42)
kSamples::ad.test(vec1, rchisq(n = length(vec1), df = mean(vec1)))
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  2
## Sample sizes:  437, 437
## Number of ties: 0
## 
## Mean of  Anderson-Darling  Criterion: 1
## Standard deviation of  Anderson-Darling  Criterion: 0.75972
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##               AD  T.AD  asympt. P-value
## version 1: 1.918 1.209           0.1017
## version 2: 1.920 1.208           0.1018

Hasil uji Anderson-Darling: Terima H0 atau “kedua sampel berasal dari populasi yang sama” atau “kedua sebaran memiliki bentuk yang sama”. Namun, p-value tidak jauh di atas 0.05, sehingga masih perlu dicari alternatif lain untuk dicari kepastian bentuk sebarannya.

Perbandingan Antar Metode Kernel

Tidak ada banyak perbedaan pada setiap metode kernel, terutama ketika jumlah sampel besar.

par(mfrow = c(2, 3))

plot(density(vec1, kernel = "gaussian", bw = .2), lwd = 2, type = "l",
     main = "gaussian")
plot(density(vec1, kernel = "epanechnikov", bw = .2), lwd = 2, type = "l",
     main = "epanechnikov")
plot(density(vec1, kernel = "triangular", bw = .2), lwd = 2, type = "l",
     main = "triangular")
plot(density(vec1, kernel = "biweight", bw = .2), lwd = 2, type = "l",
     main = "biweight")
plot(density(vec1, kernel = "cosine", bw = .2), lwd = 2, type = "l",
     main = "cosine")
plot(density(vec1, kernel = "optcosine", bw = .2), lwd = 2, type = "l",
     main = "optcosine")

par(mfrow = c(2, 3))

plot(density(vec1, kernel = "gaussian"), lwd = 2, type = "l",
     main = "gaussian")
plot(density(vec1, kernel = "epanechnikov"), lwd = 2, type = "l",
     main = "epanechnikov")
plot(density(vec1, kernel = "triangular"), lwd = 2, type = "l",
     main = "triangular")
plot(density(vec1, kernel = "biweight"), lwd = 2, type = "l",
     main = "biweight")
plot(density(vec1, kernel = "cosine"), lwd = 2, type = "l",
     main = "cosine")
plot(density(vec1, kernel = "optcosine"), lwd = 2, type = "l",
     main = "optcosine")

GGPlot

library(ggplot2)

Sebaran Teoretis

set.seed(42)
ggExamp <- ggplot(mapping = aes(x = rbeta(n = 1000, shape1 = 2, shape2 = 5)))
ggExamp + geom_density() + theme_classic()

Paramater adjust: jika semakin tinggi, maka semakin mulus namun semakin tidak merepresentasikan sebaran data aslinya. Begitupun sebaliknya.

ggExamp + geom_density(adjust = .2) + theme_classic()

ggExamp + geom_density(adjust = 5) + theme_classic()

Histogram dapat digabungkan dengan kurva densitas.

ggExamp + geom_histogram(aes(y = ..density..), bins = 20,
                         fill = "white", col = "black") +
    geom_density(adjust = 2, col = "darkorange", fill = "orange", alpha = .6) +
    theme_classic()

Sebaran Data Sampel

ggExamp2 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty))
ggExamp2 + geom_histogram(aes(y = ..density..), bins = 50,
                          color = "grey35", fill = "white") +
    geom_density(color = "orange", size = 1) +
    theme_classic()

Modifikasi parameter “color =” pada aes() agar dapat dikelompokkan berdasarkan warna garis.

ggGrouped1 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty,
                                                  color = state))
ggGrouped1 + geom_density(adjust = 2) + theme_classic()

Gunakan library RColorBrewer untuk menggunakan bermacam palet warna.

library(RColorBrewer)
par(cex = .6)
display.brewer.all()

scale_color_manual() untuk menyesuaikan warna pada tiap garis kurva.

ggGrouped1 + geom_density(adjust = 2) + theme_classic() +
    scale_color_manual(values = brewer.pal(n = 5, "Reds"))

Modifikasi parameter fill pada aes() agar pengelompokan dapat didasarkan pada warna isi. parameter alpha pada geom_density() untuk mengatur transparansi warna isi pada kurva denstitas.

ggGrouped2 <- ggplot(data = midwest, mapping = aes(x = percbelowpoverty,
                                                   color = state, fill = state))
ggGrouped2 + geom_density(adjust = 2, alpha = .1) + theme_classic()

scale_fill_manual() untuk menyesuaikan warna pada tiap isi kurva.

ggGrouped2 + geom_density(adjust = 2, alpha = .1) + theme_classic() +
    scale_color_manual(values = brewer.pal(n = 9, "Blues")[5:9]) +
    scale_fill_manual(values = brewer.pal(n = 9, "Blues")[5:9])

Data Sampel vs Sebaran Teoretis

stat_function() dapat digunakan untuk memasukkan suatu sebaran teoritis terhadap suatu sebaran sampel.

ggExamp3 <- ggplot(midwest, aes(x = percbelowpoverty))
ggExamp3 + geom_density(size = 1, color = "blue", adjust = 2) +
    theme_classic() +
    stat_function(
        fun = dnorm,
        args = with(midwest, c(mean = mean(percbelowpoverty),
                               sd = sd(percbelowpoverty))),
        size = 1, color = "red"
    )

Data visualisasi

library(ggplot2)
data(midwest)
df<-midwest

Violin plot sederhana

ggplot(df, aes(state, percollege)) + 
  geom_violin(alpha = 0.2)

Modifikasi dengan boxplot

ggplot(df, aes(state, percollege)) + 
  geom_boxplot() + 
  geom_violin(alpha = 0.2)

Modifikasi dengan summary

ggplot(df,aes(state,percollege))+
  geom_violin()+
  geom_boxplot(width=.1,fill="black",outlier.colour=NA)+
  stat_summary(fun.y=median,geom="point",fill="blue",shape=21,size=2.5)
## Warning: `fun.y` is deprecated. Use `fun` instead.

Modifikasi dengan kuantil

ggplot(df, aes(state, percollege)) + 
  geom_violin(draw_quantiles = 0.5) #kuantil 2

Modifikasi warna

ggplot(df, aes(state, percollege)) + 
  geom_violin(aes(col = state), fill = NA, draw_quantiles = c(0.25,0.5,0.75))  +
  # kuantil 1,2, dan 3
  labs(title = "Persentase Penduduk Berendidikan Perguruan Tinggi",
       subtitle = "Berdasarkan Negara Bagian",
       caption = "Data obtained from the ggplot2 package")

ggplot(df, aes(state, percollege)) + 
  geom_violin(aes(fill = state)) +
  scale_fill_viridis_d(option = "B") + 
  labs(title = "Persentase Penduduk Berendidikan Perguruan Tinggi",
       subtitle = "Berdasarkan Negara Bagian",
       caption = "Data obtained from the ggplot2 package")

Modifikasi dengan titik-titik amatan

ggplot(df, aes(state, percollege))+
  geom_violin(fill='lightblue',alpha=0.5)+
  geom_jitter(position = position_jitter(width = 0.2), col="red", size=1.5)

Modifikasi jenis garis plot

ggplot(df, aes(state, percollege))+
  geom_violin(fill='lightblue', col="darkred",alpha=0.5, linetype="dotted")

BAB 5 SCATTER PLOT`

df <- as.data.frame(ggplot2::midwest)
df2 <- df[c("percwhite", "percblack", "percamerindan", "percasian",
            "perchsd", "percollege", "percprof",
            "percpovertyknown", "percbelowpoverty", "percchildbelowpovert")]
head(df2)
##   percwhite  percblack percamerindan  percasian  perchsd percollege percprof
## 1  96.71206  2.5752761     0.1482826 0.37675897 75.10740   19.63139 4.355859
## 2  66.38434 32.9004329     0.1788067 0.45172219 59.72635   11.24331 2.870315
## 3  96.57128  2.8617170     0.2334734 0.10673071 69.33499   17.03382 4.488572
## 4  95.25417  0.4122574     0.1493216 0.48691813 75.47219   17.27895 4.197800
## 5  90.19877  9.3728581     0.2398903 0.08567512 68.86152   14.47600 3.367680
## 6  98.51210  0.1401031     0.1821340 0.54640215 76.62941   18.90462 3.275891
##   percpovertyknown percbelowpoverty percchildbelowpovert
## 1         96.27478        13.151443             18.01172
## 2         99.08714        32.244278             45.82651
## 3         94.95697        12.068844             14.03606
## 4         98.47757         7.209019             11.17954
## 5         82.50514        13.520249             13.02289
## 6         98.37200        10.399635             14.15882

Correlation Matrix

round(cor(df2), 2)
##                      percwhite percblack percamerindan percasian perchsd
## percwhite                 1.00     -0.76         -0.60     -0.32   -0.02
## percblack                -0.76      1.00         -0.05      0.32    0.01
## percamerindan            -0.60     -0.05          1.00     -0.06   -0.09
## percasian                -0.32      0.32         -0.06      1.00    0.47
## perchsd                  -0.02      0.01         -0.09      0.47    1.00
## percollege               -0.21      0.24         -0.08      0.75    0.78
## percprof                 -0.22      0.26         -0.09      0.79    0.63
## percpovertyknown          0.13     -0.14          0.01     -0.35   -0.12
## percbelowpoverty         -0.38      0.22          0.37     -0.04   -0.59
## percchildbelowpovert     -0.41      0.29          0.36     -0.15   -0.62
##                      percollege percprof percpovertyknown percbelowpoverty
## percwhite                 -0.21    -0.22             0.13            -0.38
## percblack                  0.24     0.26            -0.14             0.22
## percamerindan             -0.08    -0.09             0.01             0.37
## percasian                  0.75     0.79            -0.35            -0.04
## perchsd                    0.78     0.63            -0.12            -0.59
## percollege                 1.00     0.89            -0.27            -0.28
## percprof                   0.89     1.00            -0.40            -0.15
## percpovertyknown          -0.27    -0.40             1.00            -0.18
## percbelowpoverty          -0.28    -0.15            -0.18             1.00
## percchildbelowpovert      -0.36    -0.28            -0.02             0.95
##                      percchildbelowpovert
## percwhite                           -0.41
## percblack                            0.29
## percamerindan                        0.36
## percasian                           -0.15
## perchsd                             -0.62
## percollege                          -0.36
## percprof                            -0.28
## percpovertyknown                    -0.02
## percbelowpoverty                     0.95
## percchildbelowpovert                 1.00

Scatter Plot

plot(df2)

Individual Scatter Plot

plot(df$perchsd, df$percbelowpoverty)

Linear Model Fitting

fit1 <- lm(percbelowpoverty ~ perchsd, data = df)
coef(fit1)
## (Intercept)     perchsd 
##  50.7850813  -0.5174649

Model yang diperoleh: Y = 50.79 - 0.52*X

plot(df$perchsd, df$percbelowpoverty, cex = .5)
abline(fit1, col = "red", lwd = 2)

plot(df$perchsd, df$percbelowpoverty, cex = 1, pch = 16,
     col = rgb(.8, 0, 0, alpha = .5))
abline(fit1, col = "black", lwd = 2)

Group By Categories

myBlue <- rgb(0, 0, .8, .8)
myRed <- rgb(.8, 0, 0, .8)
col2 <- ifelse(df$inmetro == 1, myBlue, myRed)

plot(df$perchsd, df$percbelowpoverty, col = col2, pch = 16, cex = 1.25)
legend("topright", legend = c("In Metro Area", "Not In Metro Area"), pch = 16,
       col = c("blue", "red"))

More Than 2 Categories

map <- data.frame(find = c("IL", "IN", "MI", "OH", "WI"),
                  replace = c("darkred", "red", "purple", "blue", "darkblue"))
col <- as.character(map[match(df$state, map$find), "replace"])

data.frame(df$state, col)[c(100, 150, 200, 250, 300, 350, 400), ]
##     df.state      col
## 100       IL  darkred
## 150       IN      red
## 200       MI   purple
## 250       MI   purple
## 300       OH     blue
## 350       OH     blue
## 400       WI darkblue
plot(df$perchsd, df$percbelowpoverty, col = col, pch = 16)
legend("topright", legend = map$find, col = map$replace, pch = 16)

2D Scatter Plot Smoothing

plot(df$perchsd, df$percbelowpoverty, pch = 16,
     col = densCols(df$perchsd, df$percbelowpoverty, nbin = 3))
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'

plot(df$perchsd, df$percbelowpoverty, pch = 16,
     col = densCols(df$perchsd, df$percbelowpoverty, nbin = 60))

plot(df$perchsd, df$percbelowpoverty, pch = 16,
     col = densCols(df$perchsd, df$percbelowpoverty))

smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 30)

smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 30, col = NULL)

smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 50, col = NULL)

smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 200, col = NULL)

smoothScatter(df$perchsd, df$percbelowpoverty, nbin = 50,
              pch = 16, col = "red", nrpoints = 30)

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.1.2
plot(hexbin(df$perchsd, df$percbelowpoverty))

plot(hexbin(df$perchsd, df$percbelowpoverty),
     colramp = colorRampPalette(c("white", "blue")))

plot(hexbin(df$perchsd, df$percbelowpoverty),
     colramp = colorRampPalette(c("white", "skyblue", "blue")))

plot(hexbin(df$perchsd, df$percbelowpoverty),
     colramp = colorRampPalette(c("pink", "red", "black")))

Adding A 3rd Continuous Variable

Menggunakan ukuran sebagai dimensi ke-3:

plot(df$perchsd, df$percwhite, cex = df$percbelowpoverty/100*5)

# X     -> % of Highschool Diploma
# Y     -> % of White Population
# Size  -> % of Population Below Poverty Line

Menggunakan transparansi sebagai dimensi ke-3:

col2 <- rgb(1, 0, 0, df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, col = col2, pch = 16)

Menggunakan transparansi dan ukuran sebagai dimensi ke-3:

col2 <- rgb(1, 0, 0, df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, col = col2, cex = df$percbelowpoverty/100*5,
     pch = 16)

Menggunakan warna dan ukuran sebagai dimensi ke-3:

col3 <- rgb(df$percbelowpoverty/100*2,
            1 - df$percbelowpoverty/100*2,
            1 - df$percbelowpoverty/100*2)
plot(df$perchsd, df$percwhite, main = "Midwest Counties",
     xlab = "% of High School Diploma", ylab = "% of White",
     col = col3, cex = df$percbelowpoverty/100*5, pch = 16)

centerPoint <- c(5, 15, 25, 35, 45)
legend("bottomright", pch = 16, title = "% Below Poverty",
       pt.cex = centerPoint/100*5,
       col = rgb(centerPoint/50, 1 - centerPoint/50, 1 - centerPoint/50),
       legend = c("  0-10", "10-20", "20-30", "30-40", "40-50"))

3D Scatter Plot

library(rgl)
plot3d(df$perchsd, df$percwhite, df$percbelowpoverty)
BAB 6 title: “Korelasi menggunakan corrplot”
## Data Simulasi Data yang digunakan dalam simulasi syntax corrplot untuk menunjukkan korelasi antar peubah adalah data midwest yang tersedia dalam library ggplot2. Peubah yang akan digunakan dalam visualisasi korelasi menggunakan corrplot hanya peubah numerik.
r library(ggplot2) data(midwest) dt<-midwest[,4:26]
## Visualisasi corrplot ### Visualisasi sederhana
r library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
r names(dt)<-paste("X", 1:23) #inisialisasi untuk mempersingkan nama peubah cordt<-cor(dt, method = "pearson") #contoh: korelasi pearson corrplot(cordt)
### Modifikasi bentuk
r corrplot(cordt, title = "Correlation Plot", method = "square" )
### Modifikasi penggerombolan
r corrplot(cordt, method = "square", order = "hclust", addrect = 3)
### Modifikasi kemunculan nilai tertentu Pada umumnya, matriks korlasi akan menampilkan seluruh nilai korelasi dari peubah-peubah yang digunakan. Modifikasi berikut dapat digunaan ketika hanya ingin menampilkan nilai korelasi tertentu. Contoh yang akan disimulasikan yaitu hanya menampilkan nilai korelasi lebih dari atau sama dengan 0.7 baik berkorelasi secara negatif atau positif.
```r ind<-abs(cordt)<0.7 #fungsi abs() digunakan untuk merperhitungkan korelasi positif dan negatif #jika hanya ingin menampilkan korelasi positif saja, fungsi abs() dapat dihilangkan #ind adalah suatu objek sebagai index yang menandakan nilai korelasi kurang dari 0.7
cor_baru<-cordt cor_baru[ind]<-NA #nilai korelasi pada cor_baru yang bersesuaian posisi dengan ind akan diganti dengan “NA” sehingga yang akan tampil adalah nilai korelasi lebih dari atau sama dengan 0.7 baik korelasi positif maupun negatif
corrplot(cor_baru, na.label.col = “white”, type = “lower”) ```
r #data korelasi yang bernilai NA akan diberi warna "white" #argumen type= digunakan untuk menampilkan segitiga bawah atau atas saja dari matriks korelasi

BAB 7 title: “Tukey’s Median-Median Resistant Line”

Dataset

Dataset yang akan terdiri dari dua peubah, x (persentase populasi yang hidup di bawah garis kemiskinan) dan y (persentase populasi kulit putih). Setiap amatan mewakili data pada suatu County (setingkat di atas kota dan setingkat di bawah negara bagian) di kawasan Midwest, Amerika Serikat.

df <- ggplot2::midwest
x <- df$percwhite  # % of white population
y <- df$percbelowpoverty  # % of population below poverty line
head(cbind(x, y))
##             x         y
## [1,] 96.71206 13.151443
## [2,] 66.38434 32.244278
## [3,] 96.57128 12.068844
## [4,] 95.25417  7.209019
## [5,] 90.19877 13.520249
## [6,] 98.51210 10.399635
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
     main = "Midwest Dataset",
     xlab = "% of white population", ylab = "% below poverty line")

Regresi Linear OLS

OLS (Ordinary Least Squares) atau MKT (Metode Kuadrat Terkecil) adalah metode regresi parametrik yang paling umum digunakan dalam statistika untuk mencari besaran hubungan linear antar peubah. Berikut adalah koefisien persamaan yang dihasilkan oleh regresi OLS. Di bawahnya adalah grafik yang menggambarkan persamaan tersebut.

linReg <- lm(y ~ x)
coef(linReg)
## (Intercept)           x 
##  38.6604624  -0.2736541
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
     main = "Midwest Dataset",
     xlab = "% of white population", ylab = "% below poverty line")
abline(linReg, lwd = 2)
text(40, 35, "OLS: y = 38.66 - 0.27X", cex = .8)

Algoritma MML

STEP 1: Urutkan setiap data sesuai dengan peubah x-nya.

y <- y[order(x)]  # y dirutkan (sesuai dengan urutan dari x)
x <- sort(x)  # x diurutkan

head(cbind(x, y))  # sehingga hasil akhir tetap berpasangan
##             x        y
## [1,] 10.69409 48.69110
## [2,] 57.39520 20.06530
## [3,] 62.77972 14.19830
## [4,] 66.38434 32.24428
## [5,] 66.88821 30.18411
## [6,] 70.27065 13.80515

STEP 2: Data akan dibagi menjadi 3 kelompok, maka carilah panjang yang sesuai untuk tiap kelompok

n <- round(length(x)/3)
group1 <- 1:n
group2 <- (n + 1):(length(x) - n)
group3 <- (length(x) - n + 1):length(x)
##        min max length
## All      1 437    437
## Group1   1 146    146
## Group2 147 291    145
## Group3 292 437    146

STEP 3: Carilah median di setiap x & y per kelompok.

x1 <- median(x[group1])
y1 <- median(y[group1])

x2 <- median(x[group2])
y2 <- median(y[group2])

x3 <- median(x[group3])
y3 <- median(y[group3])
##                x        y
## Group 1 92.79061 13.13309
## Group 2 98.03274 11.10143
## Group 3 99.27902 11.51767
plot(x, y, col = rgb(0, 0, 0, .5), pch = 16, cex = .5,
     main = "Garis Resisten MML Akan Berusaha
Menghubungkan Ketiga Titik Median (warna merah)")
points(c(x1, x2, x3), c(y1, y2, y3), col = "red", pch = 16)

STEP 4: Carilah rataan untuk setiap median-median x dan y.

xbar <- mean(c(x1, x2, x3))
ybar <- mean(c(y1, y2, y3))

STEP 5: Carilah intercept dan slope dari garis MML.

b1 <- (y3 - y1)/(x3 - x1)
b0 <- ybar - b1*xbar
##     Intercept   Slope
## OLS   38.6605 -0.2737
## MML   35.9930 -0.2490
plot(x, y, col = rgb(1, 0, 0, .5), pch = 16, cex = .75,
     main = "Midwest Dataset",
     xlab = "% of white population", ylab = "% below poverty line")

abline(linReg, lwd = 2)  # visualisasi OLS
abline(b0, b1, col = "blue", lwd = 2)  # visualisasi MML
legend("bottomleft", legend = c("OLS", "MML"), col = c("black", "blue"),
       lty = 1)

text(40, 31.8, "y = 38.66 - 0.27X", cex = .8)
text(30, 25, "y = 35.99 - 0.25X", cex = .8, col = "blue")

Visualisasi OLS dan MML Menggunakan GGPlot

geom_point() untuk scatter plot.
geom_smooth() untuk OLS.
geom_abline() untuk MML.
scale_color_manual() untuk meanmpilkan legenda.
theme_bw() untuk mengubah tema.

library(ggplot2)

ggplot(data = df, mapping = aes(x = percwhite, y = percbelowpoverty)) +
    geom_point(colour = "red", alpha = .5, size = 1) +
    geom_smooth(method = "lm", formula = y ~ x, mapping = aes(col = "black"),
                size = 1.5, se = F) +
    geom_abline(mapping = aes(intercept = b0, slope = b1, col = "blue"),
                size = 1.5, show.legend = F) +
    scale_colour_manual(name = "",
                        labels = c("OLS", "MML"), 
                        values = c("black", "blue")) +
    theme_bw()