#LATIHAN WEEK 4 “Data Preparation”
#IMPORT DATA
titanic <- as.data.frame(Titanic)
titanic
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
## 7 3rd Female Child No 17
## 8 Crew Female Child No 0
## 9 1st Male Adult No 118
## 10 2nd Male Adult No 154
## 11 3rd Male Adult No 387
## 12 Crew Male Adult No 670
## 13 1st Female Adult No 4
## 14 2nd Female Adult No 13
## 15 3rd Female Adult No 89
## 16 Crew Female Adult No 3
## 17 1st Male Child Yes 5
## 18 2nd Male Child Yes 11
## 19 3rd Male Child Yes 13
## 20 Crew Male Child Yes 0
## 21 1st Female Child Yes 1
## 22 2nd Female Child Yes 13
## 23 3rd Female Child Yes 14
## 24 Crew Female Child Yes 0
## 25 1st Male Adult Yes 57
## 26 2nd Male Adult Yes 14
## 27 3rd Male Adult Yes 75
## 28 Crew Male Adult Yes 192
## 29 1st Female Adult Yes 140
## 30 2nd Female Adult Yes 80
## 31 3rd Female Adult Yes 76
## 32 Crew Female Adult Yes 20
databaru <- read.csv("house_price.csv")
head(databaru)
## date price bedrooms bathrooms sqft_living sqft_lot floors
## 1 2014-05-02 00:00:00 313000 3 1.50 1340 7912 1.5
## 2 2014-05-02 00:00:00 2384000 5 2.50 3650 9050 2.0
## 3 2014-05-02 00:00:00 342000 3 2.00 1930 11947 1.0
## 4 2014-05-02 00:00:00 420000 3 2.25 2000 8030 1.0
## 5 2014-05-02 00:00:00 550000 4 2.50 1940 10500 1.0
## 6 2014-05-02 00:00:00 490000 2 1.00 880 6380 1.0
## waterfront view condition sqft_above sqft_basement yr_built yr_renovated
## 1 0 0 3 1340 0 1955 2005
## 2 0 4 5 3370 280 1921 0
## 3 0 0 4 1930 0 1966 0
## 4 0 0 4 1000 1000 1963 0
## 5 0 0 4 1140 800 1976 1992
## 6 0 0 3 880 0 1938 1994
## street city statezip country
## 1 18810 Densmore Ave N Shoreline WA 98133 USA
## 2 709 W Blaine St Seattle WA 98119 USA
## 3 26206-26214 143rd Ave SE Kent WA 98042 USA
## 4 857 170th Pl NE Bellevue WA 98008 USA
## 5 9105 170th Ave NE Redmond WA 98052 USA
## 6 522 NE 88th St Seattle WA 98115 USA
#DATA STRUCTURE #lihat struktur data, data berupa frekuensi level
str(titanic)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...
#ringkasan statistik, berupa daftar berapa kali muncul, kuartil, mean, nilai max, dll
summary(titanic)
## Class Sex Age Survived Freq
## 1st :8 Male :16 Child:16 No :16 Min. : 0.00
## 2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
## 3rd :8 Median : 13.50
## Crew:8 Mean : 68.78
## 3rd Qu.: 77.00
## Max. :670.00
#MISSING VALUE #cek apakah ada missing value pada data titanic
colSums(is.na(titanic))
## Class Sex Age Survived Freq
## 0 0 0 0 0
#cek apakah ada missing value pada data airquality
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
#cek apakah ada missing value pada data house price
colSums(is.na(databaru))
## date price bedrooms bathrooms sqft_living
## 0 0 0 0 0
## sqft_lot floors waterfront view condition
## 0 0 0 0 0
## sqft_above sqft_basement yr_built yr_renovated street
## 0 0 0 0 0
## city statezip country
## 0 0 0
#REPLACE MISSING VALUE #ganti NA pada kolom Ozone dengan median
airquality$Ozone[is.na(airquality$Ozone)] <- median(airquality$Ozone, na.rm = TRUE)
airquality$Solar.R[is.na(airquality$Solar.R)] <- median(airquality$Solar.R, na.rm = TRUE)
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 0 0 0 0 0
#SELECT DATA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#memanggil hanya kolom tertentu saja data titanic
titanic_selected <- select(titanic, Sex)
head(titanic_selected)
## Sex
## 1 Male
## 2 Male
## 3 Male
## 4 Male
## 5 Female
## 6 Female
#memanggil hanya kolom tertentu saja data airquality
airquality_selected <- select(airquality, Ozone)
head(airquality_selected)
## Ozone
## 1 41.0
## 2 36.0
## 3 12.0
## 4 18.0
## 5 31.5
## 6 28.0
#memanggil hanya kolom tertentu saja data house price
databaru_selected <- select(databaru, price)
head(databaru_selected)
## price
## 1 313000
## 2 2384000
## 3 342000
## 4 420000
## 5 550000
## 6 490000
#FILTER AND SORT DATA
library(dplyr)
#filter penumpang anak-anak
titanic_child <- filter(titanic, Age =="Child")
head(titanic_child)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
#filter penumpang dewasa perempuan
titanic_female_adult <- filter(titanic, Sex == "Female", Age == "Adult")
head(titanic_female_adult)
## Class Sex Age Survived Freq
## 1 1st Female Adult No 4
## 2 2nd Female Adult No 13
## 3 3rd Female Adult No 89
## 4 Crew Female Adult No 3
## 5 1st Female Adult Yes 140
## 6 2nd Female Adult Yes 80
#mengurutkan berdasarkan frekuensi (ascending, kecil ke besar)
titanic_sorted_asc <- arrange(titanic, Freq)
head(titanic_sorted_asc)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 Crew Male Child No 0
## 4 1st Female Child No 0
## 5 2nd Female Child No 0
## 6 Crew Female Child No 0
#mengurutkan berdasarkan frekuensi (descending, dari besar ke kecil)
titanic_sorted_desc <- arrange(titanic, Freq)
head(titanic_sorted_desc)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 Crew Male Child No 0
## 4 1st Female Child No 0
## 5 2nd Female Child No 0
## 6 Crew Female Child No 0
#RENAME AND MUTATE #ganti nama kolom
titanic_rename <- rename(titanic, Usia = Age, Kelas = Class)
titanic_rename
## Kelas Sex Usia Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
## 7 3rd Female Child No 17
## 8 Crew Female Child No 0
## 9 1st Male Adult No 118
## 10 2nd Male Adult No 154
## 11 3rd Male Adult No 387
## 12 Crew Male Adult No 670
## 13 1st Female Adult No 4
## 14 2nd Female Adult No 13
## 15 3rd Female Adult No 89
## 16 Crew Female Adult No 3
## 17 1st Male Child Yes 5
## 18 2nd Male Child Yes 11
## 19 3rd Male Child Yes 13
## 20 Crew Male Child Yes 0
## 21 1st Female Child Yes 1
## 22 2nd Female Child Yes 13
## 23 3rd Female Child Yes 14
## 24 Crew Female Child Yes 0
## 25 1st Male Adult Yes 57
## 26 2nd Male Adult Yes 14
## 27 3rd Male Adult Yes 75
## 28 Crew Male Adult Yes 192
## 29 1st Female Adult Yes 140
## 30 2nd Female Adult Yes 80
## 31 3rd Female Adult Yes 76
## 32 Crew Female Adult Yes 20
#menambahkan kolom proporsi
titanic_mutate <- mutate(titanic, Proporsi = Freq / sum(Freq))
titanic_mutate
## Class Sex Age Survived Freq Proporsi
## 1 1st Male Child No 0 0.0000000000
## 2 2nd Male Child No 0 0.0000000000
## 3 3rd Male Child No 35 0.0159018628
## 4 Crew Male Child No 0 0.0000000000
## 5 1st Female Child No 0 0.0000000000
## 6 2nd Female Child No 0 0.0000000000
## 7 3rd Female Child No 17 0.0077237619
## 8 Crew Female Child No 0 0.0000000000
## 9 1st Male Adult No 118 0.0536119945
## 10 2nd Male Adult No 154 0.0699681963
## 11 3rd Male Adult No 387 0.1758291686
## 12 Crew Male Adult No 670 0.3044070877
## 13 1st Female Adult No 4 0.0018173557
## 14 2nd Female Adult No 13 0.0059064062
## 15 3rd Female Adult No 89 0.0404361654
## 16 Crew Female Adult No 3 0.0013630168
## 17 1st Male Child Yes 5 0.0022716947
## 18 2nd Male Child Yes 11 0.0049977283
## 19 3rd Male Child Yes 13 0.0059064062
## 20 Crew Male Child Yes 0 0.0000000000
## 21 1st Female Child Yes 1 0.0004543389
## 22 2nd Female Child Yes 13 0.0059064062
## 23 3rd Female Child Yes 14 0.0063607451
## 24 Crew Female Child Yes 0 0.0000000000
## 25 1st Male Adult Yes 57 0.0258973194
## 26 2nd Male Adult Yes 14 0.0063607451
## 27 3rd Male Adult Yes 75 0.0340754203
## 28 Crew Male Adult Yes 192 0.0872330759
## 29 1st Female Adult Yes 140 0.0636074512
## 30 2nd Female Adult Yes 80 0.0363471149
## 31 3rd Female Adult Yes 76 0.0345297592
## 32 Crew Female Adult Yes 20 0.0090867787
#JOIN DATA #untuk menggabungkan dua dataset berbeda #buat data tambahan #banyak data tambahan disesuaikan dengan banyaknya variabel di kolom class
extra <- data.frame(Class = c("1st", "2nd", "3rd", "Crew"), Bonus = c("Jake", "Heeseung", "Jay", "Seungcheol"))
#memasukkan/join data ke dataset titanic
titanic_joined <- left_join(titanic, extra, by = "Class")
titanic_joined
## Class Sex Age Survived Freq Bonus
## 1 1st Male Child No 0 Jake
## 2 2nd Male Child No 0 Heeseung
## 3 3rd Male Child No 35 Jay
## 4 Crew Male Child No 0 Seungcheol
## 5 1st Female Child No 0 Jake
## 6 2nd Female Child No 0 Heeseung
## 7 3rd Female Child No 17 Jay
## 8 Crew Female Child No 0 Seungcheol
## 9 1st Male Adult No 118 Jake
## 10 2nd Male Adult No 154 Heeseung
## 11 3rd Male Adult No 387 Jay
## 12 Crew Male Adult No 670 Seungcheol
## 13 1st Female Adult No 4 Jake
## 14 2nd Female Adult No 13 Heeseung
## 15 3rd Female Adult No 89 Jay
## 16 Crew Female Adult No 3 Seungcheol
## 17 1st Male Child Yes 5 Jake
## 18 2nd Male Child Yes 11 Heeseung
## 19 3rd Male Child Yes 13 Jay
## 20 Crew Male Child Yes 0 Seungcheol
## 21 1st Female Child Yes 1 Jake
## 22 2nd Female Child Yes 13 Heeseung
## 23 3rd Female Child Yes 14 Jay
## 24 Crew Female Child Yes 0 Seungcheol
## 25 1st Male Adult Yes 57 Jake
## 26 2nd Male Adult Yes 14 Heeseung
## 27 3rd Male Adult Yes 75 Jay
## 28 Crew Male Adult Yes 192 Seungcheol
## 29 1st Female Adult Yes 140 Jake
## 30 2nd Female Adult Yes 80 Heeseung
## 31 3rd Female Adult Yes 76 Jay
## 32 Crew Female Adult Yes 20 Seungcheol
#GROUP AND SUMMARIZE #hitung total penumpang selamat vs tidak berdasarkan kelas
titanic_summary <- titanic %>%
group_by(Class, Survived) %>%
summarise(total = sum(Freq))
## `summarise()` has grouped output by 'Class'. You can override using the
## `.groups` argument.
titanic_summary
## # A tibble: 8 × 3
## # Groups: Class [4]
## Class Survived total
## <fct> <fct> <dbl>
## 1 1st No 122
## 2 1st Yes 203
## 3 2nd No 167
## 4 2nd Yes 118
## 5 3rd No 528
## 6 3rd Yes 178
## 7 Crew No 673
## 8 Crew Yes 212
#hitung banyaknya penumpang laki-laki dan perempuan berdasarkan umur #kayanya fungsi pipe operator tuh sama kaya endl di C++ deh
titanic_summary <- titanic %>%
group_by(Age, Sex) %>%
summarise(total = sum(Freq))
## `summarise()` has grouped output by 'Age'. You can override using the `.groups`
## argument.
titanic_summary
## # A tibble: 4 × 3
## # Groups: Age [2]
## Age Sex total
## <fct> <fct> <dbl>
## 1 Child Male 64
## 2 Child Female 45
## 3 Adult Male 1667
## 4 Adult Female 425
#SPLIT DATA #buat indeks sampling 50%
index <- sample(1:nrow(titanic), 0.5*nrow(titanic))
train_data <- titanic[index,]
test_data <- titanic[-index,]
nrow(train_data); nrow(test_data)
## [1] 16
## [1] 16