# Load the built-in Titanic contingency table (counts cross-classified by
# Class, Sex, Age and Survived) into the workspace.
data(Titanic)
# Flatten the table into a long data frame: one row per cell of the table,
# with the cell count stored in the `Freq` column.
titanic <- as.data.frame(Titanic, responseName = "Freq")
# Display the original table.
print(Titanic)
## , , Age = Child, Survived = No
##
## Sex
## Class Male Female
## 1st 0 0
## 2nd 0 0
## 3rd 35 17
## Crew 0 0
##
## , , Age = Adult, Survived = No
##
## Sex
## Class Male Female
## 1st 118 4
## 2nd 154 13
## 3rd 387 89
## Crew 670 3
##
## , , Age = Child, Survived = Yes
##
## Sex
## Class Male Female
## 1st 5 1
## 2nd 11 13
## 3rd 13 14
## Crew 0 0
##
## , , Age = Adult, Survived = Yes
##
## Sex
## Class Male Female
## 1st 57 140
## 2nd 14 80
## 3rd 75 76
## Crew 192 20
# Read the house-price CSV from the working directory; the first row of the
# file supplies the column names.
databaru <- read.csv(file = "house_price.csv", header = TRUE)
# Preview the first six rows.
head(databaru, n = 6L)
## date price bedrooms bathrooms sqft_living sqft_lot floors
## 1 2014-05-02 00:00:00 313000 3 1.50 1340 7912 1.5
## 2 2014-05-02 00:00:00 2384000 5 2.50 3650 9050 2.0
## 3 2014-05-02 00:00:00 342000 3 2.00 1930 11947 1.0
## 4 2014-05-02 00:00:00 420000 3 2.25 2000 8030 1.0
## 5 2014-05-02 00:00:00 550000 4 2.50 1940 10500 1.0
## 6 2014-05-02 00:00:00 490000 2 1.00 880 6380 1.0
## waterfront view condition sqft_above sqft_basement yr_built yr_renovated
## 1 0 0 3 1340 0 1955 2005
## 2 0 4 5 3370 280 1921 0
## 3 0 0 4 1930 0 1966 0
## 4 0 0 4 1000 1000 1963 0
## 5 0 0 4 1140 800 1976 1992
## 6 0 0 3 880 0 1938 1994
## street city statezip country
## 1 18810 Densmore Ave N Shoreline WA 98133 USA
## 2 709 W Blaine St Seattle WA 98119 USA
## 3 26206-26214 143rd Ave SE Kent WA 98042 USA
## 4 857 170th Pl NE Bellevue WA 98008 USA
## 5 9105 170th Ave NE Redmond WA 98052 USA
## 6 522 NE 88th St Seattle WA 98115 USA
# Inspect the structure of `titanic`: dimensions, column types and factor
# levels.
str(titanic)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...
# Per-column summary: level counts for the factor columns and quartiles plus
# mean for the numeric `Freq` column.
summary(titanic)
## Class Sex Age Survived Freq
## 1st :8 Male :16 Child:16 No :16 Min. : 0.00
## 2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
## 3rd :8 Median : 13.50
## Crew:8 Mean : 68.78
## 3rd Qu.: 77.00
## Max. :670.00
# Count missing values (NA) in each column of `titanic`.
colSums(is.na(titanic))
## Class Sex Age Survived Freq
## 0 0 0 0 0
# Count missing values (NA) in each column of `airquality` before imputation.
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
# Replace NA in the Ozone column with the median of the observed values.
airquality$Ozone <- replace(
  airquality$Ozone,
  is.na(airquality$Ozone),
  median(airquality$Ozone, na.rm = TRUE)
)
# Replace NA in the Solar.R column with the median of the observed values.
airquality$Solar.R <- replace(
  airquality$Solar.R,
  is.na(airquality$Solar.R),
  median(airquality$Solar.R, na.rm = TRUE)
)
# Check again whether any missing values remain.
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 0 0 0 0 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the Class, Sex and Survived columns.
titanic_selected <- titanic %>%
  select(Class, Sex, Survived)
# Preview the first rows of the reduced data frame.
head(titanic_selected)
## Class Sex Survived
## 1 1st Male No
## 2 2nd Male No
## 3 3rd Male No
## 4 Crew Male No
## 5 1st Female No
## 6 2nd Female No
library(dplyr)
# Keep only the rows describing children.
titanic_child <- titanic %>%
  filter(Age == "Child")
# Sort by frequency in ascending order (smallest to largest).
titanic_sorted_asc <- titanic %>%
  arrange(Freq)
head(titanic_sorted_asc)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 Crew Male Child No 0
## 4 1st Female Child No 0
## 5 2nd Female Child No 0
## 6 Crew Female Child No 0
# Sort by frequency in descending order (largest first) and preview the top
# rows.
titanic_sorted_desc <- titanic %>%
  arrange(desc(Freq))
head(titanic_sorted_desc)
## Class Sex Age Survived Freq
## 1 Crew Male Adult No 670
## 2 3rd Male Adult No 387
## 3 Crew Male Adult Yes 192
## 4 2nd Male Adult No 154
## 5 1st Female Adult Yes 140
## 6 1st Male Adult No 118
# Rename the `Age` column to `Umur`; all other columns are left unchanged.
titanic_rename <- titanic %>%
  rename(Umur = Age)
# Print the full renamed data frame.
print(titanic_rename)
## Class Sex Umur Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
## 7 3rd Female Child No 17
## 8 Crew Female Child No 0
## 9 1st Male Adult No 118
## 10 2nd Male Adult No 154
## 11 3rd Male Adult No 387
## 12 Crew Male Adult No 670
## 13 1st Female Adult No 4
## 14 2nd Female Adult No 13
## 15 3rd Female Adult No 89
## 16 Crew Female Adult No 3
## 17 1st Male Child Yes 5
## 18 2nd Male Child Yes 11
## 19 3rd Male Child Yes 13
## 20 Crew Male Child Yes 0
## 21 1st Female Child Yes 1
## 22 2nd Female Child Yes 13
## 23 3rd Female Child Yes 14
## 24 Crew Female Child Yes 0
## 25 1st Male Adult Yes 57
## 26 2nd Male Adult Yes 14
## 27 3rd Male Adult Yes 75
## 28 Crew Male Adult Yes 192
## 29 1st Female Adult Yes 140
## 30 2nd Female Adult Yes 80
## 31 3rd Female Adult Yes 76
## 32 Crew Female Adult Yes 20
# Add `Proporsi`: each row's share of the total frequency across all rows.
titanic_mutate <- titanic %>%
  mutate(Proporsi = Freq / sum(Freq))
head(titanic_mutate)
## Class Sex Age Survived Freq Proporsi
## 1 1st Male Child No 0 0.00000000
## 2 2nd Male Child No 0 0.00000000
## 3 3rd Male Child No 35 0.01590186
## 4 Crew Male Child No 0 0.00000000
## 5 1st Female Child No 0 0.00000000
## 6 2nd Female Child No 0 0.00000000
# Lookup table mapping each passenger class to the ship name.
# `Class` is built as a factor with the same levels as `titanic$Class`:
# joining a factor key to a character key makes dplyr fall back to a
# character key in the result, silently dropping the factor levels.
extra <- data.frame(
  Class = factor(
    c("1st", "2nd", "3rd", "Crew"),
    levels = levels(titanic$Class)
  ),
  Kapal = rep("Titanic", 4L)
)
# Attach the ship name to every row of `titanic`, matching on Class.
titanic_joined <- left_join(titanic, extra, by = "Class")
head(titanic_joined)
## Class Sex Age Survived Freq Kapal
## 1 1st Male Child No 0 Titanic
## 2 2nd Male Child No 0 Titanic
## 3 3rd Male Child No 35 Titanic
## 4 Crew Male Child No 0 Titanic
## 5 1st Female Child No 0 Titanic
## 6 2nd Female Child No 0 Titanic
# Total frequency for every Class x Survived combination.
# `.groups = "drop"` returns an ungrouped result, so later verbs applied to
# `titanic_summary` are not silently computed per Class, and summarise() no
# longer emits its "grouped output" message.
titanic_summary <- titanic %>%
  group_by(Class, Survived) %>%
  summarise(total = sum(Freq), .groups = "drop")
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Draw 70% of the row indices (rounded down) for the training set.
# seq_len() is used instead of 1:nrow() so an empty data frame yields an empty
# index rather than c(1, 0), and floor() makes the truncation of the
# fractional sample size explicit.
n_rows <- nrow(titanic)
index <- sample(seq_len(n_rows), floor(0.7 * n_rows))
train_data <- titanic[index, ]
# Select the remaining rows by position; unlike titanic[-index, ], this still
# returns every row when `index` is empty.
test_data <- titanic[setdiff(seq_len(n_rows), index), ]
# Report the size of each partition.
nrow(train_data)
nrow(test_data)
## [1] 22
## [1] 10