R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print per-column summary statistics for the cars dataset.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

  1. Soal visualisasi melihat hubungan mpg vs wt
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
# Scatter plot of fuel economy (mpg) against car weight (wt) from mtcars.
ggplot(data = mtcars, mapping = aes(wt, mpg)) +
  geom_point(colour = "pink", size = 3) +
  ggtitle("Scatter Plot: mpg vs wt") +
  xlab("Weight (1000 lbs)") +
  ylab("Miles per Gallon") +
  theme_minimal()

  1. Soal melihat weight dan time dalam dataset ChickWeight
library(ggplot2)
# One faint line per chick, overlaid with the mean weight at each time point.
ggplot(ChickWeight, aes(x = Time, y = weight, group = Chick)) +
  geom_line(alpha = 0.3) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
  stat_summary(aes(group = 1), fun = "mean", geom = "line",
               linewidth = 1.5, color = "orange") +
  labs(title = "melihat weight dan time",
       x = "Waktu (hari)",
       y = "Berat (gram)") +
  theme_minimal()

  1. Soal menghitung korelasi antara mpg dan cyl dalam dataset mtcars
# Correlation between fuel economy (mpg) and cylinder count (cyl).
data(mtcars)
with(mtcars, cor(mpg, cyl))
## [1] -0.852162

Jadi, nilai korelasi -0,85 menunjukkan korelasi negatif yang kuat antara mpg dan cyl.

# Base-graphics scatter plot of mpg against number of cylinders.
with(
  mtcars,
  plot(cyl, mpg,
       pch = 19, col = "green",
       main = "Korelasi mpg vs cyl",
       xlab = "Jumlah Silinder (cyl)",
       ylab = "Miles per Gallon (mpg)")
)

  1. Soal penanganan missing value di kolom ozone dalam dataset airquality
data(airquality)
# Count the missing Ozone readings before imputing.
sum(is.na(airquality$Ozone))
## [1] 37
# Median imputation: missing Ozone values are filled with the median of the
# observed ones; the original column is left untouched for comparison.
med_ozone <- median(airquality$Ozone, na.rm = TRUE)
airquality$Ozone_imputed <- replace(
  airquality$Ozone,
  is.na(airquality$Ozone),
  med_ozone
)
head(airquality[, c("Ozone", "Ozone_imputed")])
##   Ozone Ozone_imputed
## 1    41          41.0
## 2    36          36.0
## 3    12          12.0
## 4    18          18.0
## 5    NA          31.5
## 6    28          28.0
  1. Soal membedakan distribusi sepal length antar species dalam dataset iris
library(ggplot2)
# Box plot of sepal length for each iris species, one fill colour per species.
ggplot(data = iris, mapping = aes(Species, Sepal.Length, fill = Species)) +
  geom_boxplot() +
  ggtitle("perbedaan distribusi sepal length per spesies") +
  xlab("Species") +
  ylab("Sepal Length (cm)") +
  theme_minimal()

  1. Soal mengecek total penumpang yang selamat (survived 1)
library(titanic)
## Warning: package 'titanic' was built under R version 4.5.3
# Load the dataset explicitly before its first use (it was previously loaded
# only after the sum had already been computed).
data("titanic_train")
# Survivors are flagged with 1 (per the task statement), so summing the
# Survived column gives the number of passengers who survived.
sum(titanic_train$Survived)
## [1] 342
  1. Soal korelasi eruptions dan waiting
data(faithful)
# Correlation between eruption duration and waiting time.
with(faithful, cor(eruptions, waiting))
## [1] 0.9008112
# Scatter plot of waiting time against eruption duration.
with(
  faithful,
  plot(eruptions, waiting,
       pch = 19, col = "brown",
       main = "Hubungan Eruptions vs Waiting",
       xlab = "Durasi Erupsi (menit)",
       ylab = "Waktu Tunggu (menit)")
)

  1. Soal mengetahui tipe data kolom cut pada dataset diamonds
library(ggplot2)
# Class of the cut column: the output shows it is an ordered factor.
class(diamonds$cut)
## [1] "ordered" "factor"
# Category labels of cut, in level order.
levels(diamonds$cut)
## [1] "Fair"      "Good"      "Very Good" "Premium"   "Ideal"
# Compact view confirming the ordering Fair < Good < ... and the first values.
str(diamonds$cut)
##  Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
  1. Soal struktur data mtcars
# Load mtcars and display its structure (dimensions, column names and types).
data(mtcars)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Per-column summary statistics for every variable in mtcars.
summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# Pairwise correlation matrix for mpg, hp and wt.
cor(mtcars[c("mpg", "hp", "wt")])
##            mpg         hp         wt
## mpg  1.0000000 -0.7761684 -0.8676594
## hp  -0.7761684  1.0000000  0.6587479
## wt  -0.8676594  0.6587479  1.0000000
# Base-graphics scatter plot of mpg against weight.
with(
  mtcars,
  plot(wt, mpg,
       xlab = "Weight (1000 lbs)", ylab = "MPG",
       main = "hubungan antara Weight vs MPG")
)

  1. Soal visualisasi terbaik untuk melihat distribusi price berdasarkan cut dalam dataset diamonds
library(ggplot2)
# Box plot of diamond price for each cut category, one fill colour per cut.
ggplot(data = diamonds, mapping = aes(cut, price, fill = cut)) +
  geom_boxplot() +
  ggtitle("distribusi price berdasarkan cut") +
  xlab("cut (kualitas potongan)") +
  ylab("price (USD)") +
  theme_minimal()

Jadi, kesimpulannya: Fair dan Good harganya paling murah dan variasinya sempit. Ideal harganya menengah, tetapi memiliki banyak sekali outlier mahal. Premium dan Very Good harganya lebih tinggi dengan sebaran yang lebar. Intinya, hampir semua kategori mempunyai outlier dengan harga yang sangat tinggi.