library(ggplot2)

# EDA pada dataset mtcars

# Print the compact structure of mtcars: dimensions, column types and the
# first few values of every column.
utils::str(object = mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Print the six-number summary (min, quartiles, mean, max) of every column.
summary(object = mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

# Hubungan mpg vs wt

library(ggplot2)
# Scatter plot of fuel efficiency (mpg) against weight (wt) for mtcars,
# with a straight trend line drawn on top of the points.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  # Draw one point per car.
  geom_point(color = "darkblue", size = 3) +
  # Add a linear-model trend line, without the confidence band.
  geom_smooth(color = "red", se = FALSE, method = "lm") +
  labs(
    title = "Visualisasi mpg vs wt",
    x = "Berat (wt)",
    y = "Efisiensi (mpg)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Dataset ChickWeight

# Mean chick weight at each measurement time, as a data frame with the
# columns Time and weight.
avg_weight <- aggregate(weight ~ Time, data = ChickWeight, FUN = mean)

# Line-and-point plot of the mean weight over time.
with(
  avg_weight,
  plot(
    Time, weight,
    type = "o",
    pch = 16,
    col = "blue",
    xlab = "Waktu (Hari)",
    ylab = "Rata-rata Berat",
    main = "Tren Berat Anak Ayam terhadap Waktu"
  )
)

# Membedakan distribusi Sepal.Length antara spesies dalam dataset iris

# Compare the Sepal.Length distribution across iris species: one box per
# species, each with its own fill colour.
graphics::boxplot(
  Sepal.Length ~ Species,
  data = iris,
  col = c("orange", "green", "purple"),
  xlab = "Spesies",
  ylab = "Sepal Length",
  main = "Distribusi Sepal Length per Spesies"
)

# Missing values (dataset airquality)

# Preview the first six rows of airquality.
utils::head(x = airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# Count the missing values in each airquality column.
airquality |>
  is.na() |>
  colSums()
##   Ozone Solar.R    Wind    Temp   Month     Day 
##      37       7       0       0       0       0
# Keep only the airquality rows that have no missing value in any column.
clean_data <- stats::na.omit(object = airquality)

# Report how many rows and columns remain.
dim(x = clean_data)
## [1] 111   6

# Data diamonds

# Load the diamonds dataset (shipped with ggplot2) into the workspace.
data(list = "diamonds")
# Preview the first six rows.
utils::head(x = diamonds)
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

# Tipe data kolom cut pada dataset diamonds

# Check the class of the cut column (an ordered factor, per the output below).
class(diamonds[["cut"]])
## [1] "ordered" "factor"
# Show its levels and their ordering.
utils::str(diamonds[["cut"]])
##  Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...

# Korelasi erupsi (dataset faithful)

# Scatter plot of waiting time against eruption duration from the faithful
# dataset.
plot(
  x = faithful[["eruptions"]],
  y = faithful[["waiting"]],
  main = "Korelasi Eruptions vs Waiting",
  xlab = "Durasi Eruptions",
  ylab = "Waiting Time"
)
# Overlay the fitted simple linear regression line (waiting on eruptions) in red.
abline(reg = lm(waiting ~ eruptions, data = faithful), col = "red")

# Compare the price distribution of diamonds across the cut grades,
# one box per grade.
graphics::boxplot(
  price ~ cut,
  data = diamonds,
  col = "lightblue",
  xlab = "Cut",
  ylab = "Price",
  main = "Distribusi Price berdasarkan Cut"
)

# Korelasi antara mpg dan cyl

# Correlation coefficient (cor()'s default method) between fuel efficiency
# and cylinder count.
with(mtcars, cor(x = mpg, y = cyl))
## [1] -0.852162

# Titanic

# Attach the titanic package.
# NOTE(review): `Titanic` used below is the contingency table from base R's
# datasets package; nothing visible here uses the titanic package itself.
# Confirm it is needed elsewhere before keeping this dependency.
library(titanic)
## Warning: package 'titanic' was built under R version 4.5.3
# Total passenger counts by survival status; margin 4 is the Survived
# dimension of the table.
margin.table(x = Titanic, margin = 4)
## Survived
##   No  Yes 
## 1490  711