library(readr)
## Warning: package 'readr' was built under R version 4.1.3
# Load the insurance data set from CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# line only works where that exact file exists; consider a project-relative path.
insurance <- read_csv("C:\\Users\\user\\OneDrive\\Documents\\R\\insurance.csv")
## Rows: 1338 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Report the dimensions of the data set (number of rows, number of columns).
dim(insurance)
## [1] 1338 7
Ada 1338 instance dan 7 variable dalam dataset Insurance
# Show the structure of the data set: each column's storage type and its
# first few values.
str(insurance)
## spec_tbl_df [1,338 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr [1:1338] "female" "male" "male" "male" ...
## $ bmi : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
## $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr [1:1338] "yes" "no" "no" "no" ...
## $ region : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. bmi = col_double(),
## .. children = col_double(),
## .. smoker = col_character(),
## .. region = col_character(),
## .. charges = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
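Terdapat 7 variabel dengan tipe data yang berbeda-beda, yaitu age (numeric), sex (character), bmi (numeric), children (numeric), smoker (character), region (character), dan charges (numeric). Variabel age dan children berisi bilangan bulat, tetapi tersimpan sebagai numeric (double), bukan integer.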
# Mean of age, bmi and children (columns 1, 3 and 4), ignoring missing values.
sapply(insurance[, c("age", "bmi", "children")], mean, na.rm = TRUE)
## age bmi children
## 39.207025 30.663397 1.094918
# Quantiles (0%, 25%, 50%, 75%, 100%) of age, bmi and children (columns 1, 3
# and 4), ignoring missing values.
sapply(insurance[, c("age", "bmi", "children")], quantile, na.rm = TRUE)
## age bmi children
## 0% 18 15.96000 0
## 25% 27 26.29625 0
## 50% 39 30.40000 1
## 75% 51 34.69375 2
## 100% 64 53.13000 5
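Dari data di atas, terlihat nilai rata-rata serta kuartil pada variabel age, bmi, dan children: rata-rata age sekitar 39,2 tahun, rata-rata bmi sekitar 30,7, dan rata-rata children sekitar 1,1.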
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-variable summary from Hmisc: count, missing and distinct values, plus
# mean/quantiles for numeric columns and frequency tables for categorical ones.
describe(insurance)
## insurance
##
## 7 Variables 1338 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 47 0.999 39.21 16.21 18 19
## .25 .50 .75 .90 .95
## 27 39 51 59 62
##
## lowest : 18 19 20 21 22, highest: 60 61 62 63 64
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 1338 0 2
##
## Value female male
## Frequency 662 676
## Proportion 0.495 0.505
## --------------------------------------------------------------------------------
## bmi
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 548 1 30.66 6.893 21.26 22.99
## .25 .50 .75 .90 .95
## 26.30 30.40 34.69 38.62 41.11
##
## lowest : 15.960 16.815 17.195 17.290 17.385, highest: 48.070 49.060 50.380 52.580 53.130
## --------------------------------------------------------------------------------
## children
## n missing distinct Info Mean Gmd
## 1338 0 6 0.899 1.095 1.275
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 574 324 240 157 25 18
## Proportion 0.429 0.242 0.179 0.117 0.019 0.013
## --------------------------------------------------------------------------------
## smoker
## n missing distinct
## 1338 0 2
##
## Value no yes
## Frequency 1064 274
## Proportion 0.795 0.205
## --------------------------------------------------------------------------------
## region
## n missing distinct
## 1338 0 4
##
## Value northeast northwest southeast southwest
## Frequency 324 325 364 325
## Proportion 0.242 0.243 0.272 0.243
## --------------------------------------------------------------------------------
## charges
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 1337 1 13270 12301 1758 2347
## .25 .50 .75 .90 .95
## 4740 9382 16640 34832 41182
##
## lowest : 1121.874 1131.507 1135.941 1136.399 1137.011
## highest: 55135.402 58571.074 60021.399 62592.873 63770.428
## --------------------------------------------------------------------------------
# Base R summary: min, quartiles, median, mean and max for numeric columns;
# length, class and mode for character columns.
summary(insurance)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
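Dari ringkasan di atas, dapat dilihat nilai minimum, kuartil (Q1, median/Q2, Q3), maksimum, dan rata-rata untuk variabel numerik (age, bmi, children, charges) pada data insurance. Untuk variabel bertipe character (sex, smoker, region) hanya ditampilkan panjang, kelas, dan mode-nya.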
library(car)
## Warning: package 'car' was built under R version 4.1.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.3
# Quantile-quantile plot of BMI (car::qqPlot). The printed indices are the
# observations the plot labels as most extreme.
qqPlot(insurance$bmi)
## [1] 1318 1048
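Dari visualisasi di atas, dapat dilihat sebagian besar nilai BMI berada di rentang 18-50, dengan beberapa titik ekstrem di luar rentang tersebut (observasi ke-1318 dan ke-1048 ditandai oleh qqPlot).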
# Flag observations whose BMI exceeds 50, keep their row positions, and show
# the matching row names.
bmi_above_50 <- insurance$bmi > 50
outlierIndex <- which(bmi_above_50)
rownames(insurance)[outlierIndex]
## [1] "848" "1048" "1318"
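Terdapat 3 data outlier atas, yaitu observasi dengan BMI di atas 50, yang berada pada baris ke-848, 1048, dan 1318. Angka tersebut adalah nomor baris, bukan nilai BMI; nilai BMI ketiga observasi itu adalah 50,38; 52,58; dan 53,13.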
# Cross-tabulate BMI against smoker status. BMI is used as-is (not binned),
# so the table has one row per distinct BMI value.
relation <- table(insurance[["bmi"]], insurance[["smoker"]])
print(relation)
##
## no yes
## 15.96 1 0
## 16.815 2 0
## 17.195 0 1
## 17.29 2 1
## 17.385 1 0
## 17.4 1 0
## 17.48 1 0
## 17.67 1 0
## 17.765 0 1
## 17.8 1 0
## 17.86 1 0
## 17.955 0 1
## 18.05 1 0
## 18.3 0 1
## 18.335 3 0
## 18.5 1 0
## 18.6 1 0
## 18.715 1 0
## 18.905 1 0
## 19 1 0
## 19.095 0 1
## 19.19 1 0
## 19.3 0 1
## 19.475 1 0
## 19.57 1 0
## 19.8 2 1
## 19.855 2 0
## 19.95 5 1
## 20.045 0 2
## 20.1 1 0
## 20.13 0 1
## 20.235 3 1
## 20.3 1 0
## 20.35 1 0
## 20.4 1 0
## 20.425 1 0
## 20.52 1 1
## 20.6 2 0
## 20.615 1 0
## 20.7 1 0
## 20.79 1 0
## 20.8 2 0
## 20.9 1 1
## 21.01 1 0
## 21.09 1 0
## 21.12 1 0
## 21.28 1 0
## 21.3 1 0
## 21.375 2 0
## 21.4 2 0
## 21.47 3 0
## 21.5 1 0
## 21.56 1 0
## 21.565 0 1
## 21.66 2 1
## 21.7 0 1
## 21.755 3 1
## 21.78 2 0
## 21.8 0 1
## 21.85 2 2
## 21.89 1 0
## 21.945 1 0
## 22 1 0
## 22.04 1 0
## 22.1 1 0
## 22.135 4 0
## 22.22 0 1
## 22.23 2 0
## 22.3 2 0
## 22.42 2 1
## 22.515 5 0
## 22.6 1 1
## 22.61 3 1
## 22.705 3 0
## 22.77 1 0
## 22.8 2 0
## 22.88 0 1
## 22.895 1 3
## 22.99 1 2
## 23 1 0
## 23.085 2 0
## 23.1 1 0
## 23.18 5 0
## 23.2 1 0
## 23.21 4 1
## 23.275 1 0
## 23.3 1 0
## 23.32 1 0
## 23.37 2 1
## 23.4 2 0
## 23.465 2 0
## 23.54 1 0
## 23.56 2 0
## 23.6 2 0
## 23.65 1 0
## 23.655 2 3
## 23.7 2 0
## 23.75 3 0
## 23.76 0 1
## 23.8 1 0
## 23.845 2 1
## 23.87 1 0
## 23.9 1 0
## 23.94 1 0
## 23.98 2 1
## 24.035 3 0
## 24.09 1 0
## 24.1 2 0
## 24.13 2 2
## 24.225 3 0
## 24.3 3 0
## 24.31 2 0
## 24.32 6 1
## 24.4 0 1
## 24.415 1 0
## 24.42 0 3
## 24.51 3 0
## 24.53 1 0
## 24.6 2 1
## 24.605 3 1
## 24.64 0 1
## 24.7 3 1
## 24.75 0 1
## 24.795 2 2
## 24.86 2 0
## 24.89 0 1
## 24.97 1 0
## 24.985 2 0
## 25 2 0
## 25.08 5 0
## 25.1 0 1
## 25.175 5 1
## 25.2 1 0
## 25.27 2 1
## 25.3 3 2
## 25.365 3 0
## 25.4 1 0
## 25.41 0 1
## 25.46 7 0
## 25.52 1 0
## 25.555 2 1
## 25.6 3 1
## 25.65 1 0
## 25.7 1 1
## 25.74 4 0
## 25.745 3 0
## 25.8 7 0
## 25.84 4 1
## 25.85 0 1
## 25.9 2 1
## 25.935 3 0
## 26.03 4 1
## 26.07 0 1
## 26.125 3 1
## 26.18 2 0
## 26.2 1 0
## 26.22 4 0
## 26.29 0 1
## 26.315 5 0
## 26.4 3 1
## 26.41 5 1
## 26.505 1 0
## 26.51 2 0
## 26.6 5 1
## 26.62 2 0
## 26.695 3 2
## 26.7 1 1
## 26.73 2 0
## 26.79 2 0
## 26.8 2 0
## 26.84 1 1
## 26.885 2 2
## 26.9 1 0
## 26.98 2 1
## 27 1 0
## 27.06 0 1
## 27.075 1 0
## 27.1 3 1
## 27.17 2 0
## 27.2 2 0
## 27.265 4 0
## 27.28 0 1
## 27.3 0 1
## 27.36 5 2
## 27.4 2 0
## 27.455 2 0
## 27.5 6 0
## 27.55 4 0
## 27.6 4 1
## 27.61 1 0
## 27.645 6 1
## 27.7 2 1
## 27.72 4 0
## 27.74 4 2
## 27.8 0 1
## 27.83 3 1
## 27.835 4 1
## 27.9 0 1
## 27.93 4 0
## 27.94 2 1
## 28 2 1
## 28.025 2 3
## 28.05 3 0
## 28.1 2 0
## 28.12 2 2
## 28.16 1 0
## 28.2 1 0
## 28.215 3 1
## 28.27 2 0
## 28.3 0 2
## 28.31 5 4
## 28.38 0 1
## 28.4 2 0
## 28.405 2 0
## 28.49 0 1
## 28.5 3 2
## 28.595 6 0
## 28.6 3 0
## 28.69 2 1
## 28.7 5 0
## 28.785 5 0
## 28.8 1 0
## 28.82 1 0
## 28.88 7 1
## 28.9 5 0
## 28.93 2 1
## 28.975 4 1
## 29 2 0
## 29.04 1 0
## 29.07 0 2
## 29.1 1 0
## 29.15 1 1
## 29.165 1 0
## 29.2 1 0
## 29.26 4 0
## 29.3 2 0
## 29.355 2 0
## 29.37 2 0
## 29.4 1 0
## 29.45 1 0
## 29.48 3 0
## 29.5 1 0
## 29.545 1 0
## 29.59 2 0
## 29.6 4 0
## 29.64 5 0
## 29.7 4 1
## 29.735 4 0
## 29.8 3 1
## 29.81 2 2
## 29.83 4 2
## 29.9 3 0
## 29.92 5 1
## 29.925 2 1
## 30 1 1
## 30.02 3 1
## 30.03 3 0
## 30.1 2 0
## 30.115 6 0
## 30.14 2 0
## 30.2 4 2
## 30.21 3 1
## 30.25 1 1
## 30.3 3 0
## 30.305 2 0
## 30.36 0 1
## 30.4 4 1
## 30.495 6 2
## 30.5 4 0
## 30.59 7 0
## 30.685 1 2
## 30.69 2 0
## 30.78 3 2
## 30.8 6 2
## 30.875 7 1
## 30.9 2 1
## 30.97 1 0
## 31 2 0
## 31.02 2 1
## 31.065 3 1
## 31.1 1 0
## 31.13 3 1
## 31.16 3 1
## 31.2 1 0
## 31.24 1 0
## 31.255 3 0
## 31.3 0 1
## 31.35 6 2
## 31.4 1 2
## 31.445 2 0
## 31.46 2 0
## 31.5 2 0
## 31.54 2 0
## 31.57 2 0
## 31.6 3 0
## 31.635 2 0
## 31.68 0 2
## 31.73 4 2
## 31.79 3 1
## 31.8 1 0
## 31.825 4 1
## 31.9 3 0
## 31.92 3 2
## 32 1 0
## 32.01 2 0
## 32.015 1 1
## 32.1 1 0
## 32.11 7 0
## 32.12 1 0
## 32.2 2 1
## 32.205 2 0
## 32.23 2 0
## 32.3 12 1
## 32.34 2 0
## 32.395 5 0
## 32.4 1 0
## 32.45 0 1
## 32.49 1 1
## 32.5 1 0
## 32.56 1 1
## 32.585 1 0
## 32.6 2 0
## 32.67 2 0
## 32.68 4 0
## 32.7 1 1
## 32.775 5 2
## 32.78 0 1
## 32.8 2 1
## 32.87 1 0
## 32.9 2 1
## 32.965 4 0
## 33 6 0
## 33.06 1 0
## 33.1 4 0
## 33.11 2 2
## 33.155 5 0
## 33.2 1 0
## 33.25 3 0
## 33.3 2 0
## 33.33 6 1
## 33.345 5 0
## 33.4 2 1
## 33.44 4 0
## 33.5 0 1
## 33.535 2 1
## 33.55 1 0
## 33.63 4 2
## 33.66 5 0
## 33.7 3 0
## 33.725 2 0
## 33.77 2 0
## 33.8 0 1
## 33.82 4 0
## 33.88 2 1
## 33.915 4 0
## 33.99 2 0
## 34.01 1 0
## 34.1 7 1
## 34.105 2 2
## 34.2 3 2
## 34.21 3 1
## 34.295 1 0
## 34.3 1 0
## 34.32 3 0
## 34.39 1 1
## 34.4 3 1
## 34.43 3 1
## 34.485 1 1
## 34.5 1 0
## 34.58 2 0
## 34.6 1 1
## 34.675 1 0
## 34.7 1 1
## 34.77 3 0
## 34.8 5 2
## 34.865 2 0
## 34.87 1 0
## 34.9 0 1
## 34.96 1 2
## 35.09 0 1
## 35.1 1 0
## 35.15 1 0
## 35.2 5 2
## 35.245 2 0
## 35.3 2 2
## 35.31 2 0
## 35.4 1 0
## 35.42 1 0
## 35.435 1 0
## 35.5 0 1
## 35.53 3 3
## 35.6 0 1
## 35.625 3 1
## 35.64 1 0
## 35.7 1 0
## 35.72 2 0
## 35.75 1 2
## 35.8 2 0
## 35.815 4 0
## 35.86 3 1
## 35.9 1 0
## 35.91 2 0
## 35.97 3 1
## 36 2 0
## 36.005 1 0
## 36.08 2 2
## 36.1 3 0
## 36.19 2 1
## 36.195 1 0
## 36.2 2 0
## 36.29 1 0
## 36.3 2 2
## 36.385 1 1
## 36.4 0 1
## 36.48 2 1
## 36.52 1 0
## 36.575 2 0
## 36.6 1 0
## 36.63 2 1
## 36.67 2 2
## 36.7 1 0
## 36.765 2 1
## 36.85 4 1
## 36.86 2 1
## 36.955 2 2
## 36.96 0 1
## 37 2 0
## 37.05 2 1
## 37.07 1 2
## 37.1 6 0
## 37.145 1 0
## 37.18 2 0
## 37.29 4 0
## 37.3 1 0
## 37.335 2 0
## 37.4 3 0
## 37.43 3 0
## 37.51 2 0
## 37.525 1 0
## 37.62 1 1
## 37.7 0 1
## 37.715 1 0
## 37.73 2 0
## 37.8 0 1
## 37.9 1 0
## 37.905 1 0
## 38 3 0
## 38.06 4 3
## 38.095 2 1
## 38.17 2 1
## 38.19 1 0
## 38.28 3 0
## 38.285 1 0
## 38.38 2 0
## 38.39 1 2
## 38.6 2 0
## 38.665 1 0
## 38.83 3 0
## 38.9 1 0
## 38.94 1 1
## 38.95 0 1
## 39.05 2 1
## 39.1 1 0
## 39.14 1 0
## 39.16 3 0
## 39.2 1 0
## 39.27 1 0
## 39.33 1 0
## 39.4 0 1
## 39.425 1 0
## 39.49 3 0
## 39.5 2 0
## 39.52 1 0
## 39.6 3 0
## 39.615 1 0
## 39.7 2 0
## 39.71 1 0
## 39.8 1 0
## 39.805 2 0
## 39.82 3 0
## 39.9 0 1
## 39.93 1 0
## 39.995 1 0
## 40.15 2 1
## 40.185 2 0
## 40.26 2 0
## 40.28 2 0
## 40.3 1 0
## 40.37 1 1
## 40.375 1 0
## 40.47 1 0
## 40.48 1 0
## 40.5 1 0
## 40.565 1 2
## 40.66 1 0
## 40.81 1 0
## 40.92 0 1
## 40.945 1 0
## 41.1 1 0
## 41.14 1 1
## 41.23 2 0
## 41.325 3 0
## 41.42 1 0
## 41.47 3 0
## 41.69 1 0
## 41.8 1 1
## 41.895 0 1
## 41.91 3 0
## 42.13 2 2
## 42.24 0 1
## 42.35 0 1
## 42.4 2 0
## 42.46 1 0
## 42.655 1 0
## 42.68 1 0
## 42.75 0 1
## 42.9 1 1
## 42.94 1 0
## 43.01 1 0
## 43.12 1 0
## 43.34 1 0
## 43.4 1 0
## 43.7 1 0
## 43.89 1 1
## 44 1 0
## 44.22 2 0
## 44.7 1 0
## 44.745 1 0
## 44.77 1 0
## 44.88 0 1
## 45.32 1 0
## 45.43 1 0
## 45.54 0 1
## 45.9 1 0
## 46.09 1 0
## 46.2 0 1
## 46.53 3 0
## 46.7 1 0
## 46.75 1 0
## 47.41 0 1
## 47.52 1 0
## 47.6 0 1
## 47.74 1 0
## 48.07 1 0
## 49.06 1 0
## 50.38 1 0
## 52.58 0 1
## 53.13 1 0
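Tabel di atas menampilkan jumlah perokok dan bukan perokok untuk setiap nilai BMI. Dari tabel ini tidak terlihat relasi yang jelas antara BMI dan status merokok: baik perokok maupun bukan perokok tersebar di seluruh rentang BMI. Sebagian besar non-smoker (bukan perokok) justru memiliki BMI di atas rentang normal (18,5-25), karena kuartil pertama BMI saja sudah 26,3.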
# Scatterplot matrix: every column of the data set plotted against every other.
plot(insurance)
Dari visualisasi di atas, dapat dilihat hanya sedikit pasangan variabel yang memiliki hubungan linear yang jelas.
# Arrange three panels on one page: a full-width panel on top, two below.
# layout() returns the number of panels; it is kept in `df` as before.
df <- layout(matrix(c(1, 1, 2, 3), nrow = 2, byrow = TRUE))

# Panel 1: smoker status against BMI category. BMI is continuous, so it is
# binned first; a mosaic plot of the raw values gets one tile per distinct
# BMI value and cannot be read.
bmi_group <- cut(
  insurance$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("<18.5", "18.5-25", "25-30", ">=30"),
  right = FALSE
)
mosaicplot(
  table(smoker = insurance$smoker, bmi = bmi_group),
  main = "", las = 1, shade = TRUE
)

# Panel 2: scatterplot of age against BMI, with readable axis labels.
plot(insurance$bmi, insurance$age, xlab = "bmi", ylab = "age")

# Panel 3: distribution of charges per region.
boxplot(charges ~ region, data = insurance, xlab = "region", ylab = "charges")

# Restore the single-panel layout so later plots are not drawn into the grid.
layout(1)
Dapat dilihat visualisasi hubungan antar variabel pada data insurance, menggunakan mosaic plot (smoker terhadap bmi), scatter plot (bmi terhadap age), dan boxplot (charges per region).
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.