library(readr)
dt <- read_csv("insurance.csv", col_names = TRUE)
## Rows: 1338 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dt
## # A tibble: 1,338 x 7
## age sex bmi children smoker region charges
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 19 female 27.9 0 yes southwest 16885.
## 2 18 male 33.8 1 no southeast 1726.
## 3 28 male 33 3 no southeast 4449.
## 4 33 male 22.7 0 no northwest 21984.
## 5 32 male 28.9 0 no northwest 3867.
## 6 31 female 25.7 0 no southeast 3757.
## 7 46 female 33.4 1 no southeast 8241.
## 8 37 female 27.7 3 no northwest 7282.
## 9 37 male 29.8 2 no northeast 6406.
## 10 60 female 25.8 0 no northwest 28923.
## # ... with 1,328 more rows
#EXPLANATION The readr library is loaded so that its read_csv() function can be used to read the dataset.
dim(dt)
## [1] 1338 7
str(dt)
## spec_tbl_df [1,338 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr [1:1338] "female" "male" "male" "male" ...
## $ bmi : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
## $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr [1:1338] "yes" "no" "no" "no" ...
## $ region : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. bmi = col_double(),
## .. children = col_double(),
## .. smoker = col_character(),
## .. region = col_character(),
## .. charges = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#EXPLANATION dim() checks the dimensions of the dataset (rows and columns) and is typically used to see how many columns there are. str() shows the structure of the dataset: the contents of each column and its type, such as numeric or integer.
# Build a one-row-per-column overview of a data frame.
#
# Arguments:
#   df   - a data frame (base data.frame or tibble).
#   dgts - number of decimal digits used when rounding the two fractions.
#
# Returns a data.frame with one row per column of `df` and the columns:
#   variable - column name
#   type     - first element of the column's class vector
#   levels   - number of distinct values (NA counts as a value when present)
#   topLevel - most frequent value, as a string
#   topCount - how many times topLevel occurs
#   topFrac  - topCount divided by the number of rows, rounded to `dgts`
#   missFreq - number of entries that are NA, "" or " "
#   missFrac - missFreq divided by the number of rows, rounded to `dgts`
BasicSummary <- function(df, dgts = 3) {
  m <- ncol(df)
  varNames <- colnames(df)
  varType <- character(m)
  topLevel <- character(m)
  topCount <- numeric(m)
  missCount <- numeric(m)
  nLevels <- numeric(m)
  for (i in seq_len(m)) {
    # `[[` always extracts the column vector. `df[, i]` on a tibble returns a
    # one-column tibble, whose class vector has several elements and does not
    # describe the column itself.
    x <- df[[i]]
    # A class vector can have length > 1 (e.g. ordered factors); keep the
    # first entry so the assignment is always length one.
    varType[i] <- class(x)[1L]
    xtab <- table(x, useNA = "ifany")
    nLevels[i] <- length(xtab)
    if (length(xtab) > 0) {
      counts <- as.numeric(xtab)
      topCount[i] <- max(counts)
      topLevel[i] <- names(xtab)[which.max(counts)]
    } else {
      # Zero-row input: there is no most frequent value.
      topCount[i] <- 0
      topLevel[i] <- NA_character_
    }
    # `%in%` never yields NA, so NA entries are counted exactly once.
    isMissing <- is.na(x) | as.character(x) %in% c("", " ")
    missCount[i] <- sum(isMissing)
  }
  n <- nrow(df)
  topFrac <- round(topCount / n, digits = dgts)
  missFrac <- round(missCount / n, digits = dgts)
  data.frame(variable = varNames, type = varType,
             levels = nLevels, topLevel = topLevel,
             topCount = topCount, topFrac = topFrac,
             missFreq = missCount, missFrac = missFrac)
}
BasicSummary(dt)
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
## variable type levels topLevel topCount topFrac missFreq missFrac
## 1 age tbl_df 47 18 69 0.052 0 0
## 2 sex tbl_df 2 male 676 0.505 0 0
## 3 bmi tbl_df 548 32.3 13 0.010 0 0
## 4 children tbl_df 6 0 574 0.429 0 0
## 5 smoker tbl_df 2 no 1064 0.795 0 0
## 6 region tbl_df 4 southeast 364 0.272 0 0
## 7 charges tbl_df 1337 1639.5631 2 0.001 0 0
#EXPLANATION 1. This summary shows each variable's type, how often its most frequent value occurs, and so on. 2. It also shows that the dataset contains more men than women. 3. charges has the largest number of levels but the smallest top count.
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(dt)
## dt
##
## 7 Variables 1338 Observations
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 47 0.999 39.21 16.21 18 19
## .25 .50 .75 .90 .95
## 27 39 51 59 62
##
## lowest : 18 19 20 21 22, highest: 60 61 62 63 64
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 1338 0 2
##
## Value female male
## Frequency 662 676
## Proportion 0.495 0.505
## --------------------------------------------------------------------------------
## bmi
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 548 1 30.66 6.893 21.26 22.99
## .25 .50 .75 .90 .95
## 26.30 30.40 34.69 38.62 41.11
##
## lowest : 15.960 16.815 17.195 17.290 17.385, highest: 48.070 49.060 50.380 52.580 53.130
## --------------------------------------------------------------------------------
## children
## n missing distinct Info Mean Gmd
## 1338 0 6 0.899 1.095 1.275
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 574 324 240 157 25 18
## Proportion 0.429 0.242 0.179 0.117 0.019 0.013
## --------------------------------------------------------------------------------
## smoker
## n missing distinct
## 1338 0 2
##
## Value no yes
## Frequency 1064 274
## Proportion 0.795 0.205
## --------------------------------------------------------------------------------
## region
## n missing distinct
## 1338 0 4
##
## Value northeast northwest southeast southwest
## Frequency 324 325 364 325
## Proportion 0.242 0.243 0.272 0.243
## --------------------------------------------------------------------------------
## charges
## n missing distinct Info Mean Gmd .05 .10
## 1338 0 1337 1 13270 12301 1758 2347
## .25 .50 .75 .90 .95
## 4740 9382 16640 34832 41182
##
## lowest : 1121.874 1131.507 1135.941 1136.399 1137.011
## highest: 55135.402 58571.074 60021.399 62592.873 63770.428
## --------------------------------------------------------------------------------
#EXPLANATION describe() is used to show details about the dataset.
library(car)
## Loading required package: carData
qqPlot(dt$charges)
## [1] 544 1301
#EXPLANATION qqPlot() is used to display and reveal anomalies present in the dataset.
# Find the positions of observations whose charges exceed 60000, then print
# the row names of dt at those positions.
outlierIndex <- which(dt$charges > 60000)
rownames(dt)[outlierIndex]
## [1] "544" "1231" "1301"
# Outlier limits from the "three-sigma" rule: mean +/- t standard deviations.
#
# Arguments:
#   x - numeric vector; missing values are ignored.
#   t - multiplier applied to the standard deviation.
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the standard deviation is exactly zero.
ThreeSigma <- function(x, t = 3) {
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  if (spread == 0) {
    message("All non-missing x-values are identical")
  }
  halfWidth <- t * spread
  list(up = centre + halfWidth, down = centre - halfWidth)
}
# Outlier limits from the Hampel identifier: median +/- t times the MAD
# scale estimate.
#
# Arguments:
#   x - numeric vector; missing values are ignored.
#   t - multiplier applied to the MAD scale estimate.
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the MAD scale estimate is exactly zero.
Hampel <- function(x, t = 3) {
  centre <- median(x, na.rm = TRUE)
  spread <- mad(x, na.rm = TRUE)
  if (spread == 0) {
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  halfWidth <- t * spread
  list(up = centre + halfWidth, down = centre - halfWidth)
}
# Outlier limits from the boxplot rule: the fences lie t interquartile
# distances outside the quartiles.
#
# Arguments:
#   x - numeric vector; missing values are ignored.
#   t - multiplier applied to the interquartile distance.
#
# Returns a list with elements:
#   up   - upper quartile + t * interquartile distance
#   down - lower quartile - t * interquartile distance
# Emits a message when the interquartile distance is exactly zero.
BoxplotRule <- function(x, t = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  xL <- quartiles[1]
  xU <- quartiles[2]
  Q <- xU - xL
  if (Q == 0) {
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up <- xU + t * Q
  # The lower fence is measured from the lower quartile, not the upper one.
  down <- xL - t * Q
  list(up = up, down = down)
}
# Classify the elements of x against a pair of outlier limits.
#
# Arguments:
#   x    - numeric vector.
#   down - lower limit; values strictly below it are flagged "L".
#   up   - upper limit; values strictly above it are flagged "U".
#
# Returns a list with:
#   nOut     - number of flagged elements
#   lowLim   - the lower limit passed in
#   upLim    - the upper limit passed in
#   minNom   - smallest element that is >= down
#   maxNom   - largest element that is <= up
#   index    - positions of the flagged elements (low ones first, then high)
#   values   - the flagged elements themselves
#   outClass - "L" or "U" for each flagged element
ExtractDetails <- function(x, down, up) {
  belowIdx <- which(x < down)
  aboveIdx <- which(x > up)

  # Tag every position first, then keep only the flagged ones.
  tags <- rep("N", length(x))
  tags[belowIdx] <- "L"
  tags[aboveIdx] <- "U"
  flagged <- union(belowIdx, aboveIdx)

  notTooHigh <- x[which(x <= up)]
  notTooLow <- x[which(x >= down)]

  list(
    nOut = length(flagged),
    lowLim = down,
    upLim = up,
    minNom = min(notTooLow),
    maxNom = max(notTooHigh),
    index = flagged,
    values = x[flagged],
    outClass = tags[flagged]
  )
}
# Apply the three-sigma rule, the Hampel identifier and the boxplot rule to x
# and collect the results.
#
# Arguments:
#   x  - numeric vector to screen.
#   t3 - threshold multiplier passed to ThreeSigma().
#   tH - threshold multiplier passed to Hampel().
#   tb - threshold multiplier passed to BoxplotRule().
#
# Returns a list with:
#   summary     - data.frame, one row per method (method, n, nMiss, nOut,
#                 lowLim, upLim, minNom, maxNom)
#   threeSigma  - data.frame (index, values, type) of points flagged by the
#                 three-sigma rule
#   Hampel      - same, for the Hampel identifier
#   boxplotRule - same, for the boxplot rule
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5) {
  # Limits are computed for all three methods before any classification.
  limsThree <- ThreeSigma(x, t = t3)
  limsHampel <- Hampel(x, t = tH)
  limsBox <- BoxplotRule(x, t = tb)

  n <- length(x)
  nMiss <- sum(is.na(x))

  detThree <- ExtractDetails(x, limsThree$down, limsThree$up)
  detHampel <- ExtractDetails(x, limsHampel$down, limsHampel$up)
  detBox <- ExtractDetails(x, limsBox$down, limsBox$up)

  # One summary row for a single method.
  summaryRow <- function(label, det) {
    data.frame(method = label, n = n,
               nMiss = nMiss, nOut = det$nOut,
               lowLim = det$lowLim,
               upLim = det$upLim,
               minNom = det$minNom,
               maxNom = det$maxNom)
  }

  # Per-point listing of what a single method flagged.
  pointFrame <- function(det) {
    data.frame(index = det$index,
               values = det$values,
               type = det$outClass)
  }

  sumFrame <- rbind.data.frame(
    summaryRow("ThreeSigma", detThree),
    summaryRow("Hampel", detHampel),
    summaryRow("BoxplotRule", detBox)
  )

  list(summary = sumFrame,
       threeSigma = pointFrame(detThree),
       Hampel = pointFrame(detHampel),
       boxplotRule = pointFrame(detBox))
}
# Run FindOutliers() on the charges column with its default thresholds and
# print the per-method summary table from the result.
fullSummary <- FindOutliers(dt$charges)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1338 0 7 -23059.611 49600.46 1121.874 49577.66
## 2 Hampel 1338 0 155 -12940.395 31704.46 1121.874 31620.00
## 3 BoxplotRule 1338 0 139 -1209.526 34489.35 1121.874 34472.84
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.