library(readr)
dt <- read_csv("insurance.csv", col_names = TRUE)
## Rows: 1338 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dt
## # A tibble: 1,338 x 7
##      age sex      bmi children smoker region    charges
##    <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>       <dbl>
##  1    19 female  27.9        0 yes    southwest  16885.
##  2    18 male    33.8        1 no     southeast   1726.
##  3    28 male    33          3 no     southeast   4449.
##  4    33 male    22.7        0 no     northwest  21984.
##  5    32 male    28.9        0 no     northwest   3867.
##  6    31 female  25.7        0 no     southeast   3757.
##  7    46 female  33.4        1 no     southeast   8241.
##  8    37 female  27.7        3 no     northwest   7282.
##  9    37 male    29.8        2 no     northeast   6406.
## 10    60 female  25.8        0 no     northwest  28923.
## # ... with 1,328 more rows

#EXPLANATION I load the readr library so that its read_csv() function can be used to read the dataset.

dim(dt)
## [1] 1338    7
str(dt)
## spec_tbl_df [1,338 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age     : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr [1:1338] "female" "male" "male" "male" ...
##  $ bmi     : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
##  $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr [1:1338] "yes" "no" "no" "no" ...
##  $ region  : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   sex = col_character(),
##   ..   bmi = col_double(),
##   ..   children = col_double(),
##   ..   smoker = col_character(),
##   ..   region = col_character(),
##   ..   charges = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

#EXPLANATION dim() checks the dimensions of the dataset (number of rows and columns); str() shows the structure of the dataset: each column's contents and its type, such as numeric or character.

# Build a one-row-per-column summary of a data frame.
#
# Arguments:
#   df   - a data.frame or tibble.
#   dgts - number of digits used when rounding the fractions (default 3).
#
# Returns a data.frame with columns:
#   variable - column name
#   type     - first element of the column's class vector
#   levels   - number of distinct values (NA counted as a value if present)
#   topLevel - most frequent value, as a string
#   topCount - number of occurrences of topLevel
#   topFrac  - topCount divided by the number of rows
#   missFreq - number of entries that are NA, "" or " "
#   missFrac - missFreq divided by the number of rows
BasicSummary <- function(df, dgts = 3) {
  m <- ncol(df)
  varNames <- colnames(df)
  varType <- vector("character", m)
  topLevel <- vector("character", m)
  topCount <- vector("numeric", m)
  missCount <- vector("numeric", m)
  levels <- vector("numeric", m)

  # seq_len() avoids the c(1, 0) iteration that 1:m produces when m == 0.
  for (i in seq_len(m)) {
    # `[[` always yields the column vector itself; `df[, i]` on a tibble
    # returns a one-column tibble, whose class is c("tbl_df", "tbl",
    # "data.frame") rather than the class of the column.
    x <- df[[i]]
    # A class vector may have several elements (e.g. ordered factors);
    # keep only the first so the assignment is always length one.
    varType[i] <- class(x)[1L]
    xtab <- table(x, useNA = "ifany")
    levels[i] <- length(xtab)
    if (length(xtab) == 0) {
      # Column with no values at all: there is no most-frequent level.
      topCount[i] <- 0
      topLevel[i] <- NA_character_
    } else {
      nums <- as.numeric(xtab)
      maxIndex <- which.max(nums)
      topCount[i] <- nums[maxIndex]
      topLevel[i] <- names(xtab)[maxIndex]
    }
    # Missing means NA, an empty string, or a single blank.
    missCount[i] <- sum(is.na(x) | x %in% c("", " "))
  }

  n <- nrow(df)
  topFrac <- round(topCount / n, digits = dgts)
  missFrac <- round(missCount / n, digits = dgts)

  summaryFrame <- data.frame(
    variable = varNames, type = varType,
    levels = levels, topLevel = topLevel,
    topCount = topCount, topFrac = topFrac,
    missFreq = missCount, missFrac = missFrac
  )
  return(summaryFrame)
}

BasicSummary(dt)
## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length

## Warning in varType[i] <- class(x): number of items to replace is not a multiple
## of replacement length
##   variable   type levels  topLevel topCount topFrac missFreq missFrac
## 1      age tbl_df     47        18       69   0.052        0        0
## 2      sex tbl_df      2      male      676   0.505        0        0
## 3      bmi tbl_df    548      32.3       13   0.010        0        0
## 4 children tbl_df      6         0      574   0.429        0        0
## 5   smoker tbl_df      2        no     1064   0.795        0        0
## 6   region tbl_df      4 southeast      364   0.272        0        0
## 7  charges tbl_df   1337 1639.5631        2   0.001        0        0

#EXPLANATION 1. From this summary we can see each variable's type, how often its most frequent value occurs, and so on. 2. We can also see that the dataset contains more men than women. 3. charges has the largest number of levels but the smallest top count.

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
describe(dt)
## dt 
## 
##  7  Variables      1338  Observations
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0       47    0.999    39.21    16.21       18       19 
##      .25      .50      .75      .90      .95 
##       27       39       51       59       62 
## 
## lowest : 18 19 20 21 22, highest: 60 61 62 63 64
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##     1338        0        2 
##                         
## Value      female   male
## Frequency     662    676
## Proportion  0.495  0.505
## --------------------------------------------------------------------------------
## bmi 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0      548        1    30.66    6.893    21.26    22.99 
##      .25      .50      .75      .90      .95 
##    26.30    30.40    34.69    38.62    41.11 
## 
## lowest : 15.960 16.815 17.195 17.290 17.385, highest: 48.070 49.060 50.380 52.580 53.130
## --------------------------------------------------------------------------------
## children 
##        n  missing distinct     Info     Mean      Gmd 
##     1338        0        6    0.899    1.095    1.275 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##                                               
## Value          0     1     2     3     4     5
## Frequency    574   324   240   157    25    18
## Proportion 0.429 0.242 0.179 0.117 0.019 0.013
## --------------------------------------------------------------------------------
## smoker 
##        n  missing distinct 
##     1338        0        2 
##                       
## Value         no   yes
## Frequency   1064   274
## Proportion 0.795 0.205
## --------------------------------------------------------------------------------
## region 
##        n  missing distinct 
##     1338        0        4 
##                                                   
## Value      northeast northwest southeast southwest
## Frequency        324       325       364       325
## Proportion     0.242     0.243     0.272     0.243
## --------------------------------------------------------------------------------
## charges 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1338        0     1337        1    13270    12301     1758     2347 
##      .25      .50      .75      .90      .95 
##     4740     9382    16640    34832    41182 
## 
## lowest :  1121.874  1131.507  1135.941  1136.399  1137.011
## highest: 55135.402 58571.074 60021.399 62592.873 63770.428
## --------------------------------------------------------------------------------

#EXPLANATION describe() is used to give a detailed summary of every variable in the dataset.

library(car)
## Loading required package: carData
qqPlot(dt$charges)

## [1]  544 1301

#EXPLANATION qqPlot() is used to display and reveal the anomalies that are present in the dataset.

# Positions of the observations whose charges exceed 60000.
outlierIndex <- which(dt$charges > 60000)
# Show the row names of dt at those positions.
rownames(dt)[outlierIndex]
## [1] "544"  "1231" "1301"
# Three-sigma outlier limits: mean plus/minus t standard deviations.
#
# Arguments:
#   x - numeric vector; missing values are dropped before estimation.
#   t - multiplier applied to the standard deviation (default 3).
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the standard deviation is zero.
ThreeSigma <- function(x, t = 3) {
  center <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)

  if (spread == 0) {
    message("All non-missing x-values are identical")
  }

  halfWidth <- t * spread
  list(up = center + halfWidth, down = center - halfWidth)
}

# Hampel identifier limits: median plus/minus t MAD scale estimates.
#
# Arguments:
#   x - numeric vector; missing values are dropped before estimation.
#   t - multiplier applied to the MAD scale estimate (default 3).
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the MAD scale estimate is zero.
Hampel <- function(x, t = 3) {
  center <- median(x, na.rm = TRUE)
  spread <- mad(x, na.rm = TRUE)

  if (spread == 0) {
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }

  halfWidth <- t * spread
  list(up = center + halfWidth, down = center - halfWidth)
}
   
# Boxplot-rule outlier limits based on the quartiles.
#
# The upper limit is the upper quartile plus t interquartile distances and
# the lower limit is the lower quartile minus t interquartile distances.
#
# Arguments:
#   x - numeric vector; missing values are dropped before estimation.
#   t - multiplier applied to the interquartile distance (default 1.5).
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the interquartile distance is zero.
BoxplotRule <- function(x, t = 1.5) {
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q <- xU - xL
  if (Q == 0) {
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up <- xU + t * Q
  # The lower fence is measured from the LOWER quartile; measuring it from
  # the upper quartile places it too high by one interquartile distance.
  down <- xL - t * Q
  out <- list(up = up, down = down)
  return(out)
}

# Classify the points of x that fall outside the interval [down, up].
#
# Arguments:
#   x    - numeric vector.
#   down - lower limit; values strictly below it are labelled "L".
#   up   - upper limit; values strictly above it are labelled "U".
#
# Returns a list with:
#   nOut     - number of flagged points
#   lowLim   - the lower limit passed in
#   upLim    - the upper limit passed in
#   minNom   - smallest value of x that is >= down
#   maxNom   - largest value of x that is <= up
#   index    - positions of the flagged points (low ones first, then high)
#   values   - the flagged values
#   outClass - "L" or "U" for each flagged point
ExtractDetails <- function(x, down, up) {
  below <- which(x < down)
  above <- which(x > up)
  flagged <- union(below, above)

  # Label every position, then keep only the labels of the flagged ones.
  labels <- rep_len("N", length(x))
  labels[below] <- "L"
  labels[above] <- "U"

  largestInside <- max(x[which(x <= up)])
  smallestInside <- min(x[which(x >= down)])

  list(
    nOut = length(flagged),
    lowLim = down,
    upLim = up,
    minNom = smallestInside,
    maxNom = largestInside,
    index = flagged,
    values = x[flagged],
    outClass = labels[flagged]
  )
}
# Apply three outlier detectors to x and collect their results.
#
# Arguments:
#   x  - numeric vector to examine.
#   t3 - threshold passed to ThreeSigma (default 3).
#   tH - threshold passed to Hampel (default 3).
#   tb - threshold passed to BoxplotRule (default 1.5).
#
# Returns a list with:
#   summary     - data.frame with one row per method (ThreeSigma, Hampel,
#                 BoxplotRule) holding n, nMiss, nOut, lowLim, upLim,
#                 minNom and maxNom
#   threeSigma  - data.frame (index, values, type) of points flagged by
#                 the three-sigma limits
#   Hampel      - same, for the Hampel limits
#   boxplotRule - same, for the boxplot-rule limits
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5) {
  # Limits per method, keyed by the label used in the summary table.
  limits <- list(
    ThreeSigma = ThreeSigma(x, t = t3),
    Hampel = Hampel(x, t = tH),
    BoxplotRule = BoxplotRule(x, t = tb)
  )

  nObs <- length(x)
  nNA <- sum(is.na(x))

  details <- lapply(limits, function(lim) {
    ExtractDetails(x, lim$down, lim$up)
  })

  # One summary row per method, stacked in the order defined above.
  summaryRows <- lapply(names(details), function(method) {
    d <- details[[method]]
    data.frame(
      method = method, n = nObs,
      nMiss = nNA, nOut = d$nOut,
      lowLim = d$lowLim,
      upLim = d$upLim,
      minNom = d$minNom,
      maxNom = d$maxNom
    )
  })
  summaryTable <- do.call(rbind.data.frame, summaryRows)

  # Per-method table of the flagged points.
  flaggedFrames <- lapply(details, function(d) {
    data.frame(index = d$index, values = d$values, type = d$outClass)
  })

  list(
    summary = summaryTable,
    threeSigma = flaggedFrames[["ThreeSigma"]],
    Hampel = flaggedFrames[["Hampel"]],
    boxplotRule = flaggedFrames[["BoxplotRule"]]
  )
}
# Run FindOutliers on the charges column with its default thresholds.
fullSummary <- FindOutliers(dt$charges)
# Print the per-method summary table from the result.
fullSummary$summary
##        method    n nMiss nOut     lowLim    upLim   minNom   maxNom
## 1  ThreeSigma 1338     0    7 -23059.611 49600.46 1121.874 49577.66
## 2      Hampel 1338     0  155 -12940.395 31704.46 1121.874 31620.00
## 3 BoxplotRule 1338     0  139  -1209.526 34489.35 1121.874 34472.84

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.