#Read CSV File from chosen data using file.choose()
df = read.csv("Insurance.csv",header = T)
head(df)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
Brief Explanation about the data:
age: age of primary beneficiary
sex: insurance contractor gender, female, male
bmi: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height,
objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9
children: Number of children covered by health insurance / Number of dependents
smoker: Smoking
region: the beneficiary’s residential area in the US, northeast, southeast, southwest, northwest.
charges: Individual medical costs billed by health insurance
#1.Basic Data Characteristics
dim(df)
## [1] 1338 7
#Brief Explanation : This Dataset has 1338 row and 7 column, so that means this dataset has 1338 data and 7 variable
str(df)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
Explanation :
#2.Compute Summary Statistics
#Explore the numerical Variable to Explore the Mean and each quantile
sapply(df[,c(3,7)], mean, na.rm=TRUE)
## bmi charges
## 30.6634 13270.4223
sapply(df[,c(3,7)], quantile, na.rm=TRUE)
## bmi charges
## 0% 15.96000 1121.874
## 25% 26.29625 4740.287
## 50% 30.40000 9382.033
## 75% 34.69375 16639.913
## 100% 53.13000 63770.428
BasicSummary <- function(df, dgts = 3){
## #
## ################################################################
## #
## # Create a basic summary of variables in the data frame df,
## # a data frame with one row for each column of df giving the
## # variable name, type, number of unique levels, the most
## # frequent level, its frequency and corresponding fraction of
## # records, the number of missing values and its corresponding
## # fraction of records
## #
## ################################################################
## #
m <- ncol(df)
varNames <- colnames(df)
varType <- vector("character",m)
topLevel <- vector("character",m)
topCount <- vector("numeric",m)
missCount <- vector("numeric",m)
levels <- vector("numeric", m)
for (i in 1:m){
x <- df[,i]
varType[i] <- class(x)
xtab <- table(x, useNA = "ifany")
levels[i] <- length(xtab)
nums <- as.numeric(xtab)
maxnum <- max(nums)
topCount[i] <- maxnum
maxIndex <- which.max(nums)
lvls <- names(xtab)
topLevel[i] <- lvls[maxIndex]
missIndex <- which((is.na(x)) | (x == "") | (x == " "))
missCount[i] <- length(missIndex)
}
n <- nrow(df)
topFrac <- round(topCount/n, digits = dgts)
missFrac <- round(missCount/n, digits = dgts)
## #
summaryFrame <- data.frame(variable = varNames, type = varType,
levels = levels, topLevel = topLevel,
topCount = topCount, topFrac = topFrac,
missFreq = missCount, missFrac = missFrac)
return(summaryFrame)
}
BasicSummary(df)
## variable type levels topLevel topCount topFrac missFreq missFrac
## 1 age integer 47 18 69 0.052 0 0
## 2 sex character 2 male 676 0.505 0 0
## 3 bmi numeric 548 32.3 13 0.010 0 0
## 4 children integer 6 0 574 0.429 0 0
## 5 smoker character 2 no 1064 0.795 0 0
## 6 region character 4 southeast 364 0.272 0 0
## 7 charges numeric 1337 1639.5631 2 0.001 0 0
Explanation :
#Change the Character dtype to factor
df[,c( 2,5:6)] <- lapply(df[,c( 2,5:6)],as.factor)
#RecHheck the previous function
BasicSummary(df)
## variable type levels topLevel topCount topFrac missFreq missFrac
## 1 age integer 47 18 69 0.052 0 0
## 2 sex factor 2 male 676 0.505 0 0
## 3 bmi numeric 548 32.3 13 0.010 0 0
## 4 children integer 6 0 574 0.429 0 0
## 5 smoker factor 2 no 1064 0.795 0 0
## 6 region factor 4 southeast 364 0.272 0 0
## 7 charges numeric 1337 1639.5631 2 0.001 0 0
#3.Look For Data Anomalies
ThreeSigma <- function(x, t = 3){
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
Hampel <- function(x, t = 3){
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule<- function(x, t = 1.5){
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
if (Q == 0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up <- xU + t * Q
down <- xU - t * Q
out <- list(up = up, down = down)
return(out)
}
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
Summaries <- FindOutliers(df$charges)
Summaries$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1338 0 7 -23059.611 49600.46 1121.874 49577.66
## 2 Hampel 1338 0 155 -12940.395 31704.46 1121.874 31620.00
## 3 BoxplotRule 1338 0 139 -1209.526 34489.35 1121.874 34472.84
#By Using FindOutliers function and analyzing the charges variable in the datasets given, there is a few things that we can conclude that:
By Using 3 method to find the outliers , the summaries is 7 outliers detected using Three Sigma rule, meanwhile there is 155 and 139 outliers respectively using Hampel Identifier and Boxplot Rule
2. the Upper Limit of Three Sigma Rule is much higher than Hampel and Boxplot that has not too big differences
#4.Exploring Relation between Variables
Tables <- table(df$age,df$smoker)
print(Tables)
##
## no yes
## 18 57 12
## 19 50 18
## 20 20 9
## 21 26 2
## 22 22 6
## 23 21 7
## 24 22 6
## 25 23 5
## 26 25 3
## 27 19 9
## 28 25 3
## 29 21 6
## 30 18 9
## 31 22 5
## 32 21 5
## 33 20 6
## 34 21 5
## 35 20 5
## 36 19 6
## 37 16 9
## 38 23 2
## 39 19 6
## 40 22 5
## 41 25 2
## 42 19 8
## 43 15 12
## 44 21 6
## 45 24 5
## 46 24 5
## 47 19 10
## 48 24 5
## 49 24 4
## 50 25 4
## 51 23 6
## 52 23 6
## 53 23 5
## 54 23 5
## 55 24 2
## 56 22 4
## 57 22 4
## 58 24 1
## 59 21 4
## 60 18 5
## 61 17 6
## 62 19 4
## 63 18 5
## 64 15 7
Conclusion : There is More People that is not smoking rather than smoking in every age in this datasets
#Creating Multople Plot using Matrix
stage <- layout( matrix(c(1,2,
3,4), nrow=2, byrow=TRUE) )
#Find Relation of Region and Charges
mosaicplot(sex ~ smoker, data = df, main = "", las = 1)
plot(df$region,df$charges,xlab="Region",ylab="Charges",main="Relation between Region and Charges")
plot(df$smoker,df$charges,xlab="Status as Smoker",ylab="Medical Charges")
plot(df$smoker,df$bmi,xlab="Status as Smoker",ylab = "BMI Index")
Conclusion:
Plot 1 (Upper Left) : There are a little bit more male that smoke rather than female, but mostly both gender doesn’t smoke
Plot 2(Upper Right) : There is no differences charges between every region in US Residentials, they are almost having a same medical charges
Plot 3(Lower Leff) : This is the most interesting plot, by this plot we can said that Non-Smoker has lower medical charges more than smoker. We can assume that smokers are more having a complication so they came to hospital more often than non- smokers, hence it makes more medical costs that are billed by health insurance
Plot 4(Lower Right) : Smoking Status doesn’t affect body mass index, as the boxplot represented, we can say that both smoker and non smoker has the same bmi index.