# Load the bike-buyers data set into df.
# The first column name otherwise comes out as "ï..ID", the usual symptom of a
# UTF-8 byte-order mark at the start of the file; fileEncoding = "UTF-8-BOM"
# strips the mark (and is harmless if none is present) so the column is "ID".
# NOTE(review): the absolute path is machine-specific - confirm before re-running.
df <- read.csv(
  "E:/Binus University/Semester 2/Data Mining and Visualization/bike_buyers.csv",
  fileEncoding = "UTF-8-BOM"
)
# Number of rows and columns in the data set.
dim(df)
## [1] 1000 13
EXPLANATION Data Set Buyer Bike terdiri dari 1000 baris dan 13 kolom.
str(df)
## 'data.frame': 1000 obs. of 13 variables:
## $ ï..ID : int 12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
## $ Marital.Status : chr "Married" "Married" "Married" "Single" ...
## $ Gender : chr "Female" "Male" "Male" "" ...
## $ Income : int 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : int 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : chr "Bachelors" "Partial College" "Partial College" "Bachelors" ...
## $ Occupation : chr "Skilled Manual" "Clerical" "Professional" "Professional" ...
## $ Home.Owner : chr "Yes" "Yes" "No" "Yes" ...
## $ Cars : int 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute.Distance: chr "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
## $ Region : chr "Europe" "Europe" "Europe" "Pacific" ...
## $ Age : int 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased.Bike : chr "No" "No" "No" "Yes" ...
EXPLANATION ID : ID buyer bike, dengan tipe data integer Marital Status: Status pernikahan buyer bike, dengan tipe data character Gender : Jenis kelamin buyer bike, dengan tipe data character Income : Besar pendapatan buyer bike, dengan tipe data integer Children : Jumlah anak buyer bike, dengan tipe data integer Education : Latar belakang pendidikan buyer bike, dengan tipe data character Occupation : Pekerjaan buyer bike, dengan tipe data character Home.Owner : Apakah buyer bike memiliki rumah atau tidak, dengan tipe data character Cars : Jumlah mobil buyer bike, dengan tipe data integer Commute.Distance : Jarak rumah buyer bike dengan perusahaan, dengan tipe data character Region : Daerah tempat tinggal buyer bike, dengan tipe data character Age : Usia buyer bike, dengan tipe data integer Purchased.Bike : apakah buyer bike jadi membeli atau tidak, dengan tipe data character
BasicSummary <- function(df, dgts = 3){
  # Create a basic summary of the variables in the data frame df.
  #
  # Args:
  #   df:   data frame to summarise.
  #   dgts: number of digits the two fraction columns are rounded to.
  #
  # Returns:
  #   A data frame with one row per column of df giving the variable name,
  #   its (first) class, the number of distinct levels (NA counts as a level
  #   when present), the most frequent level, that level's count and fraction
  #   of records, and the count and fraction of missing records. A record is
  #   treated as missing when it is NA, "" or " ".
  m <- ncol(df)
  varNames <- colnames(df)
  varType <- vector("character", m)
  topLevel <- vector("character", m)
  topCount <- vector("numeric", m)
  missCount <- vector("numeric", m)
  levels <- vector("numeric", m)
  # seq_len() so that a data frame with no columns yields an empty summary
  # instead of iterating over c(1, 0).
  for (i in seq_len(m)){
    x <- df[, i]
    # Objects such as ordered factors or POSIXct carry several classes;
    # keep only the first so the assignment always has length one.
    varType[i] <- class(x)[1]
    xtab <- table(x, useNA = "ifany")
    levels[i] <- length(xtab)
    if (length(xtab) > 0){
      nums <- as.numeric(xtab)
      maxIndex <- which.max(nums)
      topCount[i] <- nums[maxIndex]
      topLevel[i] <- names(xtab)[maxIndex]
    } else {
      # Column without any records: there is no most frequent level.
      topCount[i] <- 0
      topLevel[i] <- NA_character_
    }
    missIndex <- which((is.na(x)) | (x == "") | (x == " "))
    missCount[i] <- length(missIndex)
  }
  n <- nrow(df)
  topFrac <- round(topCount/n, digits = dgts)
  missFrac <- round(missCount/n, digits = dgts)
  summaryFrame <- data.frame(variable = varNames, type = varType,
                             levels = levels, topLevel = topLevel,
                             topCount = topCount, topFrac = topFrac,
                             missFreq = missCount, missFrac = missFrac)
  return(summaryFrame)
}
BasicSummary(df)
## variable type levels topLevel topCount topFrac missFreq
## 1 ï..ID integer 1000 11000 1 0.001 0
## 2 Marital.Status character 3 Married 535 0.535 7
## 3 Gender character 3 Male 500 0.500 11
## 4 Income integer 17 60000 165 0.165 6
## 5 Children integer 7 0 274 0.274 8
## 6 Education character 5 Bachelors 306 0.306 0
## 7 Occupation character 5 Professional 276 0.276 0
## 8 Home.Owner character 3 Yes 682 0.682 4
## 9 Cars integer 6 2 342 0.342 9
## 10 Commute.Distance character 5 0-1 Miles 366 0.366 0
## 11 Region character 3 North America 508 0.508 0
## 12 Age integer 54 40 40 0.040 8
## 13 Purchased.Bike character 2 No 519 0.519 0
## missFrac
## 1 0.000
## 2 0.007
## 3 0.011
## 4 0.006
## 5 0.008
## 6 0.000
## 7 0.000
## 8 0.004
## 9 0.009
## 10 0.000
## 11 0.000
## 12 0.008
## 13 0.000
EXPLANATION: Age : memiliki 54 level (53 nilai unik ditambah NA). Angka 40 paling sering muncul, dengan frekuensi sebanyak 40 kali dan proporsi sebesar 0.040. Tipe data integer dan terdapat 8 missing value (proporsi 0.008).
# Mean of each numeric column (positions 1, 4, 5, 9, 12), ignoring NAs.
# vapply() pins the result to one number per column.
vapply(df[c(1, 4, 5, 9, 12)], mean, numeric(1), na.rm = TRUE)
## ï..ID Income Children Cars Age
## 19965.992000 56267.605634 1.910282 1.455096 44.181452
# Quartiles (0%, 25%, 50%, 75%, 100%) of the numeric columns at positions
# 1, 4, 5, 9, 12, ignoring NAs; one column of five values per variable.
vapply(df[c(1, 4, 5, 9, 12)], quantile, numeric(5), na.rm = TRUE)
## ï..ID Income Children Cars Age
## 0% 11000.00 10000 0 0 25
## 25% 15290.75 30000 0 1 35
## 50% 19744.00 60000 2 1 43
## 75% 24470.75 70000 3 2 52
## 100% 29447.00 170000 5 4 89
EXPLANATION: 1. 2. 3.
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(df)
## df
##
## 13 Variables 1000 Observations
## --------------------------------------------------------------------------------
## ï..ID
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 1000 1 19966 6176 11781 12627
## .25 .50 .75 .90 .95
## 15291 19744 24471 27544 28413
##
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital.Status
## n missing distinct
## 993 7 2
##
## Value Married Single
## Frequency 535 458
## Proportion 0.539 0.461
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 989 11 2
##
## Value Female Male
## Frequency 489 500
## Proportion 0.494 0.506
## --------------------------------------------------------------------------------
## Income
## n missing distinct Info Mean Gmd .05 .10
## 994 6 16 0.986 56268 34273 10000 20000
## .25 .50 .75 .90 .95
## 30000 60000 70000 100000 120000
##
## lowest : 10000 20000 30000 40000 50000, highest: 120000 130000 150000 160000 170000
##
## Value 10000 20000 30000 40000 50000 60000 70000 80000 90000
## Frequency 73 74 134 153 40 165 123 90 38
## Proportion 0.073 0.074 0.135 0.154 0.040 0.166 0.124 0.091 0.038
##
## Value 100000 110000 120000 130000 150000 160000 170000
## Frequency 29 16 17 32 4 3 3
## Proportion 0.029 0.016 0.017 0.032 0.004 0.003 0.003
## --------------------------------------------------------------------------------
## Children
## n missing distinct Info Mean Gmd
## 992 8 6 0.96 1.91 1.827
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 274 169 209 133 126 81
## Proportion 0.276 0.170 0.211 0.134 0.127 0.082
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 1000 0 5
##
## lowest : Bachelors Graduate Degree High School Partial College Partial High School
## highest: Bachelors Graduate Degree High School Partial College Partial High School
##
## Value Bachelors Graduate Degree High School
## Frequency 306 174 179
## Proportion 0.306 0.174 0.179
##
## Value Partial College Partial High School
## Frequency 265 76
## Proportion 0.265 0.076
## --------------------------------------------------------------------------------
## Occupation
## n missing distinct
## 1000 0 5
##
## lowest : Clerical Management Manual Professional Skilled Manual
## highest: Clerical Management Manual Professional Skilled Manual
##
## Value Clerical Management Manual Professional
## Frequency 177 173 119 276
## Proportion 0.177 0.173 0.119 0.276
##
## Value Skilled Manual
## Frequency 255
## Proportion 0.255
## --------------------------------------------------------------------------------
## Home.Owner
## n missing distinct
## 996 4 2
##
## Value No Yes
## Frequency 314 682
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## Cars
## n missing distinct Info Mean Gmd
## 991 9 5 0.925 1.455 1.226
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 238 267 342 85 59
## Proportion 0.240 0.269 0.345 0.086 0.060
## --------------------------------------------------------------------------------
## Commute.Distance
## n missing distinct
## 1000 0 5
##
## lowest : 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## highest: 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
##
## Value 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## Frequency 366 169 111 162 192
## Proportion 0.366 0.169 0.111 0.162 0.192
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 1000 0 3
##
## Value Europe North America Pacific
## Frequency 300 508 192
## Proportion 0.300 0.508 0.192
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 992 8 53 0.999 44.18 12.85 28.00 30.00
## .25 .50 .75 .90 .95
## 35.00 43.00 52.00 60.90 65.45
##
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased.Bike
## n missing distinct
## 1000 0 2
##
## Value No Yes
## Frequency 519 481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------
EXPLANATION: 1. Proportion = Freq / n 2. 3.
An outlier is an observation (or subset of observations) which appears to be inconsistent with the remainder of that set of data.
library(car)
## Warning: package 'car' was built under R version 4.1.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.3
qqPlot(df$Income)
## [1] 13 44
library(car)
qqPlot(df$Children)
## [1] 3 13
library(car)
qqPlot(df$Age)
## [1] 376 402
# Draw one boxplot per numeric variable and print the values that
# boxplot.stats() flags as outliers in the margin above the plot.
# A single loop replaces four copy-pasted stanzas; as before, `out` holds the
# outliers of the last variable (Age) once the loop has finished.
for (varName in c("Income", "Children", "Cars", "Age")){
  out <- boxplot.stats(df[[varName]])$out
  boxplot(df[[varName]],
          ylab = "",
          main = varName
  )
  mtext(paste("Outliers: ", paste(out, collapse = ", ")))
}
ThreeSigma <- function(x, t = 3){
  # Outlier limits from the "three-sigma" rule: mean +/- t standard deviations.
  #
  # Args:
  #   x: numeric vector; missing values are ignored.
  #   t: multiplier applied to the standard deviation.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit). Both are NA when the
  #   standard deviation cannot be computed.
  mu <- mean(x, na.rm = TRUE)
  sig <- sd(x, na.rm = TRUE)
  # sd() is NA with fewer than two non-missing values; test for that first so
  # the comparison below never receives NA (which would abort with an error).
  if (is.na(sig)){
    message("Fewer than two non-missing x-values: standard deviation is undefined")
  } else if (sig == 0){
    message("All non-missing x-values are identical")
  }
  up <- mu + t * sig
  down <- mu - t * sig
  out <- list(up = up, down = down)
  return(out)
}
Hampel <- function(x, t = 3){
  # Outlier limits from the Hampel identifier: median +/- t times the MAD
  # scale estimate returned by mad().
  #
  # Args:
  #   x: numeric vector; missing values are ignored.
  #   t: multiplier applied to the MAD scale estimate.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit). Both are NA when x holds no
  #   non-missing values.
  mu <- median(x, na.rm = TRUE)
  sig <- mad(x, na.rm = TRUE)
  # With no non-missing values mad() is NA; check that first so the
  # comparison below never receives NA (which would abort with an error).
  if (is.na(sig)){
    message("No non-missing x-values: MAD scale estimate is undefined")
  } else if (sig == 0){
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  up <- mu + t * sig
  down <- mu - t * sig
  out <- list(up = up, down = down)
  return(out)
}
BoxplotRule <- function(x, t = 1.5){
  # Outlier limits from the boxplot rule: points more than t interquartile
  # distances above the upper quartile or below the lower quartile.
  #
  # Args:
  #   x: numeric vector; missing values are ignored.
  #   t: multiplier applied to the interquartile distance.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit). Both are NA when the
  #   quartiles cannot be computed.
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q <- xU - xL
  # Guard against NA quartiles (no non-missing values) before comparing.
  if (is.na(Q)){
    message("No non-missing x-values: interquartile distance is undefined")
  } else if (Q == 0){
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up <- xU + t * Q
  # The lower fence is measured from the LOWER quartile; measuring it from the
  # upper quartile pushed the fence into the bulk of the data.
  down <- xL - t * Q
  out <- list(up = up, down = down)
  return(out)
}
ExtractDetails <- function(x, down, up){
  # Describe the points of x that fall outside the interval [down, up].
  #
  # Returns a list with: nOut (number of flagged points), lowLim / upLim (the
  # limits as given), minNom / maxNom (smallest value >= down and largest
  # value <= up), index (positions of flagged points, low ones first),
  # values (the flagged values) and outClass ("L" below down, "U" above up).
  belowIdx <- which(x < down)
  aboveIdx <- which(x > up)
  flagged <- union(belowIdx, aboveIdx)

  # Label every position, then keep only the labels of the flagged ones.
  labels <- rep("N", length(x))
  labels[belowIdx] <- "L"
  labels[aboveIdx] <- "U"

  largestInside <- max(x[which(x <= up)])
  smallestInside <- min(x[which(x >= down)])

  list(nOut = length(flagged),
       lowLim = down,
       upLim = up,
       minNom = smallestInside,
       maxNom = largestInside,
       index = flagged,
       values = x[flagged],
       outClass = labels[flagged])
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
  # Apply the three-sigma rule, the Hampel identifier and the boxplot rule to
  # x and collect the results.
  #
  # Args:
  #   x:  vector to screen for outliers.
  #   t3: threshold passed to ThreeSigma().
  #   tH: threshold passed to Hampel().
  #   tb: threshold passed to BoxplotRule().
  #
  # Returns:
  #   A list with `summary` (one row per method: n, nMiss, nOut, limits and the
  #   nominal min/max) and one data frame per method (`threeSigma`, `Hampel`,
  #   `boxplotRule`) listing index, value and type of every flagged point.

  # Limits first (in the same order as the summary rows), then the details.
  limits <- list(ThreeSigma = ThreeSigma(x, t = t3),
                 Hampel = Hampel(x, t = tH),
                 BoxplotRule = BoxplotRule(x, t = tb))
  details <- lapply(limits, function(lim) ExtractDetails(x, lim$down, lim$up))

  nTotal <- length(x)
  nNA <- length(which(is.na(x)))

  # One summary row per method, stacked in method order.
  summaryRows <- lapply(names(details), function(label){
    d <- details[[label]]
    data.frame(method = label, n = nTotal,
               nMiss = nNA, nOut = d$nOut,
               lowLim = d$lowLim,
               upLim = d$upLim,
               minNom = d$minNom,
               maxNom = d$maxNom)
  })
  sumFrame <- do.call(rbind.data.frame, summaryRows)

  # Per-method table of the flagged points.
  pointFrame <- function(d){
    data.frame(index = d$index,
               values = d$values,
               type = d$outClass)
  }

  list(summary = sumFrame,
       threeSigma = pointFrame(details[["ThreeSigma"]]),
       Hampel = pointFrame(details[["Hampel"]]),
       boxplotRule = pointFrame(details[["BoxplotRule"]]))
}
fullSummary <- FindOutliers(df$Income)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 6 10 -36935.85 149471.1 10000 130000
## 2 Hampel 1000 6 10 -28956.00 148956.0 10000 130000
## 3 BoxplotRule 1000 6 10 10000.00 130000.0 10000 130000
fullSummary <- FindOutliers(df$Children)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 8 0 -2.970448 6.791013 0 5
## 2 Hampel 1000 8 0 -2.447800 6.447800 0 5
## 3 BoxplotRule 1000 8 0 -1.500000 7.500000 0 5
fullSummary <- FindOutliers(df$Cars)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 9 0 -1.91017 4.820362 0 4
## 2 Hampel 1000 9 0 -3.44780 5.447800 0 4
## 3 BoxplotRule 1000 9 297 0.50000 3.500000 1 3
fullSummary <- FindOutliers(df$Age)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 8 2 10.09543 78.26747 25 78
## 2 Hampel 1000 8 2 7.41760 78.58240 25 78
## 3 BoxplotRule 1000 8 25 26.50000 77.50000 27 74
count <- table(df$Income, df$Purchased.Bike)
count
##
## No Yes
## 10000 45 28
## 20000 43 31
## 30000 81 53
## 40000 64 89
## 50000 20 20
## 60000 84 81
## 70000 58 65
## 80000 56 34
## 90000 14 24
## 100000 18 11
## 110000 8 8
## 120000 8 9
## 130000 17 15
## 150000 1 3
## 160000 0 3
## 170000 2 1
semakin tinggi income, semakin tinggi persentase yang beli sepeda
# Create the layout: one full-width panel on top, two panels underneath
nf <- layout(matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
# Top panel: mosaic plot of income level against purchase outcome
mosaicplot(Income ~ Purchased.Bike, data = df, main = "", las = 1, shade = TRUE)
# Bottom-left panel: distribution of Income within each Purchased.Bike group.
# The formula puts Purchased.Bike on the x axis and Income on the y axis, so
# the axis labels must say so. (The third panel of the layout stays empty.)
#plot(df$Income, df$Purchased.Bike)
boxplot(Income ~ Purchased.Bike, data= df, xlab = "Purchased.Bike", ylab ="Income")
count <- table(df$Children, df$Purchased.Bike)
count
##
## No Yes
## 0 135 139
## 1 72 97
## 2 112 97
## 3 61 72
## 4 72 54
## 5 63 18
semakin sedikit anaknya, semakin tinggi persentase yang beli sepeda
# Create the layout: one full-width panel on top, two panels underneath
nf <- layout(matrix(c(1,1,2,3), nrow=2, byrow=TRUE))
# Top panel: mosaic plot of number of children against purchase outcome
mosaicplot(Children ~ Purchased.Bike, data = df, main = "", las = 1, shade = TRUE)
# Bottom-left panel: distribution of Children within each Purchased.Bike group.
# The formula puts Purchased.Bike on the x axis and Children on the y axis, so
# the axis labels must say so. (The third panel of the layout stays empty.)
#plot(df$Children, df$Purchased.Bike)
boxplot(Children ~ Purchased.Bike, data= df, xlab = "Purchased.Bike", ylab ="Children")
matrix(c(1,1,2,3), nrow=2, byrow=TRUE) creates a matrix of 2 rows and 2 columns, filled row by row.
The two panels of the top row are merged for the first chart, the bottom-left panel is for chart 2 and the
bottom-right panel is for chart 3.
Mosaic plots describe the relationship between two categorical variables. Essentially, these plots are graphical representations of contingency tables that tell us how many times the values of two categorical variables occur together in a dataset.
EXPLANATION: 1. This reflects the fact that cereals manufactured by R= Ralston Purina and P = Post are enriched with vitamins. 2. All cereals Manufactured by G and K have either 100% vitamins or enriched w/ vitamins.