#import csv
bike_buyers <- read.csv("E:/Perkuliahan Sems 2 [Flo]/Data Mining [LAB]/Session 4[5-4-2022]/bike_buyers.csv")
dim(bike_buyers)
## [1] 1000 13
EXPLANATION
The dim function returns the dimension of the bike_buyers dataset. it shows that the bike_buyers dataset has 1000 rows and 13 columns ( 1000 instances and 13 attributes)
str(bike_buyers)
## 'data.frame': 1000 obs. of 13 variables:
## $ ï..ID : int 12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
## $ Marital.Status : chr "Married" "Married" "Married" "Single" ...
## $ Gender : chr "Female" "Male" "Male" "" ...
## $ Income : int 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : int 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : chr "Bachelors" "Partial College" "Partial College" "Bachelors" ...
## $ Occupation : chr "Skilled Manual" "Clerical" "Professional" "Professional" ...
## $ Home.Owner : chr "Yes" "Yes" "No" "Yes" ...
## $ Cars : int 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute.Distance: chr "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
## $ Region : chr "Europe" "Europe" "Europe" "Pacific" ...
## $ Age : int 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased.Bike : chr "No" "No" "No" "Yes" ...
writeLines("\n")
sapply(bike_buyers, class)
## ï..ID Marital.Status Gender Income
## "integer" "character" "character" "integer"
## Children Education Occupation Home.Owner
## "integer" "character" "character" "character"
## Cars Commute.Distance Region Age
## "integer" "character" "character" "integer"
## Purchased.Bike
## "character"
EXPLANATION
there are two different data types in the bike_buyers dataset which are integer (ID, Income, Children, Cars, Age) and character (Marital.Status, Gender, Education, Occupation, Home.Owner, Commute Distance, Region, Purchased.Bike)
BasicSummary <- function(df, dgts = 3){
m <- ncol(df)
varNames <- colnames(df)
varType <- vector("character",m)
topLevel <- vector("character",m)
topCount <- vector("numeric",m)
missCount <- vector("numeric",m)
levels <- vector("numeric", m)
for (i in 1:m){
x <- df[,i]
varType[i] <- class(x)
xtab <- table(x, useNA = "ifany")
levels[i] <- length(xtab)
nums <- as.numeric(xtab)
maxnum <- max(nums)
topCount[i] <- maxnum
maxIndex <- which.max(nums)
lvls <- names(xtab)
topLevel[i] <- lvls[maxIndex]
missIndex <- which((is.na(x)) | (x == "") | (x == " "))
missCount[i] <- length(missIndex)
}
n <- nrow(df)
topFrac <- round(topCount/n, digits = dgts)
missFrac <- round(missCount/n, digits = dgts)
## #
summaryFrame <- data.frame(variable = varNames, type = varType,
levels = levels, topLevel = topLevel,
topCount = topCount, topFrac = topFrac,
missFreq = missCount, missFrac = missFrac)
return(summaryFrame)
}
BasicSummary(bike_buyers)
## variable type levels topLevel topCount topFrac missFreq
## 1 ï..ID integer 1000 11000 1 0.001 0
## 2 Marital.Status character 3 Married 535 0.535 7
## 3 Gender character 3 Male 500 0.500 11
## 4 Income integer 17 60000 165 0.165 6
## 5 Children integer 7 0 274 0.274 8
## 6 Education character 5 Bachelors 306 0.306 0
## 7 Occupation character 5 Professional 276 0.276 0
## 8 Home.Owner character 3 Yes 682 0.682 4
## 9 Cars integer 6 2 342 0.342 9
## 10 Commute.Distance character 5 0-1 Miles 366 0.366 0
## 11 Region character 3 North America 508 0.508 0
## 12 Age integer 54 40 40 0.040 8
## 13 Purchased.Bike character 2 No 519 0.519 0
## missFrac
## 1 0.000
## 2 0.007
## 3 0.011
## 4 0.006
## 5 0.008
## 6 0.000
## 7 0.000
## 8 0.004
## 9 0.009
## 10 0.000
## 11 0.000
## 12 0.008
## 13 0.000
EXPLANATION
It is clear that all the variables have clear and simple explanatory names which are not difficult to understand and it describes the data in the dataset. From the 7 variables in the dataset, 5 of them were integer, and the rest is character. It can be seen that the integer variables has more levels than the character variables.
summary(bike_buyers)
## ï..ID Marital.Status Gender Income
## Min. :11000 Length:1000 Length:1000 Min. : 10000
## 1st Qu.:15291 Class :character Class :character 1st Qu.: 30000
## Median :19744 Mode :character Mode :character Median : 60000
## Mean :19966 Mean : 56268
## 3rd Qu.:24471 3rd Qu.: 70000
## Max. :29447 Max. :170000
## NA's :6
## Children Education Occupation Home.Owner
## Min. :0.00 Length:1000 Length:1000 Length:1000
## 1st Qu.:0.00 Class :character Class :character Class :character
## Median :2.00 Mode :character Mode :character Mode :character
## Mean :1.91
## 3rd Qu.:3.00
## Max. :5.00
## NA's :8
## Cars Commute.Distance Region Age
## Min. :0.000 Length:1000 Length:1000 Min. :25.00
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:35.00
## Median :1.000 Mode :character Mode :character Median :43.00
## Mean :1.455 Mean :44.18
## 3rd Qu.:2.000 3rd Qu.:52.00
## Max. :4.000 Max. :89.00
## NA's :9 NA's :8
## Purchased.Bike
## Length:1000
## Class :character
## Mode :character
##
##
##
##
writeLines("Mean:")
## Mean:
sapply(bike_buyers[, c(4,12)], mean, na.rm=TRUE)
## Income Age
## 56267.60563 44.18145
writeLines("\nDescription:")
##
## Description:
sapply(bike_buyers[, c(4,12)], quantile, na.rm=TRUE)
## Income Age
## 0% 10000 25
## 25% 30000 35
## 50% 60000 43
## 75% 70000 52
## 100% 170000 89
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(bike_buyers)
## bike_buyers
##
## 13 Variables 1000 Observations
## --------------------------------------------------------------------------------
## ï..ID
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 1000 1 19966 6176 11781 12627
## .25 .50 .75 .90 .95
## 15291 19744 24471 27544 28413
##
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital.Status
## n missing distinct
## 993 7 2
##
## Value Married Single
## Frequency 535 458
## Proportion 0.539 0.461
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 989 11 2
##
## Value Female Male
## Frequency 489 500
## Proportion 0.494 0.506
## --------------------------------------------------------------------------------
## Income
## n missing distinct Info Mean Gmd .05 .10
## 994 6 16 0.986 56268 34273 10000 20000
## .25 .50 .75 .90 .95
## 30000 60000 70000 100000 120000
##
## lowest : 10000 20000 30000 40000 50000, highest: 120000 130000 150000 160000 170000
##
## Value 10000 20000 30000 40000 50000 60000 70000 80000 90000
## Frequency 73 74 134 153 40 165 123 90 38
## Proportion 0.073 0.074 0.135 0.154 0.040 0.166 0.124 0.091 0.038
##
## Value 100000 110000 120000 130000 150000 160000 170000
## Frequency 29 16 17 32 4 3 3
## Proportion 0.029 0.016 0.017 0.032 0.004 0.003 0.003
## --------------------------------------------------------------------------------
## Children
## n missing distinct Info Mean Gmd
## 992 8 6 0.96 1.91 1.827
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 274 169 209 133 126 81
## Proportion 0.276 0.170 0.211 0.134 0.127 0.082
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 1000 0 5
##
## lowest : Bachelors Graduate Degree High School Partial College Partial High School
## highest: Bachelors Graduate Degree High School Partial College Partial High School
##
## Value Bachelors Graduate Degree High School
## Frequency 306 174 179
## Proportion 0.306 0.174 0.179
##
## Value Partial College Partial High School
## Frequency 265 76
## Proportion 0.265 0.076
## --------------------------------------------------------------------------------
## Occupation
## n missing distinct
## 1000 0 5
##
## lowest : Clerical Management Manual Professional Skilled Manual
## highest: Clerical Management Manual Professional Skilled Manual
##
## Value Clerical Management Manual Professional
## Frequency 177 173 119 276
## Proportion 0.177 0.173 0.119 0.276
##
## Value Skilled Manual
## Frequency 255
## Proportion 0.255
## --------------------------------------------------------------------------------
## Home.Owner
## n missing distinct
## 996 4 2
##
## Value No Yes
## Frequency 314 682
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## Cars
## n missing distinct Info Mean Gmd
## 991 9 5 0.925 1.455 1.226
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 238 267 342 85 59
## Proportion 0.240 0.269 0.345 0.086 0.060
## --------------------------------------------------------------------------------
## Commute.Distance
## n missing distinct
## 1000 0 5
##
## lowest : 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## highest: 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
##
## Value 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## Frequency 366 169 111 162 192
## Proportion 0.366 0.169 0.111 0.162 0.192
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 1000 0 3
##
## Value Europe North America Pacific
## Frequency 300 508 192
## Proportion 0.300 0.508 0.192
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 992 8 53 0.999 44.18 12.85 28.00 30.00
## .25 .50 .75 .90 .95
## 35.00 43.00 52.00 60.90 65.45
##
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased.Bike
## n missing distinct
## 1000 0 2
##
## Value No Yes
## Frequency 519 481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------
EXPLANATION
Missing values were found in the bike_buyers dataset, 7 missing values in Marital.Status variable, 11 in Gender variable, 6 in Income variable, 8 missing values iin CHildren variable, 4 in Home.Owner variable, 9 in Cars variables, and 8 missing values in the Age variable.
bike_buyers[, c(2,3,6:8, 10, 11, 13)] <- lapply(bike_buyers[, c(2,3,6:8, 10, 11, 13)], as.factor)
ThreeSigma <- function(x, t = 3){
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
Hampel <- function(x, t = 3){
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule<- function(x, t = 1.5){
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
if (Q == 0){message("Boxplot rule implosion: interquartile distance is zero")
}
up <- xU + t * Q
down <- xU - t * Q
out <- list(up = up, down = down)
return(out)
}
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
FullSummary <- FindOutliers(bike_buyers$Income)
FullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 6 10 -36935.85 149471.1 10000 130000
## 2 Hampel 1000 6 10 -28956.00 148956.0 10000 130000
## 3 BoxplotRule 1000 6 10 10000.00 130000.0 10000 130000
EXPLANATION From these three method of finding the outliers, three of them detect the same amount of the outliers which is 10 outliers. For the upper and lower limit, the BoxplotRule has the lowest upper and lower outlier limit among the three of them, but it does’nt give that big/ much difference The lower and upper limits of the non-outlying data values of the three rule has the same value
rcorr(as.matrix(bike_buyers[c(1,4,5, 9, 12)]), type = "spearman")
## ï..ID Income Children Cars Age
## ï..ID 1.00 -0.06 -0.02 0.03 -0.05
## Income -0.06 1.00 0.29 0.33 0.20
## Children -0.02 0.29 1.00 0.28 0.60
## Cars 0.03 0.33 0.28 1.00 0.22
## Age -0.05 0.20 0.60 0.22 1.00
##
## n
## ï..ID Income Children Cars Age
## ï..ID 1000 994 992 991 992
## Income 994 994 986 985 987
## Children 992 986 992 983 985
## Cars 991 985 983 991 983
## Age 992 987 985 983 992
##
## P
## ï..ID Income Children Cars Age
## ï..ID 0.0593 0.5023 0.3593 0.1239
## Income 0.0593 0.0000 0.0000 0.0000
## Children 0.5023 0.0000 0.0000 0.0000
## Cars 0.3593 0.0000 0.0000 0.0000
## Age 0.1239 0.0000 0.0000 0.0000
hist.data.frame(bike_buyers)
plot(bike_buyers)
Table <- table(bike_buyers$Income, bike_buyers$Purchased.Bike, bike_buyers$Gender)
print(Table)
## , , =
##
##
## No Yes
## 10000 0 0
## 20000 0 0
## 30000 0 0
## 40000 0 0
## 50000 0 1
## 60000 2 1
## 70000 2 1
## 80000 4 0
## 90000 0 0
## 100000 0 0
## 110000 0 0
## 120000 0 0
## 130000 0 0
## 150000 0 0
## 160000 0 0
## 170000 0 0
##
## , , = Female
##
##
## No Yes
## 10000 25 17
## 20000 20 20
## 30000 41 26
## 40000 33 41
## 50000 9 9
## 60000 34 41
## 70000 29 34
## 80000 26 18
## 90000 9 10
## 100000 9 3
## 110000 4 2
## 120000 2 5
## 130000 9 8
## 150000 0 1
## 160000 0 1
## 170000 0 1
##
## , , = Male
##
##
## No Yes
## 10000 20 11
## 20000 23 11
## 30000 40 27
## 40000 31 48
## 50000 11 10
## 60000 48 39
## 70000 27 30
## 80000 26 16
## 90000 5 14
## 100000 9 8
## 110000 4 6
## 120000 6 4
## 130000 8 7
## 150000 1 2
## 160000 0 2
## 170000 2 0
Table <- table(bike_buyers$Age, bike_buyers$Marital.Status)
print(Table)
##
## Married Single
## 25 0 2 3
## 26 0 5 11
## 27 0 10 13
## 28 1 7 14
## 29 0 5 11
## 30 0 8 18
## 31 0 5 20
## 32 0 18 15
## 33 0 8 13
## 34 0 16 15
## 35 1 16 18
## 36 0 16 21
## 37 0 15 17
## 38 0 13 24
## 39 1 8 13
## 40 1 23 16
## 41 0 14 14
## 42 0 18 16
## 43 1 19 16
## 44 0 16 11
## 45 0 20 11
## 46 0 19 8
## 47 0 23 16
## 48 0 21 8
## 49 0 15 8
## 50 0 13 10
## 51 0 13 9
## 52 0 13 12
## 53 0 14 10
## 54 0 12 4
## 55 0 14 3
## 56 0 11 5
## 57 0 4 4
## 58 1 7 4
## 59 0 14 6
## 60 0 8 6
## 61 0 7 2
## 62 0 5 8
## 63 0 6 3
## 64 0 10 0
## 65 0 6 3
## 66 0 10 4
## 67 0 5 5
## 68 0 1 2
## 69 0 7 1
## 70 0 4 0
## 71 0 1 0
## 72 0 1 0
## 73 0 2 2
## 74 0 0 1
## 78 0 1 1
## 80 0 1 0
## 89 0 1 0
matrix <- layout( matrix(c(1,2,3,4), nrow=2, byrow=TRUE) )
mosaicplot(Gender~Purchased.Bike,
data = bike_buyers,
main = "Gender vs Purchased Bike",
col = "pink",
las=1,
shade = TRUE)
boxplot(Income~Purchased.Bike,
data = bike_buyers,
xlab = "Purchased Bike",
main = "Purchased bike status over Income",
col = "lightblue")
boxplot(Age~Marital.Status,
data = bike_buyers,
main = "Marital Status by age",
col = "lightgreen")
mosaicplot(Children~Purchased.Bike,
data = bike_buyers,
main = "",
col = "lightyellow")
EXPLANATION
The first plot in the upper left tells us that the amount of bike pruchased by the female and male gender has not mush difference
In the second plot (the upper right), the income of the buyers doesn’t really affect the purchased bike, so people with higer income will not be guaranteed to buy the bike.
The third plot (lower left), indicates that most people with high age have the status of being married and for the last plot, people with range 0-4 children tend to purchsed bike rather than people with 5 children which is the most amount of children in the dataset