# Load the bike buyers data set and preview the first rows.
# The first column name printed as `ï..ID`, which is what read.csv produces
# when a leading UTF-8 byte-order mark is not stripped; "UTF-8-BOM" removes
# the mark so the column is simply named `ID`.
# NOTE(review): the absolute path is machine-specific; adjust it when running
# this script elsewhere.
bikeData <- read.csv("D:/KULIAH SEMESTER 2/DATA MINING AND VISUALIZATION/ExerciseLab2_2540134874/bike_buyers.csv",
                     fileEncoding = "UTF-8-BOM")
head(bikeData)
## ï..ID Marital.Status Gender Income Children Education Occupation
## 1 12496 Married Female 40000 1 Bachelors Skilled Manual
## 2 24107 Married Male 30000 3 Partial College Clerical
## 3 14177 Married Male 80000 5 Partial College Professional
## 4 24381 Single 70000 0 Bachelors Professional
## 5 25597 Single Male 30000 0 Bachelors Clerical
## 6 13507 Married Female 10000 2 Partial College Manual
## Home.Owner Cars Commute.Distance Region Age Purchased.Bike
## 1 Yes 0 0-1 Miles Europe 42 No
## 2 Yes 1 0-1 Miles Europe 43 No
## 3 No 2 2-5 Miles Europe 60 No
## 4 Yes 1 5-10 Miles Pacific 41 Yes
## 5 No 0 0-1 Miles Europe 36 Yes
## 6 Yes 0 1-2 Miles Europe 50 No
# Number of rows and columns in the data set.
dim(bikeData)
## [1] 1000 13
EXPLANATION This bike_buyers dataset has 1000 instances and 13 variables (this dataset has 1000 rows and 13 columns).
# Compact overview of every column: its storage type and first few values.
str(bikeData)
## 'data.frame': 1000 obs. of 13 variables:
## $ ï..ID : int 12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
## $ Marital.Status : chr "Married" "Married" "Married" "Single" ...
## $ Gender : chr "Female" "Male" "Male" "" ...
## $ Income : int 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : int 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : chr "Bachelors" "Partial College" "Partial College" "Bachelors" ...
## $ Occupation : chr "Skilled Manual" "Clerical" "Professional" "Professional" ...
## $ Home.Owner : chr "Yes" "Yes" "No" "Yes" ...
## $ Cars : int 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute.Distance: chr "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
## $ Region : chr "Europe" "Europe" "Europe" "Pacific" ...
## $ Age : int 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased.Bike : chr "No" "No" "No" "Yes" ...
EXPLANATION There are 13 variables in this dataset: 1. ID -> The buyer’s ID. The data type of this variable is integer.
Marital Status -> Whether the buyer is married or single. The data type of this variable is character.
Gender -> Whether the buyer is male or female. The data type of this variable is character.
Income -> The buyer’s income over a certain period. The data type of this variable is integer.
Children -> The number of children the buyer has. The data type of this variable is integer.
Education -> The buyer’s educational background: Bachelors, Graduate Degree, High School, Partial College, or Partial High School. The data type of this variable is character.
Occupation -> The buyer’s job or occupation: Clerical, Management, Manual, Professional, or Skilled Manual. The data type of this variable is character.
Home Owner -> Whether or not the buyer owns their own house. The data type of this variable is character.
Cars -> The number of cars the buyer has. The data type of this variable is integer.
Commute Distance -> The distance between the buyer’s house and the buyer’s workplace. There are 5 categories: 0-1 Miles, 1-2 Miles, 2-5 Miles, 5-10 Miles, and 10+ Miles. The data type of this variable is character.
Region -> The region where the buyer lives: Europe, North America, or Pacific. The data type of this variable is character.
Age -> The buyer’s age. The data type of this variable is integer.
Purchased Bike -> Whether or not the buyer purchased a bike. The data type of this variable is character.
# Turn every character column into a factor in one pass. In this data set the
# character columns are exactly Marital.Status, Gender, Education, Occupation,
# Home.Owner, Commute.Distance, Region and Purchased.Bike; the integer columns
# are passed through untouched.
bikeData[] <- lapply(bikeData, function(column) {
  if (is.character(column)) as.factor(column) else column
})
str(bikeData)
## 'data.frame': 1000 obs. of 13 variables:
## $ ï..ID : int 12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
## $ Marital.Status : Factor w/ 3 levels "","Married","Single": 2 2 2 3 3 2 3 2 1 2 ...
## $ Gender : Factor w/ 3 levels "","Female","Male": 2 3 3 1 3 2 3 3 3 3 ...
## $ Income : int 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : int 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : Factor w/ 5 levels "Bachelors","Graduate Degree",..: 1 4 4 1 1 4 3 1 5 4 ...
## $ Occupation : Factor w/ 5 levels "Clerical","Management",..: 5 1 4 4 1 3 2 5 1 3 ...
## $ Home.Owner : Factor w/ 3 levels "","No","Yes": 3 3 2 3 2 3 1 3 3 3 ...
## $ Cars : int 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute.Distance: Factor w/ 5 levels "0-1 Miles","1-2 Miles",..: 1 1 4 5 1 2 1 1 5 1 ...
## $ Region : Factor w/ 3 levels "Europe","North America",..: 1 1 1 3 1 1 3 1 3 1 ...
## $ Age : int 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased.Bike : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 2 1 2 ...
EXPLANATION To describe the categorical variables, I convert the character variables to factors.
# BasicSummary: one-row-per-column overview of a data frame.
#
# Arguments:
#   df   - data frame to summarise.
#   dgts - number of digits used when rounding the fractions (default 3).
#
# Returns a data frame with, for every column of df:
#   variable - column name
#   type     - class of the column (multiple classes are joined with "/",
#              e.g. "ordered/factor")
#   levels   - number of distinct values, counting NA as a value if present
#   topLevel - most frequent value (first one in table order on ties)
#   topCount - how often topLevel occurs
#   topFrac  - topCount divided by the number of rows
#   missFreq - number of entries that are NA, "" or " "
#   missFrac - missFreq divided by the number of rows
BasicSummary <- function(df, dgts = 3){
  m <- ncol(df)
  n <- nrow(df)
  varNames <- colnames(df)
  varType <- vector("character", m)
  topLevel <- vector("character", m)
  topCount <- vector("numeric", m)
  missCount <- vector("numeric", m)
  levels <- vector("numeric", m)
  # seq_len() yields an empty loop for a zero-column data frame (1:m would
  # iterate over c(1, 0) and fail on the first column access).
  for (i in seq_len(m)){
    x <- df[[i]]
    # class() may return several entries (e.g. ordered factors); collapse them
    # so exactly one string is stored per column.
    varType[i] <- paste(class(x), collapse = "/")
    xtab <- table(x, useNA = "ifany")
    levels[i] <- length(xtab)
    if (length(xtab) > 0){
      nums <- as.numeric(xtab)
      maxIndex <- which.max(nums)
      topCount[i] <- nums[maxIndex]
      topLevel[i] <- names(xtab)[maxIndex]
    } else {
      # Column without any values (zero-row data frame): nothing to report.
      topLevel[i] <- NA_character_
    }
    # Blank and single-space strings are counted as missing alongside NA.
    missIndex <- which((is.na(x)) | (x == "") | (x == " "))
    missCount[i] <- length(missIndex)
  }
  topFrac <- round(topCount / n, digits = dgts)
  missFrac <- round(missCount / n, digits = dgts)
  summaryFrame <- data.frame(variable = varNames, type = varType,
                             levels = levels, topLevel = topLevel,
                             topCount = topCount, topFrac = topFrac,
                             missFreq = missCount, missFrac = missFrac)
  return(summaryFrame)
}
# Per-column overview: type, number of distinct values, most frequent value
# and count of missing/blank entries.
BasicSummary(bikeData)
## variable type levels topLevel topCount topFrac missFreq
## 1 ï..ID integer 1000 11000 1 0.001 0
## 2 Marital.Status factor 3 Married 535 0.535 7
## 3 Gender factor 3 Male 500 0.500 11
## 4 Income integer 17 60000 165 0.165 6
## 5 Children integer 7 0 274 0.274 8
## 6 Education factor 5 Bachelors 306 0.306 0
## 7 Occupation factor 5 Professional 276 0.276 0
## 8 Home.Owner factor 3 Yes 682 0.682 4
## 9 Cars integer 6 2 342 0.342 9
## 10 Commute.Distance factor 5 0-1 Miles 366 0.366 0
## 11 Region factor 3 North America 508 0.508 0
## 12 Age integer 54 40 40 0.040 8
## 13 Purchased.Bike factor 2 No 519 0.519 0
## missFrac
## 1 0.000
## 2 0.007
## 3 0.011
## 4 0.006
## 5 0.008
## 6 0.000
## 7 0.000
## 8 0.004
## 9 0.009
## 10 0.000
## 11 0.000
## 12 0.008
## 13 0.000
EXPLANATION Of the 13 variables in the bike buyers dataset, 5 are integers and 8 are factors, so there are more factor variables than integer variables in this dataset.
# Mean of each numeric attribute (columns 4, 5, 9 and 12), ignoring NAs.
vapply(bikeData[c("Income", "Children", "Cars", "Age")], mean, numeric(1), na.rm = TRUE)
## Income Children Cars Age
## 56267.605634 1.910282 1.455096 44.181452
# Five-number summary (0%, 25%, 50%, 75%, 100%) of each numeric attribute
# (columns 4, 5, 9 and 12), ignoring NAs.
vapply(bikeData[c("Income", "Children", "Cars", "Age")], quantile, numeric(5), na.rm = TRUE)
## Income Children Cars Age
## 0% 10000 0 0 25
## 25% 30000 0 1 35
## 50% 60000 2 1 43
## 75% 70000 3 2 52
## 100% 170000 5 4 89
EXPLANATION
The customers have an average income of about 56,268 and about 2 children on average (mean 1.91, median 2). Customers mostly have at least one car (mean 1.46, median 1), and the average age of the customers is about 44 years.
# Draw the income boxplot, then write the values that boxplot.stats() flags
# as outliers into the plot margin.
boxplot(bikeData$Income, main = "Boxplot of income", ylab = "")
out <- boxplot.stats(bikeData$Income)$out
mtext(paste("Outliers: ", paste(out, collapse = ", ")))
EXPLANATION There are 10 income values detected as outliers by the boxplot. Those values are 150000, which occurs 4 times, 160000, which occurs 3 times, and 170000, which occurs 3 times.
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-variable description (counts, missing values, distinct values,
# frequencies/quantiles) using describe() from the Hmisc package loaded above.
describe(bikeData)
## bikeData
##
## 13 Variables 1000 Observations
## --------------------------------------------------------------------------------
## ï..ID
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 1000 1 19966 6176 11781 12627
## .25 .50 .75 .90 .95
## 15291 19744 24471 27544 28413
##
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital.Status
## n missing distinct
## 1000 0 3
##
## Value Married Single
## Frequency 7 535 458
## Proportion 0.007 0.535 0.458
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 1000 0 3
##
## Value Female Male
## Frequency 11 489 500
## Proportion 0.011 0.489 0.500
## --------------------------------------------------------------------------------
## Income
## n missing distinct Info Mean Gmd .05 .10
## 994 6 16 0.986 56268 34273 10000 20000
## .25 .50 .75 .90 .95
## 30000 60000 70000 100000 120000
##
## lowest : 10000 20000 30000 40000 50000, highest: 120000 130000 150000 160000 170000
##
## Value 10000 20000 30000 40000 50000 60000 70000 80000 90000
## Frequency 73 74 134 153 40 165 123 90 38
## Proportion 0.073 0.074 0.135 0.154 0.040 0.166 0.124 0.091 0.038
##
## Value 100000 110000 120000 130000 150000 160000 170000
## Frequency 29 16 17 32 4 3 3
## Proportion 0.029 0.016 0.017 0.032 0.004 0.003 0.003
## --------------------------------------------------------------------------------
## Children
## n missing distinct Info Mean Gmd
## 992 8 6 0.96 1.91 1.827
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 274 169 209 133 126 81
## Proportion 0.276 0.170 0.211 0.134 0.127 0.082
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 1000 0 5
##
## lowest : Bachelors Graduate Degree High School Partial College Partial High School
## highest: Bachelors Graduate Degree High School Partial College Partial High School
##
## Value Bachelors Graduate Degree High School
## Frequency 306 174 179
## Proportion 0.306 0.174 0.179
##
## Value Partial College Partial High School
## Frequency 265 76
## Proportion 0.265 0.076
## --------------------------------------------------------------------------------
## Occupation
## n missing distinct
## 1000 0 5
##
## lowest : Clerical Management Manual Professional Skilled Manual
## highest: Clerical Management Manual Professional Skilled Manual
##
## Value Clerical Management Manual Professional
## Frequency 177 173 119 276
## Proportion 0.177 0.173 0.119 0.276
##
## Value Skilled Manual
## Frequency 255
## Proportion 0.255
## --------------------------------------------------------------------------------
## Home.Owner
## n missing distinct
## 1000 0 3
##
## Value No Yes
## Frequency 4 314 682
## Proportion 0.004 0.314 0.682
## --------------------------------------------------------------------------------
## Cars
## n missing distinct Info Mean Gmd
## 991 9 5 0.925 1.455 1.226
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 238 267 342 85 59
## Proportion 0.240 0.269 0.345 0.086 0.060
## --------------------------------------------------------------------------------
## Commute.Distance
## n missing distinct
## 1000 0 5
##
## lowest : 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## highest: 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
##
## Value 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## Frequency 366 169 111 162 192
## Proportion 0.366 0.169 0.111 0.162 0.192
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 1000 0 3
##
## Value Europe North America Pacific
## Frequency 300 508 192
## Proportion 0.300 0.508 0.192
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 992 8 53 0.999 44.18 12.85 28.00 30.00
## .25 .50 .75 .90 .95
## 35.00 43.00 52.00 60.90 65.45
##
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased.Bike
## n missing distinct
## 1000 0 2
##
## Value No Yes
## Frequency 519 481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------
EXPLANATION
More customers are male than female.
The most common income is 60000 (165 customers), followed by 40000 (153 customers), while only 3 customers each have an income of 160000 and of 170000. There are 6 missing values in the income variable.
The most common number of children is 0 (274 customers). The other customers have children: 2 children, 1 child, 3 children, 4 children, or 5 children, in decreasing order of frequency. There are 8 missing values in the children variable.
The education background of the customers is most often Bachelors (306 customers) and least often Partial High School (76 customers).
Occupation of most customers is professional.
Customers mostly have their own house.
Most customers’ commute distance is 0-1 miles.
Most of the customers live in North America and the least in the Pacific.
The ages of the customers in this dataset range from 25 to 89; the five lowest ages are 25, 26, 27, 28, and 29, and the five highest are 73, 74, 78, 80, and 89. There are 8 missing values in the age variable.
In this data, there are fewer people who buy bikes than those who don’t.
Customers mostly have cars. There are 9 missing values in the cars variable.
Customers are mostly married.
# ThreeSigma: outlier limits of the form mean +/- t * standard deviation.
#
# Arguments:
#   x - numeric vector; NAs are ignored when computing mean and sd.
#   t - multiplier applied to the standard deviation (default 3).
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the standard deviation is exactly zero.
ThreeSigma <- function(x, t = 3){
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  if (spread == 0){
    message("All non-missing x-values are identical")
  }
  halfWidth <- t * spread
  list(up = centre + halfWidth, down = centre - halfWidth)
}
# Hampel: outlier limits of the form median +/- t * MAD.
#
# Arguments:
#   x - numeric vector; NAs are ignored when computing median and MAD.
#   t - multiplier applied to the MAD scale estimate (default 3).
#
# Returns a list with elements `up` and `down` (upper and lower limit).
# Emits a message when the MAD scale estimate is exactly zero.
Hampel <- function(x, t = 3){
  centre <- median(x, na.rm = TRUE)
  madScale <- mad(x, na.rm = TRUE)
  if (madScale == 0){
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  upper <- centre + t * madScale
  lower <- centre - t * madScale
  list(up = upper, down = lower)
}
# BoxplotRule: outlier limits from the boxplot (interquartile range) rule.
#
# Arguments:
#   x - numeric vector; NAs are ignored when computing the quartiles.
#   t - multiplier applied to the interquartile distance (default 1.5).
#
# Returns a list with elements
#   up   - upper quartile + t * interquartile distance
#   down - lower quartile - t * interquartile distance
# Emits a message when the interquartile distance is exactly zero.
BoxplotRule <- function(x, t = 1.5){
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q <- xU - xL
  if (Q == 0){
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up <- xU + t * Q
  # The lower fence is measured from the LOWER quartile; measuring it from the
  # upper quartile places it too high and over-reports low outliers.
  down <- xL - t * Q
  out <- list(up = up, down = down)
  return(out)
}
# ExtractDetails: describe the points of x that fall outside [down, up].
#
# Arguments:
#   x    - numeric vector (NAs are never flagged).
#   down - lower limit; values strictly below it are flagged "L".
#   up   - upper limit; values strictly above it are flagged "U".
#
# Returns a list with
#   nOut     - number of flagged points
#   lowLim   - the lower limit passed in
#   upLim    - the upper limit passed in
#   minNom   - smallest value of x that is >= down
#   maxNom   - largest value of x that is <= up
#   index    - positions of the flagged points (low ones first, then high)
#   values   - the flagged values, in the same order as index
#   outClass - "L" or "U" for each flagged point
ExtractDetails <- function(x, down, up){
  lowIdx <- which(x < down)
  highIdx <- which(x > up)
  flagged <- union(lowIdx, highIdx)

  # Label every position first, then keep only the flagged ones.
  labels <- rep("N", length(x))
  labels[lowIdx] <- "L"
  labels[highIdx] <- "U"

  nominalMax <- max(x[which(x <= up)])
  nominalMin <- min(x[which(x >= down)])

  list(nOut = length(flagged),
       lowLim = down,
       upLim = up,
       minNom = nominalMin,
       maxNom = nominalMax,
       index = flagged,
       values = x[flagged],
       outClass = labels[flagged])
}
# FindOutliers: apply three outlier rules to x and collect the results.
#
# Arguments:
#   x  - numeric vector to screen.
#   t3 - threshold passed to ThreeSigma() (default 3).
#   tH - threshold passed to Hampel() (default 3).
#   tb - threshold passed to BoxplotRule() (default 1.5).
#
# Returns a list with
#   summary     - data frame, one row per method, with columns method, n,
#                 nMiss, nOut, lowLim, upLim, minNom, maxNom
#   threeSigma  - data frame (index, values, type) of points flagged by the
#                 three-sigma rule
#   Hampel      - same, for the Hampel identifier
#   boxplotRule - same, for the boxplot rule
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
  # Limits first (all three), then the per-method details, in the fixed
  # order ThreeSigma, Hampel, BoxplotRule.
  limits <- list(ThreeSigma(x, t = t3),
                 Hampel(x, t = tH),
                 BoxplotRule(x, t = tb))
  details <- lapply(limits, function(lim) ExtractDetails(x, lim$down, lim$up))
  methodNames <- c("ThreeSigma", "Hampel", "BoxplotRule")

  nObs <- length(x)
  nNA <- sum(is.na(x))

  # One summary row per method, stacked in method order.
  summaryRows <- lapply(seq_along(details), function(k) {
    d <- details[[k]]
    data.frame(method = methodNames[k], n = nObs,
               nMiss = nNA, nOut = d$nOut,
               lowLim = d$lowLim,
               upLim = d$upLim,
               minNom = d$minNom,
               maxNom = d$maxNom)
  })
  sumFrame <- do.call(rbind.data.frame, summaryRows)

  # Per-method table of the flagged points.
  flaggedFrames <- lapply(details, function(d) {
    data.frame(index = d$index,
               values = d$values,
               type = d$outClass)
  })

  list(summary = sumFrame,
       threeSigma = flaggedFrames[[1]],
       Hampel = flaggedFrames[[2]],
       boxplotRule = flaggedFrames[[3]])
}
# Screen the Income column with all three outlier rules and show the
# per-method summary table.
fullSummary <- FindOutliers(bikeData[["Income"]])
fullSummary[["summary"]]
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 6 10 -36935.85 149471.1 10000 130000
## 2 Hampel 1000 6 10 -28956.00 148956.0 10000 130000
## 3 BoxplotRule 1000 6 10 10000.00 130000.0 10000 130000
EXPLANATION There are 10 outliers detected in the income variable, whether they are detected by the three-sigma rule, the Hampel identifier, or the boxplot rule. There are also 6 missing values in the income variable.
# Cross-tabulate gender against the purchase decision.
count <- table(bikeData[["Gender"]], bikeData[["Purchased.Bike"]])
count
##
## No Yes
## 8 3
## Female 250 239
## Male 261 239
EXPLANATION For both female and male customers, slightly more customers did not purchase a bike than purchased one.
# Cross-tabulate income against the purchase decision.
count <- table(bikeData[["Income"]], bikeData[["Purchased.Bike"]])
count
##
## No Yes
## 10000 45 28
## 20000 43 31
## 30000 81 53
## 40000 64 89
## 50000 20 20
## 60000 84 81
## 70000 58 65
## 80000 56 34
## 90000 14 24
## 100000 18 11
## 110000 8 8
## 120000 8 9
## 130000 17 15
## 150000 1 3
## 160000 0 3
## 170000 2 1
EXPLANATION In absolute numbers, few customers with a high income purchased a bike, while many more of the customers who purchased a bike have a low or middle income.
# Two panels side by side (panel 1 fills the left column, panel 2 the right):
# income by purchase decision, and commute distance against purchase decision.
layoutMat <- layout(matrix(rep(1:2, each = 2), nrow = 2),
                    widths = c(5, 10), heights = c(5, 10))
boxplot(Income ~ Purchased.Bike, data = bikeData,
        xlab = "Purchase Bike", ylab = "income")
mosaicplot(Commute.Distance ~ Purchased.Bike, data = bikeData,
           xlab = "Commute Distance", ylab = "Purchase Bike")
EXPLANATION 1. More customers did not purchase a bike than purchased one.
# Two panels side by side (panel 1 fills the left column, panel 2 the right):
# age by purchase decision, and marital status against purchase decision.
layoutMat <- layout(matrix(rep(1:2, each = 2), nrow = 2))
boxplot(Age ~ Purchased.Bike, data = bikeData,
        xlab = "Purchase Bike", ylab = "age")
mosaicplot(Marital.Status ~ Purchased.Bike, data = bikeData,
           xlab = "Marital Status", ylab = "Purchase Bike")
EXPLANATION 1. Customers who purchased a bike are mostly around 35-50 years old. There are 4 buyers who are about 70 years old or older. More customers did not purchase a bike than purchased one.