df <- read.csv('bike_buyers.csv')
dim(df)
## [1] 1000 13
Explanation Terdapat 1000 baris dan 13 kolom
str(df)
## 'data.frame': 1000 obs. of 13 variables:
## $ ï..ID : int 12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
## $ Marital.Status : chr "Married" "Married" "Married" "Single" ...
## $ Gender : chr "Female" "Male" "Male" "" ...
## $ Income : int 40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
## $ Children : int 1 3 5 0 0 2 2 1 2 2 ...
## $ Education : chr "Bachelors" "Partial College" "Partial College" "Bachelors" ...
## $ Occupation : chr "Skilled Manual" "Clerical" "Professional" "Professional" ...
## $ Home.Owner : chr "Yes" "Yes" "No" "Yes" ...
## $ Cars : int 0 1 2 1 0 0 4 0 2 1 ...
## $ Commute.Distance: chr "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
## $ Region : chr "Europe" "Europe" "Europe" "Pacific" ...
## $ Age : int 42 43 60 41 36 50 33 43 58 NA ...
## $ Purchased.Bike : chr "No" "No" "No" "Yes" ...
Explanation Tipe Data
ID, Income, Children, Cars, Age -> integer
Marital.Status, Education. Occupation, Home.Owner, Commute.Distance, Purchased.Bike -> character/string
BasicSummary <- function(df, dgts = 3){
## #
## ################################################################
## #
## # Create a basic summary of variables in the data frame df,
## # a data frame with one row for each column of df giving the
## # variable name, type, number of unique levels, the most
## # frequent level, its frequency and corresponding fraction of
## # records, the number of missing values and its corresponding
## # fraction of records
## #
## ################################################################
## #
m <- ncol(df)
varNames <- colnames(df)
varType <- vector("character",m)
topLevel <- vector("character",m)
topCount <- vector("numeric",m)
missCount <- vector("numeric",m)
levels <- vector("numeric", m)
for (i in 1:m){
x <- df[,i]
varType[i] <- class(x)
xtab <- table(x, useNA = "ifany")
levels[i] <- length(xtab)
nums <- as.numeric(xtab)
maxnum <- max(nums)
topCount[i] <- maxnum
maxIndex <- which.max(nums)
lvls <- names(xtab)
topLevel[i] <- lvls[maxIndex]
missIndex <- which((is.na(x)) | (x == "") | (x == " "))
missCount[i] <- length(missIndex)
}
n <- nrow(df)
topFrac <- round(topCount/n, digits = dgts)
missFrac <- round(missCount/n, digits = dgts)
## #
summaryFrame <- data.frame(variable = varNames, type = varType,
levels = levels, topLevel = topLevel,
topCount = topCount, topFrac = topFrac,
missFreq = missCount, missFrac = missFrac)
return(summaryFrame)
}
BasicSummary(df)
## variable type levels topLevel topCount topFrac missFreq
## 1 ï..ID integer 1000 11000 1 0.001 0
## 2 Marital.Status character 3 Married 535 0.535 7
## 3 Gender character 3 Male 500 0.500 11
## 4 Income integer 17 60000 165 0.165 6
## 5 Children integer 7 0 274 0.274 8
## 6 Education character 5 Bachelors 306 0.306 0
## 7 Occupation character 5 Professional 276 0.276 0
## 8 Home.Owner character 3 Yes 682 0.682 4
## 9 Cars integer 6 2 342 0.342 9
## 10 Commute.Distance character 5 0-1 Miles 366 0.366 0
## 11 Region character 3 North America 508 0.508 0
## 12 Age integer 54 40 40 0.040 8
## 13 Purchased.Bike character 2 No 519 0.519 0
## missFrac
## 1 0.000
## 2 0.007
## 3 0.011
## 4 0.006
## 5 0.008
## 6 0.000
## 7 0.000
## 8 0.004
## 9 0.009
## 10 0.000
## 11 0.000
## 12 0.008
## 13 0.000
Explanation
Terdapat 2 tipe data yaitu 5 integer, dan 8 character
Pada variable ID terdapat 1000 unique variable berarti tidak ada yang double
Orang dengan status menikah yang paling banyak
Paling banyak punya mobil 2
sapply(df, function(x) sum(is.na(x)))
## ï..ID Marital.Status Gender Income
## 0 0 0 6
## Children Education Occupation Home.Owner
## 8 0 0 0
## Cars Commute.Distance Region Age
## 9 0 0 8
## Purchased.Bike
## 0
Explanation
df$Age[is.na(df$Age)] = 0
m<-mean(df$Age)
df$Age[df$Age==0]<-m
df$Children[is.na(df$Children)] = 0
m<-mean(df$Children)
df$Children[df$Children==0]<-m
df$Cars[is.na(df$Cars)] = 0
m<-mean(df$Cars)
df$Cars[df$Cars==0]<-m
df$Income[is.na(df$Income)] = 0
m<-mean(df$Income)
df$Income[df$Income==0]<-m
sapply(df, function(x) sum(is.na(x)))
## ï..ID Marital.Status Gender Income
## 0 0 0 0
## Children Education Occupation Home.Owner
## 0 0 0 0
## Cars Commute.Distance Region Age
## 0 0 0 0
## Purchased.Bike
## 0
Explanation
sudah tidak terdapat missing value
# Compute the mean of each column
sapply(df[, c(4, 5, 9, 12)], mean, na.rm=TRUE)
## Income Children Cars Age
## 56265.580000 2.429390 1.798174 44.178624
# Compute quartiles
sapply(df[, c(4, 5, 9, 12)], quantile, na.rm=TRUE)
## Income Children Cars Age
## 0% 10000 1.000 1.000 25
## 25% 30000 1.895 1.000 35
## 50% 60000 2.000 1.442 43
## 75% 70000 3.000 2.000 52
## 100% 170000 5.000 4.000 89
Explanation
Income rata-rata adalah 56268
Rata-rata mempunyai 2 anak
Rata-rata punya 1 mobil
Rata-rata buyers umur 44
https://cran.r-project.org/web/packages/Hmisc/Hmisc.pdf
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
describe(df)
## df
##
## 13 Variables 1000 Observations
## --------------------------------------------------------------------------------
## ï..ID
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 1000 1 19966 6176 11781 12627
## .25 .50 .75 .90 .95
## 15291 19744 24471 27544 28413
##
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital.Status
## n missing distinct
## 993 7 2
##
## Value Married Single
## Frequency 535 458
## Proportion 0.539 0.461
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 989 11 2
##
## Value Female Male
## Frequency 489 500
## Proportion 0.494 0.506
## --------------------------------------------------------------------------------
## Income
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 17 0.986 56266 34159 10000 20000
## .25 .50 .75 .90 .95
## 30000 60000 70000 100000 120000
##
## lowest : 10000 20000 30000 40000 50000, highest: 120000 130000 150000 160000 170000
##
## Value 10000 20000 30000 40000 50000 55930 60000 70000 80000
## Frequency 73 74 134 153 40 6 165 123 90
## Proportion 0.073 0.074 0.134 0.153 0.040 0.006 0.165 0.123 0.090
##
## Value 90000 100000 110000 120000 130000 150000 160000 170000
## Frequency 38 29 16 17 32 4 3 3
## Proportion 0.038 0.029 0.016 0.017 0.032 0.004 0.003 0.003
## --------------------------------------------------------------------------------
## Children
## n missing distinct Info Mean Gmd
## 1000 0 6 0.959 2.429 1.231
##
## lowest : 1.000 1.895 2.000 3.000 4.000, highest: 1.895 2.000 3.000 4.000 5.000
##
## Value 1.000 1.895 2.000 3.000 4.000 5.000
## Frequency 169 282 209 133 126 81
## Proportion 0.169 0.282 0.209 0.133 0.126 0.081
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 1000 0 5
##
## lowest : Bachelors Graduate Degree High School Partial College Partial High School
## highest: Bachelors Graduate Degree High School Partial College Partial High School
##
## Value Bachelors Graduate Degree High School
## Frequency 306 174 179
## Proportion 0.306 0.174 0.179
##
## Value Partial College Partial High School
## Frequency 265 76
## Proportion 0.265 0.076
## --------------------------------------------------------------------------------
## Occupation
## n missing distinct
## 1000 0 5
##
## lowest : Clerical Management Manual Professional Skilled Manual
## highest: Clerical Management Manual Professional Skilled Manual
##
## Value Clerical Management Manual Professional
## Frequency 177 173 119 276
## Proportion 0.177 0.173 0.119 0.276
##
## Value Skilled Manual
## Frequency 255
## Proportion 0.255
## --------------------------------------------------------------------------------
## Home.Owner
## n missing distinct
## 996 4 2
##
## Value No Yes
## Frequency 314 682
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## Cars
## n missing distinct Info Mean Gmd
## 1000 0 5 0.925 1.798 0.8102
##
## lowest : 1.000 1.442 2.000 3.000 4.000, highest: 1.000 1.442 2.000 3.000 4.000
##
## Value 1.000 1.442 2.000 3.000 4.000
## Frequency 267 247 342 85 59
## Proportion 0.267 0.247 0.342 0.085 0.059
## --------------------------------------------------------------------------------
## Commute.Distance
## n missing distinct
## 1000 0 5
##
## lowest : 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## highest: 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
##
## Value 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## Frequency 366 169 111 162 192
## Proportion 0.366 0.169 0.111 0.162 0.192
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 1000 0 3
##
## Value Europe North America Pacific
## Frequency 300 508 192
## Proportion 0.300 0.508 0.192
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 1000 0 54 0.999 44.18 12.79 28.00 30.00
## .25 .50 .75 .90 .95
## 35.00 43.00 52.00 60.10 65.05
##
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased.Bike
## n missing distinct
## 1000 0 2
##
## Value No Yes
## Frequency 519 481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------
Explanation 1. calon pembeli kebanyakan sudah menikah (frequency 535) 2. kebanyakan laki-laki yang membeli 3. Income nya paling rendah 10000 paling tinggi 170000, frequency income terbanyak 60000 4. Kebanyakan yang beli pendidikan bachelors 5. Rata-rata yang membeli umur 45 6. Region paling banyak dari Amerika 508
library(car)
## Loading required package: carData
qqPlot(df$Income)
## [1] 13 44
out <- boxplot.stats(df$Income)$out
boxplot(df$Income,
ylab = "",
main = "Boxplot"
)
mtext(paste("Outliers: ", paste(out, collapse = ", ")))
Explanation
outliers paling banyak 160000, dan outliers yaitu 150000, 160000, 170000 #### c. FindOutlier() function
ThreeSigma <- function(x, t = 3){
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
Hampel <- function(x, t = 3){
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up <- mu + t * sig
down <- mu - t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule<- function(x, t = 1.5){
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q <- xU - xL
if (Q == 0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up <- xU + t * Q
down <- xU - t * Q
out <- list(up = up, down = down)
return(out)
}
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
fullSummary <- FindOutliers(df$Income)
fullSummary$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 1000 0 10 -36657.59 149188.8 10000 130000
## 2 Hampel 1000 0 10 -28956.00 148956.0 10000 130000
## 3 BoxplotRule 1000 0 10 10000.00 130000.0 10000 130000
count <- table(df$Income, df$Marital.Status)
count
##
## Married Single
## 10000 0 33 40
## 20000 1 34 39
## 30000 0 54 80
## 40000 2 90 61
## 50000 0 24 16
## 55930 1 3 2
## 60000 1 104 60
## 70000 0 66 57
## 80000 0 45 45
## 90000 1 21 16
## 1e+05 1 15 13
## 110000 0 6 10
## 120000 0 11 6
## 130000 0 23 9
## 150000 0 3 1
## 160000 0 1 2
## 170000 0 2 1
Explanation
Orang yang sudah menikah income nya paling besar bisa dilihat dari kolom di atas. Sedangkan orang yang single income nya lebih kecil rata-rata.
# Create the layout
nf <- layout( matrix(c(1,1,2,3), nrow=2, byrow=TRUE) )
# Fill with plots
mosaicplot(Age ~ Income, data = df, main = "", las = 1, shade = TRUE)
# Scatterplot between sugars and carbs
plot(df$Cars, df$Income)
boxplot(Age ~ Income, data = df, xlab = "Income", ylab ="Age")