Read file csv

df <- read.csv('bike_buyers.csv')

1. Examining the basic data characteristics

a. dim function

dim(df)
## [1] 1000   13

Explanation Terdapat 1000 baris dan 13 kolom

b. str function

str(df)
## 'data.frame':    1000 obs. of  13 variables:
##  $ ï..ID           : int  12496 24107 14177 24381 25597 13507 27974 19364 22155 19280 ...
##  $ Marital.Status  : chr  "Married" "Married" "Married" "Single" ...
##  $ Gender          : chr  "Female" "Male" "Male" "" ...
##  $ Income          : int  40000 30000 80000 70000 30000 10000 160000 40000 20000 NA ...
##  $ Children        : int  1 3 5 0 0 2 2 1 2 2 ...
##  $ Education       : chr  "Bachelors" "Partial College" "Partial College" "Bachelors" ...
##  $ Occupation      : chr  "Skilled Manual" "Clerical" "Professional" "Professional" ...
##  $ Home.Owner      : chr  "Yes" "Yes" "No" "Yes" ...
##  $ Cars            : int  0 1 2 1 0 0 4 0 2 1 ...
##  $ Commute.Distance: chr  "0-1 Miles" "0-1 Miles" "2-5 Miles" "5-10 Miles" ...
##  $ Region          : chr  "Europe" "Europe" "Europe" "Pacific" ...
##  $ Age             : int  42 43 60 41 36 50 33 43 58 NA ...
##  $ Purchased.Bike  : chr  "No" "No" "No" "Yes" ...

Explanation Tipe Data

  • ID, Income, Children, Cars, Age -> integer

  • Marital.Status, Education. Occupation, Home.Owner, Commute.Distance, Purchased.Bike -> character/string

c. summary function

BasicSummary <- function(df, dgts = 3){
## #
## ################################################################
## #
## # Create a basic summary of variables in the data frame df,
## # a data frame with one row for each column of df giving the
## # variable name, type, number of unique levels, the most
## # frequent level, its frequency and corresponding fraction of
## # records, the number of missing values and its corresponding
## # fraction of records
## #
## ################################################################
## #
m <- ncol(df)
varNames <- colnames(df)
varType <- vector("character",m)
topLevel <- vector("character",m)
topCount <- vector("numeric",m)
missCount <- vector("numeric",m)
levels <- vector("numeric", m)

for (i in 1:m){
x <- df[,i]
varType[i] <- class(x)
xtab <- table(x, useNA = "ifany")
levels[i] <- length(xtab)
nums <- as.numeric(xtab)
maxnum <- max(nums)
topCount[i] <- maxnum
maxIndex <- which.max(nums)
lvls <- names(xtab)
topLevel[i] <- lvls[maxIndex]
missIndex <- which((is.na(x)) | (x == "") | (x == " "))
missCount[i] <- length(missIndex)
}
n <- nrow(df)
topFrac <- round(topCount/n, digits = dgts)
missFrac <- round(missCount/n, digits = dgts)
## #
summaryFrame <- data.frame(variable = varNames, type = varType,
 levels = levels, topLevel = topLevel,
 topCount = topCount, topFrac = topFrac,
 missFreq = missCount, missFrac = missFrac)
 return(summaryFrame)
 }

BasicSummary(df)
##            variable      type levels      topLevel topCount topFrac missFreq
## 1             ï..ID   integer   1000         11000        1   0.001        0
## 2    Marital.Status character      3       Married      535   0.535        7
## 3            Gender character      3          Male      500   0.500       11
## 4            Income   integer     17         60000      165   0.165        6
## 5          Children   integer      7             0      274   0.274        8
## 6         Education character      5     Bachelors      306   0.306        0
## 7        Occupation character      5  Professional      276   0.276        0
## 8        Home.Owner character      3           Yes      682   0.682        4
## 9              Cars   integer      6             2      342   0.342        9
## 10 Commute.Distance character      5     0-1 Miles      366   0.366        0
## 11           Region character      3 North America      508   0.508        0
## 12              Age   integer     54            40       40   0.040        8
## 13   Purchased.Bike character      2            No      519   0.519        0
##    missFrac
## 1     0.000
## 2     0.007
## 3     0.011
## 4     0.006
## 5     0.008
## 6     0.000
## 7     0.000
## 8     0.004
## 9     0.009
## 10    0.000
## 11    0.000
## 12    0.008
## 13    0.000

Explanation

  1. Terdapat 2 tipe data yaitu 5 integer, dan 8 character

  2. Pada variable ID terdapat 1000 unique variable berarti tidak ada yang double

  3. Orang dengan status menikah yang paling banyak

  4. Paling banyak punya mobil 2

2 Examining Summary Statistics

sapply(df, function(x) sum(is.na(x)))
##            ï..ID   Marital.Status           Gender           Income 
##                0                0                0                6 
##         Children        Education       Occupation       Home.Owner 
##                8                0                0                0 
##             Cars Commute.Distance           Region              Age 
##                9                0                0                8 
##   Purchased.Bike 
##                0

Explanation

  1. Terdapat missing value pada variable children, cars, income, dan Age
df$Age[is.na(df$Age)] = 0
m<-mean(df$Age)
df$Age[df$Age==0]<-m

df$Children[is.na(df$Children)] = 0
m<-mean(df$Children)
df$Children[df$Children==0]<-m

df$Cars[is.na(df$Cars)] = 0
m<-mean(df$Cars)
df$Cars[df$Cars==0]<-m

df$Income[is.na(df$Income)] = 0
m<-mean(df$Income)
df$Income[df$Income==0]<-m
sapply(df, function(x) sum(is.na(x)))
##            ï..ID   Marital.Status           Gender           Income 
##                0                0                0                0 
##         Children        Education       Occupation       Home.Owner 
##                0                0                0                0 
##             Cars Commute.Distance           Region              Age 
##                0                0                0                0 
##   Purchased.Bike 
##                0

Explanation

sudah tidak terdapat missing value

a. sapply() function

# Compute the mean of each column
sapply(df[, c(4, 5, 9, 12)], mean, na.rm=TRUE)
##       Income     Children         Cars          Age 
## 56265.580000     2.429390     1.798174    44.178624
# Compute quartiles
sapply(df[, c(4, 5, 9, 12)], quantile, na.rm=TRUE)
##      Income Children  Cars Age
## 0%    10000    1.000 1.000  25
## 25%   30000    1.895 1.000  35
## 50%   60000    2.000 1.442  43
## 75%   70000    3.000 2.000  52
## 100% 170000    5.000 4.000  89

Explanation

  1. Income rata-rata adalah 56268

  2. Rata-rata mempunyai 2 anak

  3. Rata-rata punya 1 mobil

  4. Rata-rata buyers umur 44

b. describe() function

https://cran.r-project.org/web/packages/Hmisc/Hmisc.pdf

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
describe(df)
## df 
## 
##  13  Variables      1000  Observations
## --------------------------------------------------------------------------------
## ï..ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0     1000        1    19966     6176    11781    12627 
##      .25      .50      .75      .90      .95 
##    15291    19744    24471    27544    28413 
## 
## lowest : 11000 11047 11061 11090 11116, highest: 29337 29355 29380 29424 29447
## --------------------------------------------------------------------------------
## Marital.Status 
##        n  missing distinct 
##      993        7        2 
##                           
## Value      Married  Single
## Frequency      535     458
## Proportion   0.539   0.461
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      989       11        2 
##                         
## Value      Female   Male
## Frequency     489    500
## Proportion  0.494  0.506
## --------------------------------------------------------------------------------
## Income 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       17    0.986    56266    34159    10000    20000 
##      .25      .50      .75      .90      .95 
##    30000    60000    70000   100000   120000 
## 
## lowest :  10000  20000  30000  40000  50000, highest: 120000 130000 150000 160000 170000
##                                                                          
## Value       10000  20000  30000  40000  50000  55930  60000  70000  80000
## Frequency      73     74    134    153     40      6    165    123     90
## Proportion  0.073  0.074  0.134  0.153  0.040  0.006  0.165  0.123  0.090
##                                                                   
## Value       90000 100000 110000 120000 130000 150000 160000 170000
## Frequency      38     29     16     17     32      4      3      3
## Proportion  0.038  0.029  0.016  0.017  0.032  0.004  0.003  0.003
## --------------------------------------------------------------------------------
## Children 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        6    0.959    2.429    1.231 
## 
## lowest : 1.000 1.895 2.000 3.000 4.000, highest: 1.895 2.000 3.000 4.000 5.000
##                                               
## Value      1.000 1.895 2.000 3.000 4.000 5.000
## Frequency    169   282   209   133   126    81
## Proportion 0.169 0.282 0.209 0.133 0.126 0.081
## --------------------------------------------------------------------------------
## Education 
##        n  missing distinct 
##     1000        0        5 
## 
## lowest : Bachelors           Graduate Degree     High School         Partial College     Partial High School
## highest: Bachelors           Graduate Degree     High School         Partial College     Partial High School
##                                                                       
## Value                Bachelors     Graduate Degree         High School
## Frequency                  306                 174                 179
## Proportion               0.306               0.174               0.179
##                                                   
## Value          Partial College Partial High School
## Frequency                  265                  76
## Proportion               0.265               0.076
## --------------------------------------------------------------------------------
## Occupation 
##        n  missing distinct 
##     1000        0        5 
## 
## lowest : Clerical       Management     Manual         Professional   Skilled Manual
## highest: Clerical       Management     Manual         Professional   Skilled Manual
##                                                                       
## Value            Clerical     Management         Manual   Professional
## Frequency             177            173            119            276
## Proportion          0.177          0.173          0.119          0.276
##                          
## Value      Skilled Manual
## Frequency             255
## Proportion          0.255
## --------------------------------------------------------------------------------
## Home.Owner 
##        n  missing distinct 
##      996        4        2 
##                       
## Value         No   Yes
## Frequency    314   682
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## Cars 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        5    0.925    1.798   0.8102 
## 
## lowest : 1.000 1.442 2.000 3.000 4.000, highest: 1.000 1.442 2.000 3.000 4.000
##                                         
## Value      1.000 1.442 2.000 3.000 4.000
## Frequency    267   247   342    85    59
## Proportion 0.267 0.247 0.342 0.085 0.059
## --------------------------------------------------------------------------------
## Commute.Distance 
##        n  missing distinct 
##     1000        0        5 
## 
## lowest : 0-1 Miles  1-2 Miles  10+ Miles  2-5 Miles  5-10 Miles
## highest: 0-1 Miles  1-2 Miles  10+ Miles  2-5 Miles  5-10 Miles
##                                                                  
## Value       0-1 Miles  1-2 Miles  10+ Miles  2-5 Miles 5-10 Miles
## Frequency         366        169        111        162        192
## Proportion      0.366      0.169      0.111      0.162      0.192
## --------------------------------------------------------------------------------
## Region 
##        n  missing distinct 
##     1000        0        3 
##                                                     
## Value             Europe North America       Pacific
## Frequency            300           508           192
## Proportion         0.300         0.508         0.192
## --------------------------------------------------------------------------------
## Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       54    0.999    44.18    12.79    28.00    30.00 
##      .25      .50      .75      .90      .95 
##    35.00    43.00    52.00    60.10    65.05 
## 
## lowest : 25 26 27 28 29, highest: 73 74 78 80 89
## --------------------------------------------------------------------------------
## Purchased.Bike 
##        n  missing distinct 
##     1000        0        2 
##                       
## Value         No   Yes
## Frequency    519   481
## Proportion 0.519 0.481
## --------------------------------------------------------------------------------

Explanation 1. calon pembeli kebanyakan sudah menikah (frequency 535) 2. kebanyakan laki-laki yang membeli 3. Income nya paling rendah 10000 paling tinggi 170000, frequency income terbanyak 60000 4. Kebanyakan yang beli pendidikan bachelors 5. Rata-rata yang membeli umur 45 6. Region paling banyak dari Amerika 508

3. Look for data anomalies

a. qqplot() function

library(car)
## Loading required package: carData
qqPlot(df$Income)

## [1] 13 44

b. Boxplot() function for Outlier Single Variable

out <- boxplot.stats(df$Income)$out

boxplot(df$Income,
  ylab = "",
  main = "Boxplot"
)

mtext(paste("Outliers: ", paste(out, collapse = ", ")))

Explanation

outliers paling banyak 160000, dan outliers yaitu 150000, 160000, 170000 #### c. FindOutlier() function

ThreeSigma <- function(x, t = 3){

 mu <- mean(x, na.rm = TRUE)
 sig <- sd(x, na.rm = TRUE)
 if (sig == 0){
 message("All non-missing x-values are identical")
}
 up <- mu + t * sig
 down <- mu - t * sig
 out <- list(up = up, down = down)
 return(out)
 }

Hampel <- function(x, t = 3){

 mu <- median(x, na.rm = TRUE)
 sig <- mad(x, na.rm = TRUE)
 if (sig == 0){
 message("Hampel identifer implosion: MAD scale estimate is zero")
 }
 up <- mu + t * sig
 down <- mu - t * sig
 out <- list(up = up, down = down)
 return(out)
 }
   
BoxplotRule<- function(x, t = 1.5){

 xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
 xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
 Q <- xU - xL
 if (Q == 0){
 message("Boxplot rule implosion: interquartile distance is zero")
 }
 up <- xU + t * Q
 down <- xU - t * Q
 out <- list(up = up, down = down)
 return(out)
}   

ExtractDetails <- function(x, down, up){

 outClass <- rep("N", length(x))
 indexLo <- which(x < down)
 indexHi <- which(x > up)
 outClass[indexLo] <- "L"
 outClass[indexHi] <- "U"
 index <- union(indexLo, indexHi)
 values <- x[index]
 outClass <- outClass[index]
 nOut <- length(index)
 maxNom <- max(x[which(x <= up)])
 minNom <- min(x[which(x >= down)])
 outList <- list(nOut = nOut, lowLim = down,
 upLim = up, minNom = minNom,
 maxNom = maxNom, index = index,
 values = values,
 outClass = outClass)
 return(outList)
 }
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
 threeLims <- ThreeSigma(x, t = t3)
 HampLims <- Hampel(x, t = tH)
 boxLims <- BoxplotRule(x, t = tb)

 n <- length(x)
 nMiss <- length(which(is.na(x)))

 threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
 HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
 boxList <- ExtractDetails(x, boxLims$down, boxLims$up)

 sumFrame <- data.frame(method = "ThreeSigma", n = n,
 nMiss = nMiss, nOut = threeList$nOut,
 lowLim = threeList$lowLim,
 upLim = threeList$upLim,
 minNom = threeList$minNom,
 maxNom = threeList$maxNom)
 upFrame <- data.frame(method = "Hampel", n = n,
 nMiss = nMiss, nOut = HampList$nOut,
 lowLim = HampList$lowLim,
 upLim = HampList$upLim,
 minNom = HampList$minNom,
 maxNom = HampList$maxNom)
 sumFrame <- rbind.data.frame(sumFrame, upFrame)
 upFrame <- data.frame(method = "BoxplotRule", n = n,
 nMiss = nMiss, nOut = boxList$nOut,
 lowLim = boxList$lowLim,
 upLim = boxList$upLim,
 minNom = boxList$minNom,
 maxNom = boxList$maxNom)
 sumFrame <- rbind.data.frame(sumFrame, upFrame)

 threeFrame <- data.frame(index = threeList$index,
 values = threeList$values,
 type = threeList$outClass)
 HampFrame <- data.frame(index = HampList$index,
 values = HampList$values,
 type = HampList$outClass)
 boxFrame <- data.frame(index = boxList$index,
 values = boxList$values,
 type = boxList$outClass)
 outList <- list(summary = sumFrame, threeSigma = threeFrame,
 Hampel = HampFrame, boxplotRule = boxFrame)
 return(outList)
}
fullSummary <- FindOutliers(df$Income)
fullSummary$summary
##        method    n nMiss nOut    lowLim    upLim minNom maxNom
## 1  ThreeSigma 1000     0   10 -36657.59 149188.8  10000 130000
## 2      Hampel 1000     0   10 -28956.00 148956.0  10000 130000
## 3 BoxplotRule 1000     0   10  10000.00 130000.0  10000 130000

4. Look at the relations between key variables

count <- table(df$Income, df$Marital.Status)
count
##         
##              Married Single
##   10000    0      33     40
##   20000    1      34     39
##   30000    0      54     80
##   40000    2      90     61
##   50000    0      24     16
##   55930    1       3      2
##   60000    1     104     60
##   70000    0      66     57
##   80000    0      45     45
##   90000    1      21     16
##   1e+05    1      15     13
##   110000   0       6     10
##   120000   0      11      6
##   130000   0      23      9
##   150000   0       3      1
##   160000   0       1      2
##   170000   0       2      1

Explanation

Orang yang sudah menikah income nya paling besar bisa dilihat dari kolom di atas. Sedangkan orang yang single income nya lebih kecil rata-rata.

# Create the layout
nf <- layout( matrix(c(1,1,2,3), nrow=2, byrow=TRUE) )


# Fill with plots
mosaicplot(Age ~ Income, data = df, main = "", las = 1, shade = TRUE)

# Scatterplot between sugars and carbs
plot(df$Cars, df$Income)

boxplot(Age ~ Income, data = df, xlab = "Income", ylab ="Age")