Cleaning the dataset

Load libraries

library(tidyverse)
library(ggplot2)

Load and inspect the dataset

Dataset can be downloaded from kaggle.

df <- read_csv('bike_buyers.csv')
Rows: 1000 Columns: 13── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Marital Status, Gender, Education, Occupation, Home Owner, Commute Distance, Region, Purchas...
dbl (5): ID, Income, Children, Cars, Age
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t(head(df,3))
                 [,1]             [,2]              [,3]             
ID               "12496"          "24107"           "14177"          
Marital Status   "Married"        "Married"         "Married"        
Gender           "Female"         "Male"            "Male"           
Income           "40000"          "30000"           "80000"          
Children         "1"              "3"               "5"              
Education        "Bachelors"      "Partial College" "Partial College"
Occupation       "Skilled Manual" "Clerical"        "Professional"   
Home Owner       "Yes"            "Yes"             "No"             
Cars             "0"              "1"               "2"              
Commute Distance "0-1 Miles"      "0-1 Miles"       "2-5 Miles"      
Region           "Europe"         "Europe"          "Europe"         
Age              "42"             "43"              "60"             
Purchased Bike   "No"             "No"              "No"             

Size of the dataset: rows and columns.

dim(df)
[1] 1000   13

Rename variable names to lower case

df <- df %>% rename_with(tolower)
names(df) <- gsub(' ', '_', names(df))
names(df)
 [1] "id"               "marital_status"   "gender"           "income"           "children"        
 [6] "education"        "occupation"       "home_owner"       "cars"             "commute_distance"
[11] "region"           "age"              "purchased_bike"  

Check the data type of every variable.

# get variable type
data.frame(var_type = sapply(df, class)) %>% arrange(var_type)

Check unique values of all variables except for id, income and age.

selected_vars <- df %>% select(-c(id, income, age))
selected_vars %>% apply(MARGIN = 2, FUN = unique)
$marital_status
[1] "Married" "Single"  NA       

$gender
[1] "Female" "Male"   NA      

$children
[1] " 1" " 3" " 5" " 0" " 2" " 4" NA  

$education
[1] "Bachelors"           "Partial College"     "High School"         "Partial High School"
[5] "Graduate Degree"    

$occupation
[1] "Skilled Manual" "Clerical"       "Professional"   "Manual"         "Management"    

$home_owner
[1] "Yes" "No"  NA   

$cars
[1] " 0" " 1" " 2" " 4" NA   " 3"

$commute_distance
[1] "0-1 Miles"  "2-5 Miles"  "5-10 Miles" "1-2 Miles"  "10+ Miles" 

$region
[1] "Europe"        "Pacific"       "North America"

$purchased_bike
[1] "No"  "Yes"

Conversion of datatypes

Some variables could be converted to categorical variables (variables with only a few possible values). On the other hand, I will convert to logical variable the columns home_owner and purchased_bike.

df$marital_status <- as.factor(tolower(df$marital_status))
df$gender <- as.factor(tolower(df$gender))
df$education <- as.factor(df$education)
df$occupation <- as.factor(tolower(df$occupation))
# remove ' Miles' from commute_distance values

df$commute_distance <- gsub(' Miles', '', df$commute_distance)
df$commute_distance <- factor(df$commute_distance,
                                                                 levels = c("0-1", "1-2", "2-5", "5-10" , "10+"),
                                                                 ordered= TRUE )
df$region <- as.factor(df$region)
df$purchased_bike <- ifelse(df$purchased_bike =='Yes', TRUE, FALSE)

Check number of missing values

data.frame( count = df %>% apply(MARGIN=2, FUN=function(x) sum(is.na(x))) ) %>%
    arrange(-count)

I will check the rows with missing values:

df[!complete.cases(df$gender),]

For simplicity’s sake, I will drop all rows with missing values. Since the size of the dataset is 1000 rows, dropping at most 11 rows will only suppose losing 0.011% of the data.

df <- df %>% filter(complete.cases(df))

Get a sample of the total dataframe.

t(sample_n(df, 3))
                 [,1]          [,2]             [,3]        
id               "20870"       "25419"          "18145"     
marital_status   "single"      "single"         "married"   
gender           "female"      "male"           "male"      
income           "10000"       "50000"          "80000"     
children         "2"           "2"              "5"         
education        "High School" "Bachelors"      "Bachelors" 
occupation       "manual"      "skilled manual" "management"
home_owner       "Yes"         "No"             "No"        
cars             "1"           "1"              "2"         
commute_distance "0-1"         "0-1"            "2-5"       
region           "Europe"      "North America"  "Europe"    
age              "38"          "38"             "62"        
purchased_bike   "TRUE"        "TRUE"           "FALSE"     

Exploratory Data Analysis

I will answer the following questions to find insights from the dataset.

  1. What is the average age of bike buyers in the dataset?
  2. Can you provide the distribution of bike purchases based on gender?
  3. Which bike model has the highest sales in the dataset?
  4. What is the median income of bike buyers?
  5. Is there a correlation between age and the frequency of bike purchases?
  6. Can you identify the top three regions with the highest bike sales?
  7. What is the most common occupation among bike buyers?
  8. How does the education level of bike buyers vary across different regions?
  9. What is the average number of bikes purchased per household?
  10. Is there a relationship between marital status and bike purchases?

1. What is the average age of bike buyers in the dataset?

mean(df$age)
[1] 44.2563

2. Can you provide the distribution of bike purchases based on gender?

ggplot(df, aes(y = sum(df$purchased_bike), x=gender, fill=gender) ) +
    geom_col() +
    labs(title='Number of bikes purchased by gender', y='Bikes purchased' )

LS0tCnRpdGxlOiAiRURBIChFeHBsb3JhdG9yeSBEYXRhIEFuYWx5c3lzKSBvZiBCaWtlIEJ1eWVycyBkYXRhc2V0IHBlcmZvcm1lZCB3aXRoIFIiCmF1dGhvcjogJ0RhbmkgW0BjaHVzazJdKGh0dHBzOi8vdHdpdHRlci5jb20vY2h1c2syKScKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Kb3B0aW9ucyhyZXByLm1heHRleHQgPSAxMDApCmBgYAoKIyBDbGVhbmluZyB0aGUgZGF0YXNldAoKIyMjIExvYWQgbGlicmFyaWVzCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ3Bsb3QyKQpgYGAKCiMjIyBMb2FkIGFuZCBpbnNwZWN0IHRoZSBkYXRhc2V0CkRhdGFzZXQgY2FuIGJlIFtkb3dubG9hZGVkIGZyb20ga2FnZ2xlXShodHRwczovL3d3dy5rYWdnbGUuY29tL2RhdGFzZXRzL2hlZXJhbGRlZGhpYS9iaWtlLWJ1eWVycykuCmBgYHtyfQpkZiA8LSByZWFkX2NzdignYmlrZV9idXllcnMuY3N2JykKdChoZWFkKGRmLDMpKQpgYGAKIyMjIFNpemUgb2YgdGhlIGRhdGFzZXQ6IHJvd3MgYW5kIGNvbHVtbnMuCmBgYHtyfQpkaW0oZGYpCmBgYAoKCiMjIyBSZW5hbWUgdmFyaWFibGUgbmFtZXMgdG8gbG93ZXIgY2FzZQpgYGB7cn0KZGYgPC0gZGYgJT4lIHJlbmFtZV93aXRoKHRvbG93ZXIpCm5hbWVzKGRmKSA8LSBnc3ViKCcgJywgJ18nLCBuYW1lcyhkZikpCm5hbWVzKGRmKQpgYGAKCiMjIyBDaGVjayB0aGUgZGF0YSB0eXBlIG9mIGV2ZXJ5IHZhcmlhYmxlLgpgYGB7cn0KIyBnZXQgdmFyaWFibGUgdHlwZQpkYXRhLmZyYW1lKHZhcl90eXBlID0gc2FwcGx5KGRmLCBjbGFzcykpICU+JSBhcnJhbmdlKHZhcl90eXBlKQpgYGAKCiMjIyBDaGVjayB1bmlxdWUgdmFsdWVzIG9mIGFsbCB2YXJpYWJsZXMgZXhjZXB0IGZvciBgaWRgLCBgaW5jb21lYCBhbmQgYGFnZWAuCmBgYHtyfQpzZWxlY3RlZF92YXJzIDwtIGRmICU+JSBzZWxlY3QoLWMoaWQsIGluY29tZSwgYWdlKSkKc2VsZWN0ZWRfdmFycyAlPiUgYXBwbHkoTUFSR0lOID0gMiwgRlVOID0gdW5pcXVlKQpgYGAKCiMjIyBDb252ZXJzaW9uIG9mIGRhdGF0eXBlcwpTb21lIHZhcmlhYmxlcyBjb3VsZCBiZSBjb252ZXJ0ZWQgdG8gY2F0ZWdvcmljYWwgdmFyaWFibGVzICh2YXJpYWJsZXMgd2l0aCBvbmx5IGEgZmV3IHBvc3NpYmxlIHZhbHVlcykuIE9uIHRoZSBvdGhlciBoYW5kLCBJIHdpbGwgY29udmVydCB0byBsb2dpY2FsIHZhcmlhYmxlIHRoZSBjb2x1bW5zIGBob21lX293bmVyYCBhbmQgYHB1cmNoYXNlZF9iaWtlYC4KYGBge3J9CmRmJG1hcml0YWxfc3RhdHVzIDwtIGFzLmZhY3Rvcih0b2xvd2VyKGRmJG1hcml0YWxfc3RhdHVzKSkKZGYkZ2VuZGVyIDwtIGFzLmZhY3Rvcih0b2xvd2VyKGRmJGdlbmRlcikpCmRmJGVkdWNhdGlvbiA8LSBhcy5mYWN0b3IoZGYkZWR1Y2F0aW9uKQpkZiRvY2N1cGF0aW9uIDwtIGFzLmZhY3Rvcih0b2xvd2VyKGRmJG9jY3VwYXRpb24pKQojIHJlbW92ZSAnIE1pbGVzJyBmcm9tIGNvbW11dGVfZGlzdGFuY2UgdmFsdWVzCgpkZiRjb21tdXRlX2Rpc3RhbmNlIDwtIGdzdWIoJyBNaWxlcycsICcnLCBkZiRjb21tdXRlX2Rpc3RhbmNlKQpkZiRjb21tdXRlX2Rpc3RhbmNlIDwtIGZhY3RvcihkZiRjb21tdXRlX2Rpc3RhbmNlLAoJCQkJCQkJCQkJCQkJCQkJIGxldmVscyA9IGMoIjAtMSIsICIxLTIiLCAiMi01IiwgIjUtMTAiICwgIjEwKyIpLAoJCQkJCQkJCQkJCQkJCQkJIG9yZGVyZWQ9IFRSVUUgKQpkZiRyZWdpb24gPC0gYXMuZmFjdG9yKGRmJHJlZ2lvbikKZGYkcHVyY2hhc2VkX2Jpa2UgPC0gaWZlbHNlKGRmJHB1cmNoYXNlZF9iaWtlID09J1llcycsIFRSVUUsIEZBTFNFKQpgYGAKCiMjIyBDaGVjayBudW1iZXIgb2YgbWlzc2luZyB2YWx1ZXMKYGBge3J9CmRhdGEuZnJhbWUoIGNvdW50ID0gZGYgJT4lIGFwcGx5KE1BUkdJTj0yLCBGVU49ZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSkgKSAlPiUKCWFycmFuZ2UoLWNvdW50KQpgYGAKCkkgd2lsbCBjaGVjayB0aGUgcm93cyB3aXRoIG1pc3NpbmcgdmFsdWVzOgpgYGB7cn0KZGZbIWNvbXBsZXRlLmNhc2VzKGRmJGdlbmRlciksXQpgYGAKCkZvciBzaW1wbGljaXR5J3Mgc2FrZSwgSSB3aWxsIGRyb3AgYWxsIHJvd3Mgd2l0aCBtaXNzaW5nIHZhbHVlcy4gU2luY2UgdGhlIHNpemUgb2YgdGhlIGRhdGFzZXQgaXMgMTAwMCByb3dzLCBkcm9wcGluZyBhdCBtb3N0IDExIHJvd3Mgd2lsbCBvbmx5IHN1cHBvc2UgbG9zaW5nIDAuMDExJSBvZiB0aGUgZGF0YS4KYGBge3J9CmRmIDwtIGRmICU+JSBmaWx0ZXIoY29tcGxldGUuY2FzZXMoZGYpKQpgYGAKCiMjIyBHZXQgYSBzYW1wbGUgb2YgdGhlIHRvdGFsIGRhdGFmcmFtZS4KYGBge3J9CnQoc2FtcGxlX24oZGYsIDMpKQpgYGAKCiMgRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcwoKSSB3aWxsIGFuc3dlciB0aGUgZm9sbG93aW5nIHF1ZXN0aW9ucyB0byBmaW5kIGluc2lnaHRzIGZyb20gdGhlIGRhdGFzZXQuCgoxLiBXaGF0IGlzIHRoZSBhdmVyYWdlIGFnZSBvZiBiaWtlIGJ1eWVycyBpbiB0aGUgZGF0YXNldD8KMi4gQ2FuIHlvdSBwcm92aWRlIHRoZSBkaXN0cmlidXRpb24gb2YgYmlrZSBwdXJjaGFzZXMgYmFzZWQgb24gZ2VuZGVyPwozLiBXaGljaCBiaWtlIG1vZGVsIGhhcyB0aGUgaGlnaGVzdCBzYWxlcyBpbiB0aGUgZGF0YXNldD8KNC4gV2hhdCBpcyB0aGUgbWVkaWFuIGluY29tZSBvZiBiaWtlIGJ1eWVycz8KNS4gSXMgdGhlcmUgYSBjb3JyZWxhdGlvbiBiZXR3ZWVuIGFnZSBhbmQgdGhlIGZyZXF1ZW5jeSBvZiBiaWtlIHB1cmNoYXNlcz8KNi4gQ2FuIHlvdSBpZGVudGlmeSB0aGUgdG9wIHRocmVlIHJlZ2lvbnMgd2l0aCB0aGUgaGlnaGVzdCBiaWtlIHNhbGVzPwo3LiBXaGF0IGlzIHRoZSBtb3N0IGNvbW1vbiBvY2N1cGF0aW9uIGFtb25nIGJpa2UgYnV5ZXJzPwo4LiBIb3cgZG9lcyB0aGUgZWR1Y2F0aW9uIGxldmVsIG9mIGJpa2UgYnV5ZXJzIHZhcnkgYWNyb3NzIGRpZmZlcmVudCByZWdpb25zPwo5LiBXaGF0IGlzIHRoZSBhdmVyYWdlIG51bWJlciBvZiBiaWtlcyBwdXJjaGFzZWQgcGVyIGhvdXNlaG9sZD8KMTAuIElzIHRoZXJlIGEgcmVsYXRpb25zaGlwIGJldHdlZW4gbWFyaXRhbCBzdGF0dXMgYW5kIGJpa2UgcHVyY2hhc2VzPwoKIyMjIDEuIFdoYXQgaXMgdGhlIGF2ZXJhZ2UgYWdlIG9mIGJpa2UgYnV5ZXJzIGluIHRoZSBkYXRhc2V0PwpgYGB7cn0KbWVhbihkZiRhZ2UpCmBgYAoKIyMjIDIuIENhbiB5b3UgcHJvdmlkZSB0aGUgZGlzdHJpYnV0aW9uIG9mIGJpa2UgcHVyY2hhc2VzIGJhc2VkIG9uIGdlbmRlcj8KYGBge3J9CmdncGxvdChkZiwgYWVzKHkgPSBzdW0oZGYkcHVyY2hhc2VkX2Jpa2UpLCB4PWdlbmRlciwgZmlsbD1nZW5kZXIpICkgKwoJZ2VvbV9jb2woKSArCglsYWJzKHRpdGxlPSdOdW1iZXIgb2YgYmlrZXMgcHVyY2hhc2VkIGJ5IGdlbmRlcicsIHk9J0Jpa2VzIHB1cmNoYXNlZCcgKQpgYGAKCg==