library(tidyverse)
library(ggplot2)
The dataset can be downloaded from Kaggle.
# Load the bike buyers data from the working directory.
df <- 'bike_buyers.csv' %>% read_csv()
Rows: 1000 Columns: 13── Column specification ─────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): Marital Status, Gender, Education, Occupation, Home Owner, Commute Distance, Region, Purchas...
dbl (5): ID, Income, Children, Cars, Age
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# First three rows, transposed so that each row reads as a column.
df %>% head(3) %>% t()
[,1] [,2] [,3]
ID "12496" "24107" "14177"
Marital Status "Married" "Married" "Married"
Gender "Female" "Male" "Male"
Income "40000" "30000" "80000"
Children "1" "3" "5"
Education "Bachelors" "Partial College" "Partial College"
Occupation "Skilled Manual" "Clerical" "Professional"
Home Owner "Yes" "Yes" "No"
Cars "0" "1" "2"
Commute Distance "0-1 Miles" "0-1 Miles" "2-5 Miles"
Region "Europe" "Europe" "Europe"
Age "42" "43" "60"
Purchased Bike "No" "No" "No"
# Number of rows and columns.
df %>% dim()
[1] 1000 13
# Standardise column names: lower-case them and swap spaces for underscores.
df <- rename_with(df, function(nm) gsub(' ', '_', tolower(nm)))
names(df)
[1] "id" "marital_status" "gender" "income" "children"
[6] "education" "occupation" "home_owner" "cars" "commute_distance"
[11] "region" "age" "purchased_bike"
# Tabulate the class of every column, sorted by class name.
# vapply() guarantees a character vector; only the first class of each column
# is kept so that columns with several classes (e.g. ordered factors) still
# yield exactly one value per column.
data.frame(
  var_type = vapply(df, function(col) class(col)[1], character(1))
) %>%
  arrange(var_type)
# Inspect the distinct values of every variable except id, income and age.
selected_vars <- df %>% select(-c(id, income, age))
# NOTE: apply() coerces the data frame to a character matrix first, which is
# why numeric columns such as children and cars are listed as padded strings.
selected_vars %>% apply(MARGIN = 2, FUN = unique)
$marital_status
[1] "Married" "Single" NA
$gender
[1] "Female" "Male" NA
$children
[1] " 1" " 3" " 5" " 0" " 2" " 4" NA
$education
[1] "Bachelors" "Partial College" "High School" "Partial High School"
[5] "Graduate Degree"
$occupation
[1] "Skilled Manual" "Clerical" "Professional" "Manual" "Management"
$home_owner
[1] "Yes" "No" NA
$cars
[1] " 0" " 1" " 2" " 4" NA " 3"
$commute_distance
[1] "0-1 Miles" "2-5 Miles" "5-10 Miles" "1-2 Miles" "10+ Miles"
$region
[1] "Europe" "Pacific" "North America"
$purchased_bike
[1] "No" "Yes"
Some variables can be converted to categorical variables (factors), since
they take only a few possible values. In addition, I will convert
purchased_bike to a logical variable; home_owner is left as a
"Yes"/"No" text column.
# Recode the low-cardinality text columns as factors and the purchase flag as
# a logical. home_owner is not touched here.
df <- df %>%
  mutate(
    marital_status = as.factor(tolower(marital_status)),
    gender = as.factor(tolower(gender)),
    education = as.factor(education),
    occupation = as.factor(tolower(occupation)),
    # Strip the ' Miles' suffix, then order the distance bands from
    # shortest to longest.
    commute_distance = factor(
      gsub(' Miles', '', commute_distance),
      levels = c("0-1", "1-2", "2-5", "5-10", "10+"),
      ordered = TRUE
    ),
    region = as.factor(region),
    # TRUE for 'Yes', FALSE otherwise; NA stays NA.
    purchased_bike = purchased_bike == 'Yes'
  )
# Count missing values per column, most affected columns first.
# vapply() works column by column, so the data frame is not coerced to a
# character matrix the way apply() would do.
data.frame(
  count = vapply(df, function(col) sum(is.na(col)), integer(1))
) %>%
  arrange(desc(count))
I will check the rows with missing values:
# Rows in which gender is missing (other columns are not checked here).
df %>% filter(is.na(gender))
For simplicity’s sake, I will drop all rows with missing values. Since the dataset has 1000 rows, dropping at most 11 rows means losing at most 1.1% of the data.
# Keep only the rows without any missing value, then show three random rows
# (transposed so that each sampled row reads as a column).
df <- df[complete.cases(df), ]
df %>% sample_n(3) %>% t()
[,1] [,2] [,3]
id "20870" "25419" "18145"
marital_status "single" "single" "married"
gender "female" "male" "male"
income "10000" "50000" "80000"
children "2" "2" "5"
education "High School" "Bachelors" "Bachelors"
occupation "manual" "skilled manual" "management"
home_owner "Yes" "No" "No"
cars "1" "1" "2"
commute_distance "0-1" "0-1" "2-5"
region "Europe" "North America" "Europe"
age "38" "38" "62"
purchased_bike "TRUE" "TRUE" "FALSE"
I will answer the following questions to find insights from the dataset.
# Average age across all remaining rows.
df %>% pull(age) %>% mean()
[1] 44.2563
# Bar chart of the number of bikes purchased by each gender.
# The purchases are summed per gender before plotting: mapping y to
# sum(df$purchased_bike) would give every row the dataset-wide total, and
# geom_col() would then stack those totals once per row.
df %>%
  group_by(gender) %>%
  summarise(bikes_purchased = sum(purchased_bike), .groups = "drop") %>%
  ggplot(aes(x = gender, y = bikes_purchased, fill = gender)) +
  geom_col() +
  labs(title = 'Number of bikes purchased by gender', y = 'Bikes purchased')