Exploring Retail Customer Data: A Practical Data Science Project

A retail company has collected information from customers who made online or in-store purchases. The dataset includes customer names, demographic information, province, profession, phone number, email address, credit card information, purchase amount, preferred colour, IP address, visit time, weekday, and date.

Your task is to use dplyr functions from the tidyverse to explore the dataset and answer the questions below.

The dataset customer_data.csv required for this task is available in your designated Google Drive Folder.

1. Load the required packages and read the dataset. Use janitor::clean_names() to clean the column names.

library(tidyverse)
library(lubridate)

customer <- read_csv("customer_data.csv") %>% 
  janitor::clean_names() 
names(customer)

 [1] "prefix"     "first"      "last"       "email"      "gender"    
 [6] "age"        "company"    "profession" "phone"      "postal"    
[11] "province"   "cc_no"      "cc_exp"     "cc_type"    "price_cad" 
[16] "fav_color"  "ip"         "weekday"    "ampm"       "date"

2. View the first 5 rows of the dataset.

customer %>% 
  slice(1:5)

# A tibble: 5 × 20
  prefix first last  email gender   age company profession phone postal province
  <chr>  <chr> <chr> <chr> <chr>  <dbl> <chr>   <chr>      <chr> <chr>  <chr>   
1 Dr.    Ray   Mort… sebv… Male      38 Medtro… Health Th… (987… B6V 3… MB      
2 Miss   Clau… Rodr… acu@… Female    51 Ames D… Health Th… (356… G7M 5… SK      
3 Miss   Harry Meyer zuz@… Female    51 CSX Co… Political… (539… A0Z 6… NS      
4 Miss   Edith Gilb… hans… Female    55 Murphy… Transport… (984… P9I 9… YT      
5 Dr.    Lura  Murp… webe… Female    20 PETsMA… Statistic… (902… S1A 6… ON      
# ℹ 9 more variables: cc_no <dbl>, cc_exp <chr>, cc_type <chr>,
#   price_cad <dbl>, fav_color <chr>, ip <chr>, weekday <chr>, ampm <chr>,
#   date <chr>

3. How many rows and columns are in the dataset?

glimpse(customer)

Rows: 30,000
Columns: 20
$ prefix     <chr> "Dr.", "Miss", "Miss", "Miss", "Dr.", "Mrs.", "Mr.", "Mr.",…
$ first      <chr> "Ray", "Claudia", "Harry", "Edith", "Lura", "Andre", "Cody"…
$ last       <chr> "Morton", "Rodriquez", "Meyer", "Gilbert", "Murphy", "McGui…
$ email      <chr> "sebvajom@kol.km", "acu@jatsot.ug", "zuz@lo.wf", "hansohsi@…
$ gender     <chr> "Male", "Female", "Female", "Female", "Female", "Male", "Fe…
$ age        <dbl> 38, 51, 51, 55, 20, 50, 62, 36, 36, 53, 61, 27, 54, 36, 33,…
$ company    <chr> "Medtronic Inc.", "Ames Department Stores, Inc.", "CSX Corp…
$ profession <chr> "Health Therapist", "Health Therapist", "Political Scientis…
$ phone      <chr> "(987) 619-2695", "(356) 736-7352", "(539) 246-1806", "(984…
$ postal     <chr> "B6V 3W3", "G7M 5F3", "A0Z 6P9", "P9I 9H3", "S1A 6K0", "G8J…
$ province   <chr> "MB", "SK", "NS", "YT", "ON", "AB", "PE", "NU", "MB", "NT",…
$ cc_no      <dbl> 5.02e+15, 5.02e+15, 6.30e+15, 3.53e+15, 4.03e+15, 5.61e+15,…
$ cc_exp     <chr> "05/2018", "07/2028", "02/2023", "02/2028", "10/2025", "12/…
$ cc_type    <chr> "Solo", "Visa", "Switch", "Maestro", "Diners Club Internati…
$ price_cad  <dbl> 8.36, 68.31, 34.65, 64.59, 20.83, 25.32, 52.88, 10.69, 99.5…
$ fav_color  <chr> "Blue", "Black", "Black", "White", "Yellow", "Red", "Blue",…
$ ip         <chr> "126.23.139.2", "106.198.76.211", "186.150.187.29", "80.140…
$ weekday    <chr> "Sunday", "Tuesday", "Wednesday", "Saturday", "Friday", "Th…
$ ampm       <chr> "pm", "am", "pm", "am", "pm", "pm", "am", "pm", "am", "pm",…
$ date       <chr> "04/05/1930", "12/20/1926", "08/20/1931", "06/18/2001", "06…

4. Create a smaller dataset containing only customer name, gender, age, province, profession, phone, email, credit card type, spending amount, weekday, and date. Name the new dataset customer_small.

customer_small<- customer %>% 
  mutate(customer_name = paste(prefix, first, last)) %>% 
  rename(spendingAmount=price_cad) %>% 
  select(c(customer_name,gender, age, province, profession, phone, email, cc_type, weekday, date,-prefix,-first,-last))

customer_small

# A tibble: 30,000 × 10
   customer_name    gender   age province profession phone email cc_type weekday
   <chr>            <chr>  <dbl> <chr>    <chr>      <chr> <chr> <chr>   <chr>  
 1 Dr. Ray Morton   Male      38 MB       Health Th… (987… sebv… Solo    Sunday 
 2 Miss Claudia Ro… Female    51 SK       Health Th… (356… acu@… Visa    Tuesday
 3 Miss Harry Meyer Female    51 NS       Political… (539… zuz@… Switch  Wednes…
 4 Miss Edith Gilb… Female    55 YT       Transport… (984… hans… Maestro Saturd…
 5 Dr. Lura Murphy  Female    20 ON       Statistic… (902… webe… Diners… Friday 
 6 Mrs. Andre McGu… Male      50 AB       Communica… (770… to@h… Laser   Thursd…
 7 Mr. Cody Thomps… Female    62 PE       Special E… (837… nebu… Solo    Thursd…
 8 Mr. Cory Duncan  Male      36 NU       Urban Pla… (529… ruce… Diners… Friday 
 9 Miss Dorothy Wa… Female    36 MB       Veterinar… (279… cakp… Diners… Wednes…
10 Dr. Rose Tate    Female    53 NT       Software … (410… nafi… Discov… Saturd…
# ℹ 29,990 more rows
# ℹ 1 more variable: date <chr>

5. Create the following new variables in customer:

customer<- customer %>% 
  mutate(full_name=paste(first, last),.before=1) %>% 
  rename(price_CAD=price_cad) %>% 
  mutate(spending_group=case_when(
  price_CAD==0~"No spending",
  price_CAD<30 ~ "Low",
  price_CAD>=30 & price_CAD<80 ~ "Medium",
  price_CAD>80 ~"High")) %>% 
  select(full_name,price_CAD,spending_group,everything())

6. Find customers who spent 100 CAD or more. Keep only their full name, province, profession, credit card type, and spending amount.

customer %>% 
  filter(price_CAD>=100) %>% 
  select(full_name, province,profession,cc_type,price_CAD)

# A tibble: 3 × 5
  full_name        province profession             cc_type  price_CAD
  <chr>            <chr>    <chr>                  <chr>        <dbl>
1 Gregory Brown    PE       Novelist               Visa           100
2 Cody Christensen PE       Hospital Administrator JCB            100
3 Lizzie Dixon     NB       Compensation Analyst   Bankcard       100

7. Find female Structural Engineers from Alberta (AB).

customer %>% 
   filter(gender=="Female"& province=="AB" & profession=="Structural Engineer")

# A tibble: 4 × 22
  full_name     price_CAD spending_group prefix first last    email gender   age
  <chr>             <dbl> <chr>          <chr>  <chr> <chr>   <chr> <chr>  <dbl>
1 Roy Stanley        31.9 Medium         Dr.    Roy   Stanley apiu… Female    39
2 Lora Kennedy       53.3 Medium         Mr.    Lora  Kennedy denn… Female    51
3 Nell Richards      76.3 Medium         Mrs.   Nell  Richar… dob@… Female    18
4 Don McDaniel       62.7 Medium         Mrs.   Don   McDani… naf@… Female    37
# ℹ 13 more variables: company <chr>, profession <chr>, phone <chr>,
#   postal <chr>, province <chr>, cc_no <dbl>, cc_exp <chr>, cc_type <chr>,
#   fav_color <chr>, ip <chr>, weekday <chr>, ampm <chr>, date <chr>

8. Display the 10 highest-spending customers. Sort the data from highest to lowest spending.

customer %>% 
  arrange(desc(price_CAD)) %>% 
  slice(1:10)

# A tibble: 10 × 22
   full_name      price_CAD spending_group prefix first last  email gender   age
   <chr>              <dbl> <chr>          <chr>  <chr> <chr> <chr> <chr>  <dbl>
 1 Gregory Brown      100   High           Mrs.   Greg… Brown hav@… Female    31
 2 Cody Christen…     100   High           Mrs.   Cody  Chri… get@… Male      28
 3 Lizzie Dixon       100   High           Miss   Lizz… Dixon goh@… Female    38
 4 Jessie Lawren…     100.0 High           Mr.    Jess… Lawr… vavo… Female    58
 5 Christopher E…     100.0 High           Mrs.   Chri… Estr… ekit… Male      65
 6 Dollie Edwards     100.0 High           Dr.    Doll… Edwa… uriv… Female    40
 7 Edward Murray      100.0 High           Mrs.   Edwa… Murr… zeg@… Female    51
 8 Timothy Bass       100.0 High           Mr.    Timo… Bass  jo@d… Female    23
 9 Iva Morton         100.0 High           Miss   Iva   Mort… vumj… Male      48
10 Ruth West          100.0 High           Miss   Ruth  West  ara@… Female    26
# ℹ 13 more variables: company <chr>, profession <chr>, phone <chr>,
#   postal <chr>, province <chr>, cc_no <dbl>, cc_exp <chr>, cc_type <chr>,
#   fav_color <chr>, ip <chr>, weekday <chr>, ampm <chr>, date <chr>

9. Find the minimum, maximum, mean, and median age of customers.

customer %>% 
  summarise(mean_age=mean(age),median_age=median(age), max(age), min(age))

# A tibble: 1 × 4
  mean_age median_age `max(age)` `min(age)`
     <dbl>      <dbl>      <dbl>      <dbl>
1     41.6         42         65         18

10. Find the minimum, maximum, mean, and total spending amount.

customer %>% 
  summarise(mean_spendingAmount=mean(price_CAD),total_spendingAmount=sum(price_CAD), max(price_CAD), min(price_CAD))

# A tibble: 1 × 4
  mean_spendingAmount total_spendingAmount `max(price_CAD)` `min(price_CAD)`
                <dbl>                <dbl>            <dbl>            <dbl>
1                50.0             1499723.              100                0

11. Calculate the number of customers and average spending by gender.

customer %>%
  group_by(gender) %>%
  summarise(n_customers = n(), avg_spending = mean(price_CAD))

# A tibble: 2 × 3
  gender n_customers avg_spending
  <chr>        <int>        <dbl>
1 Female       15147         49.9
2 Male         14853         50.0

12. Calculate the number of customers and average spending by province. Then arrange the result by average spending from highest to lowest.

customer %>%
  group_by(province) %>%
  summarise(n_customers = n(), avg_spending = mean(price_CAD)) %>%
  arrange(desc(avg_spending))

# A tibble: 13 × 3
   province n_customers avg_spending
   <chr>          <int>        <dbl>
 1 NL              2320         51.5
 2 PE              2284         51.3
 3 NS              2234         50.8
 4 AB              2325         50.4
 5 ON              2372         50.3
 6 QC              2308         50.2
 7 MB              2338         50.1
 8 YT              2308         50.0
 9 NT              2300         49.7
10 NU              2350         49.5
11 BC              2299         49.1
12 SK              2335         48.5
13 NB              2227         48.5

13. Find the five most common professions.

customer %>%
  count(profession, sort = TRUE) %>%
  slice(1:5)

# A tibble: 5 × 2
  profession                 n
  <chr>                  <int>
1 Preschool Teacher        112
2 Distribution Manager     107
3 Scientific Programmer    106
4 Hospital Administrator   102
5 Market Res. Analyst      101

14. Find the number of customers by credit card type. Sort the result from highest to lowest frequency.

customer %>%
  count(cc_type, sort = TRUE)

# A tibble: 17 × 2
   cc_type                                n
   <chr>                              <int>
 1 Solo                                1867
 2 Visa Electron                       1815
 3 American Express                    1810
 4 Switch                              1806
 5 Laser                               1798
 6 China UnionPay                      1788
 7 Diners Club enRoute                 1758
 8 Maestro                             1756
 9 JCB                                 1751
10 Diners Club United States & Canada  1746
11 Diners Club International           1745
12 Mastercard                          1741
13 Discover Card                       1739
14 Diners Club Carte Blanche           1732
15 Bankcard                            1728
16 Visa                                1721
17 InstaPayment                        1699

15. Extract email providers, such as gmail.com or yahoo.com, and find the top 5 most common email providers. (The email provider is the part of the email address that comes after the “@” symbol)

?stringr

customer %>%
  mutate(email_provider = str_extract(email, "(?<=@).+")) %>%
  count(email_provider, sort = TRUE) %>%
  slice(1:5)

# A tibble: 5 × 2
  email_provider     n
  <chr>          <int>
1 gmail.com       1687
2 me.com          1676
3 outlook.com     1664
4 live.com        1660
5 hotmail.com     1659

16. Find how many customers use an email address containing am.edu.

customer %>%
  filter(str_detect(email, "am\\.edu")) %>%
  nrow()

[1] 1

17. Find the number of customers who visited on each weekday. Which weekday has the highest number of customers?

customer %>%
  count(weekday, sort = TRUE)

# A tibble: 7 × 2
  weekday       n
  <chr>     <int>
1 Saturday   4376
2 Wednesday  4365
3 Thursday   4327
4 Friday     4316
5 Monday     4216
6 Sunday     4213
7 Tuesday    4187

18. Find the top 3 provinces by total spending. First summarise total spending by province, then select the top 3 rows.

customer %>%
  group_by(province) %>%
  summarise(total_spending = sum(price_CAD)) %>%
  arrange(desc(total_spending)) %>%
  slice(1:3)

# A tibble: 3 × 2
  province total_spending
  <chr>             <dbl>
1 NL              119450.
2 ON              119379.
3 PE              117166.

19. Find the 5 customers with the lowest positive spending amount. Exclude customers who spent 0 CAD.

customer %>%
  filter(price_CAD > 0) %>%
  arrange(price_CAD) %>%
  slice(1:5)

# A tibble: 5 × 22
  full_name       price_CAD spending_group prefix first last  email gender   age
  <chr>               <dbl> <chr>          <chr>  <chr> <chr> <chr> <chr>  <dbl>
1 Eric Malone          0.01 Low            Mr.    Eric  Malo… woeb… Male      33
2 Marguerite Bla…      0.02 Low            Miss   Marg… Black wu@t… Female    59
3 Ruby Marsh           0.02 Low            Dr.    Ruby  Marsh suwm… Male      18
4 Catherine Byrd       0.03 Low            Dr.    Cath… Byrd  idoh… Female    22
5 Lucy Duncan          0.03 Low            Dr.    Lucy  Dunc… losi… Female    50
# ℹ 13 more variables: company <chr>, profession <chr>, phone <chr>,
#   postal <chr>, province <chr>, cc_no <dbl>, cc_exp <chr>, cc_type <chr>,
#   fav_color <chr>, ip <chr>, weekday <chr>, ampm <chr>, date <chr>

20. Find customers whose credit cards expire in 2019. Display full name, email, credit card type, expiry date, and spending amount.

customer %>%
  mutate(cc_exp_date = my(cc_exp)) %>%
  filter(year(cc_exp_date) == 2019) %>%
  select(full_name, email, cc_type, cc_exp, price_CAD)

# A tibble: 2,684 × 5
   full_name          email            cc_type                  cc_exp price_CAD
   <chr>              <chr>            <chr>                    <chr>      <dbl>
 1 Winifred Henderson ezamaw@radi.kn   Laser                    12/20…      99.5
 2 Carolyn Horton     dehje@mowakbo.tj InstaPayment             07/20…      44.5
 3 Virgie Adams       lodbinogu@ir.edu Diners Club Carte Blanc… 02/20…      66.0
 4 Blake Hart         nup@pov.nc       InstaPayment             10/20…      89.9
 5 Leila Carpenter    ke@wufiol.wf     Visa Electron            11/20…      29.6
 6 Charlie Cruz       ercono@mod.th    Mastercard               02/20…      35.3
 7 Lelia Hamilton     dog@di.cw        Diners Club United Stat… 05/20…      42.2
 8 Ronald Sims        puosmu@lug.ug    Switch                   09/20…      73.4
 9 Anthony Weaver     uwa@wovuog.uk    JCB                      03/20…      91.7
10 Tyler Logan        wobguz@vendu.ng  JCB                      06/20…      44.8
# ℹ 2,674 more rows

21. Find customers with the credit card number 5020000000000230. Then find how many unique email addresses are associated with this card.

customer %>%
  filter(cc_no == "5020000000000230") %>%
  summarise(unique_emails = n_distinct(email))

# A tibble: 1 × 1
  unique_emails
          <int>
1             2

22. Find the top 5 professions among customers who spent at least 75 CAD.

customer %>%
  filter(price_CAD >= 75) %>%
  count(profession, sort = TRUE) %>%
  slice(1:5)

# A tibble: 5 × 2
  profession               n
  <chr>                <int>
1 Preschool Teacher       32
2 Glamour Photographer    31
3 Estimator               30
4 Advertising Director    29
5 Broker                  29

23. Identify phone numbers that appear more than once. Then arrange the result by phone number.

customer %>%
  count(phone) %>%
  filter(n > 1) %>%
  arrange(phone)

# A tibble: 1 × 2
  phone              n
  <chr>          <int>
1 (263) 382-8004     2

24. Find the distribution of customers by spending group.

customer %>%
  count(spending_group)

# A tibble: 5 × 2
  spending_group     n
  <chr>          <int>
1 High            5936
2 Low             8938
3 Medium         15123
4 No spending        2
5 <NA>               1