Loading Libraries:

# Global knitr chunk defaults for this document: hide warnings and messages,
# centre figures at 85% width, and cache chunk results.
knitr::opts_chunk$set(
  cache = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  out.width = "85%"
)
library(openintro)
library(tidyverse)
library(feather)

Part I: Lab Exercise


Lab 1: Read a CSV file from your disk into a tibble.

# Load the bank churn data from its explicit location instead of calling
# setwd(): changing the working directory is a hidden, process-wide side
# effect, and knitr resets the directory after each chunk anyway.
# path.expand() resolves the leading "~" to the user's home directory.
bank_data <- read_csv(path.expand("~/Desktop/BankChurners.csv"))

# Print a compact, one-line-per-column overview of the imported tibble.
glimpse(bank_data)
## Rows: 10,127
## Columns: 23
## $ CLIENTNUM                                                                                                                          <dbl> …
## $ Attrition_Flag                                                                                                                     <chr> …
## $ Customer_Age                                                                                                                       <dbl> …
## $ Gender                                                                                                                             <chr> …
## $ Dependent_count                                                                                                                    <dbl> …
## $ Education_Level                                                                                                                    <chr> …
## $ Marital_Status                                                                                                                     <chr> …
## $ Income_Category                                                                                                                    <chr> …
## $ Card_Category                                                                                                                      <chr> …
## $ Months_on_book                                                                                                                     <dbl> …
## $ Total_Relationship_Count                                                                                                           <dbl> …
## $ Months_Inactive_12_mon                                                                                                             <dbl> …
## $ Contacts_Count_12_mon                                                                                                              <dbl> …
## $ Credit_Limit                                                                                                                       <dbl> …
## $ Total_Revolving_Bal                                                                                                                <dbl> …
## $ Avg_Open_To_Buy                                                                                                                    <dbl> …
## $ Total_Amt_Chng_Q4_Q1                                                                                                               <dbl> …
## $ Total_Trans_Amt                                                                                                                    <dbl> …
## $ Total_Trans_Ct                                                                                                                     <dbl> …
## $ Total_Ct_Chng_Q4_Q1                                                                                                                <dbl> …
## $ Avg_Utilization_Ratio                                                                                                              <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 <dbl> …
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 <dbl> …

Lab 2: “John Smith WA 418-Y11-4111
Mary Hartford CA 319-Z19-4341
Evan Nolan IL 219-532-c301”

1. Try to use read_csv to read the following text. What do you get?

# Parse the space-aligned text as CSV. The text contains no commas, so every
# line comes back as one character field; col_names = FALSE keeps the first
# line as data rather than treating it as a header. FALSE is spelled out
# because the shorthand F is an ordinary variable that can be reassigned.
read_csv("John Smith          WA        418-Y11-4111
Mary Hartford       CA        319-Z19-4341
Evan Nolan          IL        219-532-c301", col_names = FALSE)
## # A tibble: 3 × 1
##   X1                                        
##   <chr>                                     
## 1 John Smith          WA        418-Y11-4111
## 2 Mary Hartford       CA        319-Z19-4341
## 3 Evan Nolan          IL        219-532-c301

2. Try to use read_delim to read the same text. What do you get?

# Feed the same literal text to read_delim() without naming a delimiter or
# column names, so the first line is used as the header row.
"John Smith          WA        418-Y11-4111
Mary Hartford       CA        319-Z19-4341
Evan Nolan          IL        219-532-c301" %>%
  read_delim()
## # A tibble: 2 × 20
##   John  Smith  ...3  ...4  ...5  ...6  ...7  ...8  ...9  ...10 ...11 WA    ...13
##   <chr> <chr>  <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <chr> <lgl> <lgl> <chr> <lgl>
## 1 Mary  Hartf… NA    NA    NA    NA    NA    NA    CA    NA    NA    <NA>  NA   
## 2 Evan  Nolan  NA    NA    NA    NA    NA    NA    <NA>  NA    NA    IL    NA   
## # ℹ 7 more variables: ...14 <lgl>, ...15 <lgl>, ...16 <lgl>, ...17 <chr>,
## #   ...18 <lgl>, ...19 <lgl>, `418-Y11-4111` <chr>

Lab 3: Divide the bank customer data into two parts: one containing only female customers and one containing only male customers. Save them into two separate files, “female_bank_data.rds” and “male_bank_data.rds”.

# Keep only the rows whose Gender is "F", store them as an RDS file, then
# read that file back in so the saved contents are printed.
Bank_F <- filter(bank_data, Gender == "F")

write_rds(Bank_F, "female_bank_data.rds")
read_rds("female_bank_data.rds")
## # A tibble: 5,358 × 23
##    CLIENTNUM Attrition_Flag  Customer_Age Gender Dependent_count Education_Level
##        <dbl> <chr>                  <dbl> <chr>            <dbl> <chr>          
##  1 818770008 Existing Custo…           49 F                    5 Graduate       
##  2 769911858 Existing Custo…           40 F                    4 High School    
##  3 712396908 Existing Custo…           57 F                    2 Graduate       
##  4 709327383 Existing Custo…           45 F                    2 Graduate       
##  5 708508758 Attrited Custo…           62 F                    0 Graduate       
##  6 811604133 Existing Custo…           47 F                    4 Unknown        
##  7 771071958 Existing Custo…           41 F                    3 Graduate       
##  8 718813833 Existing Custo…           44 F                    3 Uneducated     
##  9 788658483 Existing Custo…           53 F                    2 College        
## 10 715318008 Existing Custo…           55 F                    1 College        
## # ℹ 5,348 more rows
## # ℹ 17 more variables: Marital_Status <chr>, Income_Category <chr>,
## #   Card_Category <chr>, Months_on_book <dbl>, Total_Relationship_Count <dbl>,
## #   Months_Inactive_12_mon <dbl>, Contacts_Count_12_mon <dbl>,
## #   Credit_Limit <dbl>, Total_Revolving_Bal <dbl>, Avg_Open_To_Buy <dbl>,
## #   Total_Amt_Chng_Q4_Q1 <dbl>, Total_Trans_Amt <dbl>, Total_Trans_Ct <dbl>,
## #   Total_Ct_Chng_Q4_Q1 <dbl>, Avg_Utilization_Ratio <dbl>, …
# Keep only the rows whose Gender is "M", store them as an RDS file, then
# read that file back in so the saved contents are printed.
Bank_M <- filter(bank_data, Gender == "M")

write_rds(Bank_M, "male_bank_data.rds")
read_rds("male_bank_data.rds")
## # A tibble: 4,769 × 23
##    CLIENTNUM Attrition_Flag  Customer_Age Gender Dependent_count Education_Level
##        <dbl> <chr>                  <dbl> <chr>            <dbl> <chr>          
##  1 768805383 Existing Custo…           45 M                    3 High School    
##  2 713982108 Existing Custo…           51 M                    3 Graduate       
##  3 709106358 Existing Custo…           40 M                    3 Uneducated     
##  4 713061558 Existing Custo…           44 M                    2 Graduate       
##  5 810347208 Existing Custo…           51 M                    4 Unknown        
##  6 818906208 Existing Custo…           32 M                    0 High School    
##  7 710930508 Existing Custo…           37 M                    3 Uneducated     
##  8 719661558 Existing Custo…           48 M                    2 Graduate       
##  9 708790833 Existing Custo…           42 M                    5 Uneducated     
## 10 710821833 Existing Custo…           65 M                    1 Unknown        
## # ℹ 4,759 more rows
## # ℹ 17 more variables: Marital_Status <chr>, Income_Category <chr>,
## #   Card_Category <chr>, Months_on_book <dbl>, Total_Relationship_Count <dbl>,
## #   Months_Inactive_12_mon <dbl>, Contacts_Count_12_mon <dbl>,
## #   Credit_Limit <dbl>, Total_Revolving_Bal <dbl>, Avg_Open_To_Buy <dbl>,
## #   Total_Amt_Chng_Q4_Q1 <dbl>, Total_Trans_Amt <dbl>, Total_Trans_Ct <dbl>,
## #   Total_Ct_Chng_Q4_Q1 <dbl>, Avg_Utilization_Ratio <dbl>, …