library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
setwd("~/Data Study/Data 101 MC/Week6")
telecom <- read_csv("telecom.csv")
## Rows: 10 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): customerID, TotalCharges, PaymentMethod, Churn
## dbl (1): MonthlyCharges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Take a quick look around the data (using functions str, dim, etc.)

str(telecom)
## spec_tbl_df [10 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ customerID    : chr [1:10] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ MonthlyCharges: num [1:10] 29.9 57 NA 42.3 70.7 ...
##  $ TotalCharges  : chr [1:10] "109.9" "na" "108.15" "1840.75" ...
##  $ PaymentMethod : chr [1:10] "Electronic check" "Mailed check" "--" "Bank transfer" ...
##  $ Churn         : chr [1:10] "yes" "yes" "yes" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   customerID = col_character(),
##   ..   MonthlyCharges = col_double(),
##   ..   TotalCharges = col_character(),
##   ..   PaymentMethod = col_character(),
##   ..   Churn = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
dim(telecom)
## [1] 10  5

Review question: print out the table of value counts for the Chrun variable.

count(telecom,Churn)
## # A tibble: 2 × 2
##   Churn     n
##   <chr> <int>
## 1 no        5
## 2 yes       5

Print out to the screen (do not save to a new data frame) the rows where MonthlyCharges > 55

filter(telecom, MonthlyCharges>55)
## # A tibble: 4 × 5
##   customerID MonthlyCharges TotalCharges PaymentMethod    Churn
##   <chr>               <dbl> <chr>        <chr>            <chr>
## 1 5575-GNVDE           57.0 na           Mailed check     yes  
## 2 9237-HQITU           70.7 <NA>         Electronic check no   
## 3 1452-KIOVK           89.1 1949.4       Credit card      no   
## 4 7892-POOKP          105.  3046.05      Electronic check no

Change the data types of variables customerID and TotalCharges to appropriate types

telecom %>%
  mutate_at(c("TotalCharges"), as.numeric)
## Warning in mask$eval_all_mutate(quo): 강제형변환에 의해 생성된 NA 입니다
## # A tibble: 10 × 5
##    customerID MonthlyCharges TotalCharges PaymentMethod    Churn
##    <chr>               <dbl>        <dbl> <chr>            <chr>
##  1 7590-VHVEG           29.8         110. Electronic check yes  
##  2 5575-GNVDE           57.0          NA  Mailed check     yes  
##  3 3668-QPYBK           NA           108. --               yes  
##  4 7795-CFOCW           42.3        1841. Bank transfer    no   
##  5 9237-HQITU           70.7          NA  Electronic check no   
##  6 9305-CDSKC          NaN           820. --               yes  
##  7 1452-KIOVK           89.1        1949. Credit card      no   
##  8 6713-OKOMC           NA            NA  <NA>             yes  
##  9 7892-POOKP          105.         3046. Electronic check no   
## 10 8451-AJOMK           54.1         355. Electronic check no

Change all of the missing or not available data to “NA”

telecom <- telecom %>%
  mutate_all(na_if,"--")
telecom
## # A tibble: 10 × 5
##    customerID MonthlyCharges TotalCharges PaymentMethod    Churn
##    <chr>               <dbl> <chr>        <chr>            <chr>
##  1 7590-VHVEG           29.8 109.9        Electronic check yes  
##  2 5575-GNVDE           57.0 na           Mailed check     yes  
##  3 3668-QPYBK           NA   108.15       <NA>             yes  
##  4 7795-CFOCW           42.3 1840.75      Bank transfer    no   
##  5 9237-HQITU           70.7 <NA>         Electronic check no   
##  6 9305-CDSKC          NaN   820.5        <NA>             yes  
##  7 1452-KIOVK           89.1 1949.4       Credit card      no   
##  8 6713-OKOMC           NA   N/A          <NA>             yes  
##  9 7892-POOKP          105.  3046.05      Electronic check no   
## 10 8451-AJOMK           54.1 354.95       Electronic check no

Print out to the screen (do not save tot a new data frame) the data with any rows containing “NA” removed

telecom[rowSums(is.na(telecom))>0,]
## # A tibble: 4 × 5
##   customerID MonthlyCharges TotalCharges PaymentMethod    Churn
##   <chr>               <dbl> <chr>        <chr>            <chr>
## 1 3668-QPYBK           NA   108.15       <NA>             yes  
## 2 9237-HQITU           70.7 <NA>         Electronic check no   
## 3 9305-CDSKC          NaN   820.5        <NA>             yes  
## 4 6713-OKOMC           NA   N/A          <NA>             yes

Drop the Churn column from the data frame

telecomNoChurn <- select(telecom,-Churn)
telecomNoChurn
## # A tibble: 10 × 4
##    customerID MonthlyCharges TotalCharges PaymentMethod   
##    <chr>               <dbl> <chr>        <chr>           
##  1 7590-VHVEG           29.8 109.9        Electronic check
##  2 5575-GNVDE           57.0 na           Mailed check    
##  3 3668-QPYBK           NA   108.15       <NA>            
##  4 7795-CFOCW           42.3 1840.75      Bank transfer   
##  5 9237-HQITU           70.7 <NA>         Electronic check
##  6 9305-CDSKC          NaN   820.5        <NA>            
##  7 1452-KIOVK           89.1 1949.4       Credit card     
##  8 6713-OKOMC           NA   N/A          <NA>            
##  9 7892-POOKP          105.  3046.05      Electronic check
## 10 8451-AJOMK           54.1 354.95       Electronic check