Import Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the telecom data set by building its full path rather than calling
# setwd(): changing the working directory is a hidden side effect that every
# later relative path would silently depend on (and, when the document is
# knitted, the change does not persist past the chunk anyway).
data_dir <- "/Users/smhenderson/Desktop/DATA101/R/Data101/data"
telecom <- read_csv(file.path(data_dir, "telecom.csv"))
## Rows: 10 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): customerID, TotalCharges, PaymentMethod, Churn
## dbl (1): MonthlyCharges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Review Data

# Inspect the structure of the imported data: column names, column types,
# and the first few values of each column.
str(telecom)
## spc_tbl_ [10 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ customerID    : chr [1:10] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ MonthlyCharges: num [1:10] 29.9 57 NA 42.3 70.7 ...
##  $ TotalCharges  : chr [1:10] "109.9" "na" "108.15" "1840.75" ...
##  $ PaymentMethod : chr [1:10] "Electronic check" "Mailed check" "--" "Bank transfer" ...
##  $ Churn         : chr [1:10] "yes" "yes" "yes" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   customerID = col_character(),
##   ..   MonthlyCharges = col_double(),
##   ..   TotalCharges = col_character(),
##   ..   PaymentMethod = col_character(),
##   ..   Churn = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Report the number of rows and columns.
dim(telecom)
## [1] 10  5

Change the data types of variables to appropriate types

# customerID can stay as a character variable.
# TotalCharges was read in as character; coerce it to numeric.
# Entries that are not valid numbers (e.g. "na") become NA, which is what
# triggers the coercion warning.
telecom[["TotalCharges"]] <- as.numeric(telecom[["TotalCharges"]])
## Warning: NAs introduced by coercion
# Re-check the structure to confirm that TotalCharges is now numeric.
str(telecom)
## spc_tbl_ [10 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ customerID    : chr [1:10] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ MonthlyCharges: num [1:10] 29.9 57 NA 42.3 70.7 ...
##  $ TotalCharges  : num [1:10] 110 NA 108 1841 NA ...
##  $ PaymentMethod : chr [1:10] "Electronic check" "Mailed check" "--" "Bank transfer" ...
##  $ Churn         : chr [1:10] "yes" "yes" "yes" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   customerID = col_character(),
##   ..   MonthlyCharges = col_double(),
##   ..   TotalCharges = col_character(),
##   ..   PaymentMethod = col_character(),
##   ..   Churn = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Change missing/not available data to `NA`

# Recode the "--" placeholder in PaymentMethod as a missing value (NA);
# every other value is left unchanged.
telecom2 <- mutate(telecom, PaymentMethod = na_if(PaymentMethod, "--"))

Drop the Churn column from the data frame

# Remove the Churn column; all other columns are kept.
telecom3 <- select(telecom2, -Churn)