Import Data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the telecom data set by building the full file path rather than calling
# setwd(), so loading the data does not change the session's working directory.
# NOTE(review): data_dir is still a machine-specific absolute path; switch to a
# project-relative location if the project layout allows it.
data_dir <- "/Users/smhenderson/Desktop/DATA101/R/Data101/data"
telecom <- read_csv(file.path(data_dir, "telecom.csv"))
## Rows: 10 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): customerID, TotalCharges, PaymentMethod, Churn
## dbl (1): MonthlyCharges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Review Data
# Compact overview of the tibble: each column's name, type, and first values.
str(telecom)
## spc_tbl_ [10 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customerID : chr [1:10] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ MonthlyCharges: num [1:10] 29.9 57 NA 42.3 70.7 ...
## $ TotalCharges : chr [1:10] "109.9" "na" "108.15" "1840.75" ...
## $ PaymentMethod : chr [1:10] "Electronic check" "Mailed check" "--" "Bank transfer" ...
## $ Churn : chr [1:10] "yes" "yes" "yes" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. customerID = col_character(),
## .. MonthlyCharges = col_double(),
## .. TotalCharges = col_character(),
## .. PaymentMethod = col_character(),
## .. Churn = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Dimensions of the data as c(rows, columns).
dim(telecom)
## [1] 10 5
Print table of value counts for the Churn variable
# Frequency table of the Churn column: number of rows per distinct value.
table(telecom[["Churn"]])
##
## no yes
## 5 5
Print the rows where MonthlyCharges > 55
# Keep only rows whose MonthlyCharges exceeds 55; rows where MonthlyCharges is
# NA are dropped too, because filter() keeps a row only when the condition is
# TRUE.
filter(telecom, MonthlyCharges > 55)
## # A tibble: 4 × 5
## customerID MonthlyCharges TotalCharges PaymentMethod Churn
## <chr> <dbl> <chr> <chr> <chr>
## 1 5575-GNVDE 57.0 na Mailed check yes
## 2 9237-HQITU 70.7 <NA> Electronic check no
## 3 1452-KIOVK 89.1 1949.4 Credit card no
## 4 7892-POOKP 105. 3046.05 Electronic check no
Change the data types of variables to appropriate types
# customerID works as a character variable, so it is left unchanged.
# TotalCharges was read as character; convert it to numeric. Entries that are
# not valid numbers (such as the string "na") become NA, which is what triggers
# the coercion warning.
telecom$TotalCharges <- as.numeric(telecom$TotalCharges)
## Warning: NAs introduced by coercion
# Re-check the structure to confirm TotalCharges is now numeric.
str(telecom)
## spc_tbl_ [10 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customerID : chr [1:10] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ MonthlyCharges: num [1:10] 29.9 57 NA 42.3 70.7 ...
## $ TotalCharges : num [1:10] 110 NA 108 1841 NA ...
## $ PaymentMethod : chr [1:10] "Electronic check" "Mailed check" "--" "Bank transfer" ...
## $ Churn : chr [1:10] "yes" "yes" "yes" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. customerID = col_character(),
## .. MonthlyCharges = col_double(),
## .. TotalCharges = col_character(),
## .. PaymentMethod = col_character(),
## .. Churn = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Change missing/not available data to “NA”
# PaymentMethod marks missing entries with the placeholder "--"; turn those
# into real NA values so they are treated as missing downstream.
telecom2 <- mutate(telecom, PaymentMethod = na_if(PaymentMethod, "--"))
Print the data with any rows containing “NA” removed
# Show only the complete cases: na.omit() drops every row with at least one NA.
telecom2 %>%
  na.omit() %>%
  print()
## # A tibble: 5 × 5
## customerID MonthlyCharges TotalCharges PaymentMethod Churn
## <chr> <dbl> <dbl> <chr> <chr>
## 1 7590-VHVEG 29.8 110. Electronic check yes
## 2 7795-CFOCW 42.3 1841. Bank transfer no
## 3 1452-KIOVK 89.1 1949. Credit card no
## 4 7892-POOKP 105. 3046. Electronic check no
## 5 8451-AJOMK 54.1 355. Electronic check no
Drop the Churn column from the data frame
# Build a copy of telecom2 without the Churn column.
telecom3 <- select(telecom2, -Churn)