remove(list = ls()) # clear environment
zipcode_population <- telecom_zipcode_population <- read.csv("~/Downloads/telecom_zipcode_population.csv")
customer_churn <- telecom_customer_churn <- read.csv("~/Downloads/telecom_customer_churn.csv")
data_dictionary <- read.csv("~/Downloads/telecom_data_dictionary.csv", comment.char="#")
# importing all the datasets
- After analyzing the datasets, the zipcode population dataset and the
customer churn dataset have a shared variable of Zip.code. Since many
customers can live in the same zipcode, these datasets must be in a one
to many relationship. I will now merge the customer churn and zipcode
population datasets below.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
head(data_dictionary)
## Table Field
## 1 Customer Churn CustomerID
## 2 Customer Churn Gender
## 3 Customer Churn Age
## 4 Customer Churn Married
## 5 Customer Churn Number of Dependents
## 6 Customer Churn City
## Description
## 1 A unique ID that identifies each customer
## 2 The customer\x92s gender: Male, Female
## 3 The customer\x92s current age, in years, at the time the fiscal quarter ended (Q2 2022)
## 4 Indicates if the customer is married: Yes, No
## 5 Indicates the number of dependents that live with the customer (dependents could be children, parents, grandparents, etc.)
## 6 The city of the customer\x92s primary residence in California
head(customer_churn)
## Customer.ID Gender Age Married Number.of.Dependents City Zip.code
## 1 0002-ORFBO Female 37 Yes 0 Frazier Park 93225
## 2 0003-MKNFE Male 46 No 0 Glendale 91206
## 3 0004-TLHLJ Male 50 No 0 Costa Mesa 92627
## 4 0011-IGKFF Male 78 Yes 0 Martinez 94553
## 5 0013-EXCHZ Female 75 Yes 0 Camarillo 93010
## 6 0013-MHZWF Female 23 No 3 Midpines 95345
## Latitude Longitude Number.of.Referrals Tenure.in.Months Offer Phone.Service
## 1 34.82766 -118.9991 2 9 None Yes
## 2 34.16251 -118.2039 0 9 None Yes
## 3 33.64567 -117.9226 0 4 Offer E Yes
## 4 38.01446 -122.1154 1 13 Offer D Yes
## 5 34.22785 -119.0799 3 3 None Yes
## 6 37.58150 -119.9728 0 9 Offer E Yes
## Avg.Monthly.Long.Distance.Charges Multiple.Lines Internet.Service
## 1 42.39 No Yes
## 2 10.69 Yes Yes
## 3 33.65 No Yes
## 4 27.82 No Yes
## 5 7.38 No Yes
## 6 16.77 No Yes
## Internet.Type Avg.Monthly.GB.Download Online.Security Online.Backup
## 1 Cable 16 No Yes
## 2 Cable 10 No No
## 3 Fiber Optic 30 No No
## 4 Fiber Optic 4 No Yes
## 5 Fiber Optic 11 No No
## 6 Cable 73 No No
## Device.Protection.Plan Premium.Tech.Support Streaming.TV Streaming.Movies
## 1 No Yes Yes No
## 2 No No No Yes
## 3 Yes No No No
## 4 Yes No Yes Yes
## 5 No Yes Yes No
## 6 No Yes Yes Yes
## Streaming.Music Unlimited.Data Contract Paperless.Billing
## 1 No Yes One Year Yes
## 2 Yes No Month-to-Month No
## 3 No Yes Month-to-Month Yes
## 4 No Yes Month-to-Month Yes
## 5 No Yes Month-to-Month Yes
## 6 Yes Yes Month-to-Month Yes
## Payment.Method Monthly.Charge Total.Charges Total.Refunds
## 1 Credit Card 65.6 593.30 0.00
## 2 Credit Card -4.0 542.40 38.33
## 3 Bank Withdrawal 73.9 280.85 0.00
## 4 Bank Withdrawal 98.0 1237.85 0.00
## 5 Credit Card 83.9 267.40 0.00
## 6 Credit Card 69.4 571.45 0.00
## Total.Extra.Data.Charges Total.Long.Distance.Charges Total.Revenue
## 1 0 381.51 974.81
## 2 10 96.21 610.28
## 3 0 134.60 415.45
## 4 0 361.66 1599.51
## 5 0 22.14 289.54
## 6 0 150.93 722.38
## Customer.Status Churn.Category Churn.Reason
## 1 Stayed
## 2 Stayed
## 3 Churned Competitor Competitor had better devices
## 4 Churned Dissatisfaction Product dissatisfaction
## 5 Churned Dissatisfaction Network reliability
## 6 Stayed
head(zipcode_population)
## Zip.Code Population
## 1 90001 54492
## 2 90002 44586
## 3 90003 58198
## 4 90004 67852
## 5 90005 43019
## 6 90006 62784
- Since the customer churn is the main dataset, a left join is used to
merge
# merged data into summary_stats
summary_stats <- customer_churn |>
left_join(zipcode_population, by = c("Zip.code" = "Zip.Code")) |>
summary()
print(summary_stats)
## Customer.ID Gender Age Married
## Length:7043 Length:7043 Min. :19.00 Length:7043
## Class :character Class :character 1st Qu.:32.00 Class :character
## Mode :character Mode :character Median :46.00 Mode :character
## Mean :46.51
## 3rd Qu.:60.00
## Max. :80.00
##
## Number.of.Dependents City Zip.code Latitude
## Min. :0.0000 Length:7043 Min. :90001 Min. :32.56
## 1st Qu.:0.0000 Class :character 1st Qu.:92101 1st Qu.:33.99
## Median :0.0000 Mode :character Median :93518 Median :36.21
## Mean :0.4687 Mean :93486 Mean :36.20
## 3rd Qu.:0.0000 3rd Qu.:95329 3rd Qu.:38.16
## Max. :9.0000 Max. :96150 Max. :41.96
##
## Longitude Number.of.Referrals Tenure.in.Months Offer
## Min. :-124.3 Min. : 0.000 Min. : 1.00 Length:7043
## 1st Qu.:-121.8 1st Qu.: 0.000 1st Qu.: 9.00 Class :character
## Median :-119.6 Median : 0.000 Median :29.00 Mode :character
## Mean :-119.8 Mean : 1.952 Mean :32.39
## 3rd Qu.:-118.0 3rd Qu.: 3.000 3rd Qu.:55.00
## Max. :-114.2 Max. :11.000 Max. :72.00
##
## Phone.Service Avg.Monthly.Long.Distance.Charges Multiple.Lines
## Length:7043 Min. : 1.01 Length:7043
## Class :character 1st Qu.:13.05 Class :character
## Mode :character Median :25.69 Mode :character
## Mean :25.42
## 3rd Qu.:37.68
## Max. :49.99
## NA's :682
## Internet.Service Internet.Type Avg.Monthly.GB.Download
## Length:7043 Length:7043 Min. : 2.00
## Class :character Class :character 1st Qu.:13.00
## Mode :character Mode :character Median :21.00
## Mean :26.19
## 3rd Qu.:30.00
## Max. :85.00
## NA's :1526
## Online.Security Online.Backup Device.Protection.Plan
## Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Premium.Tech.Support Streaming.TV Streaming.Movies Streaming.Music
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Unlimited.Data Contract Paperless.Billing Payment.Method
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Monthly.Charge Total.Charges Total.Refunds Total.Extra.Data.Charges
## Min. :-10.00 Min. : 18.8 Min. : 0.000 Min. : 0.000
## 1st Qu.: 30.40 1st Qu.: 400.1 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 70.05 Median :1394.5 Median : 0.000 Median : 0.000
## Mean : 63.60 Mean :2280.4 Mean : 1.962 Mean : 6.861
## 3rd Qu.: 89.75 3rd Qu.:3786.6 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :118.75 Max. :8684.8 Max. :49.790 Max. :150.000
##
## Total.Long.Distance.Charges Total.Revenue Customer.Status
## Min. : 0.00 Min. : 21.36 Length:7043
## 1st Qu.: 70.55 1st Qu.: 605.61 Class :character
## Median : 401.44 Median : 2108.64 Mode :character
## Mean : 749.10 Mean : 3034.38
## 3rd Qu.:1191.10 3rd Qu.: 4801.15
## Max. :3564.72 Max. :11979.34
##
## Churn.Category Churn.Reason Population
## Length:7043 Length:7043 Min. : 11
## Class :character Class :character 1st Qu.: 2344
## Mode :character Mode :character Median : 17554
## Mean : 22140
## 3rd Qu.: 36125
## Max. :105285
##
Creating a table for average age and average population for each
zipcode
library(dplyr)
str(summary_stats)
## 'table' chr [1:7, 1:39] "Length:7043 " "Class :character " ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:7] "" "" "" "" ...
## ..$ : chr [1:39] "Customer.ID" " Gender" " Age" " Married" ...
summary_stats <- read.csv("~/Downloads/telecom_customer_churn.csv")
summary_stats$Age <- as.numeric(summary_stats$Age)
summary_stats$Zip.code <- as.numeric(summary_stats$Zip.code)
population <- read.csv("~/Downloads/telecom_zipcode_population.csv")
zipcode_population$Population <- as.numeric(zipcode_population$Population)
telecom_customer_churn$Age <- as.numeric(telecom_customer_churn$Age)
zipcode_population$Zip.Code <- as.numeric(as.character(zipcode_population$Zip.Code))
# converting all the necessary data types to numeric
rm(telecom_zipcode_population) # removing unnecessary data
rm(population)
rm(data_dictionary)
rm(customer_churn)
?dplyr
zipcode_population <- zipcode_population |>
rename(Zip.code = Zip.Code)
# Renamed the two variables to match
merged_data <- summary_stats |>
left_join(zipcode_population, by = "Zip.code")
# merged data again
merged_data |>
group_by(Zip.code) |>
summarise(Avg_Population = mean(Population, na.rm = TRUE),
Avg_Age = median(Age, na.rm = TRUE)
)
## # A tibble: 1,626 × 3
## Zip.code Avg_Population Avg_Age
## <dbl> <dbl> <dbl>
## 1 90001 54492 51
## 2 90002 44586 51
## 3 90003 58198 49
## 4 90004 67852 44
## 5 90005 43019 42.5
## 6 90006 62784 41
## 7 90007 45025 42
## 8 90008 30852 42
## 9 90010 1957 26
## 10 90011 101215 30
## # ℹ 1,616 more rows
# form the table