rm(list=ls())
telecom_zipcode_population <- read.csv("C:/Users/aarav/Downloads/telecom_zipcode_population.csv")
telecom_customer_churn <- read.csv("C:/Users/aarav/Downloads/telecom_customer_churn.csv")
telecom_data_dictionary <- read.csv("C:/Users/aarav/Downloads/telecom_data_dictionary.csv", comment.char="#")Day_4_HW
Import Data
- 3 Datasets
Dataset Information
telecom_zipcode_population:
names(telecom_zipcode_population)[1] "Zip.Code" "Population"
length(unique(telecom_zipcode_population$Zip.Code))[1] 1671
telecom_customer_churn:
names(telecom_customer_churn) [1] "Customer.ID" "Gender"
[3] "Age" "Married"
[5] "Number.of.Dependents" "City"
[7] "Zip.code" "Latitude"
[9] "Longitude" "Number.of.Referrals"
[11] "Tenure.in.Months" "Offer"
[13] "Phone.Service" "Avg.Monthly.Long.Distance.Charges"
[15] "Multiple.Lines" "Internet.Service"
[17] "Internet.Type" "Avg.Monthly.GB.Download"
[19] "Online.Security" "Online.Backup"
[21] "Device.Protection.Plan" "Premium.Tech.Support"
[23] "Streaming.TV" "Streaming.Movies"
[25] "Streaming.Music" "Unlimited.Data"
[27] "Contract" "Paperless.Billing"
[29] "Payment.Method" "Monthly.Charge"
[31] "Total.Charges" "Total.Refunds"
[33] "Total.Extra.Data.Charges" "Total.Long.Distance.Charges"
[35] "Total.Revenue" "Customer.Status"
[37] "Churn.Category" "Churn.Reason"
length(unique(telecom_customer_churn$Customer.ID))[1] 7043
length(unique(telecom_customer_churn$Zip.code))[1] 1626
telecom_data_dictionary:
names(telecom_data_dictionary)[1] "Table" "Field" "Description"
Unique and Common Variables:
telecom_customer_churnis the main dataset, because it contains the most variables and objects.Customer.IDis the unique variable in this datasettelecom_zipcode_populationhasZip.codeas the its unique variable, and shares variableZip.codewithtelecom_customer_churnIf
telecom_zipcode_populationwas to be merged withtelecom_customer_churn, it would be a one to many merge, becausetelecom_zipcode_populationhas 1671 unique objects for variableZip.code, andtelecom_customer_churnhas 1626 unique objects for variableZip.code.
Merging the Datasets
?mergestarting httpd help server ... done
telecom_zipcode__customer <-
merge(x = telecom_customer_churn,
y = telecom_zipcode_population,
by.x = "Zip.code",
by.y = "Zip.Code"
)Summary Statistics
library(psych)
describe(telecom_zipcode__customer) vars n mean sd median trimmed
Zip.code 1 7043 93486.07 1856.77 93518.00 93571.63
Customer.ID* 2 7043 3522.00 2033.28 3522.00 3522.00
Gender* 3 7043 1.50 0.50 2.00 1.51
Age 4 7043 46.51 16.75 46.00 46.05
Married* 5 7043 1.48 0.50 1.00 1.48
Number.of.Dependents 6 7043 0.47 0.96 0.00 0.23
City* 7 7043 589.04 307.80 605.00 597.55
Latitude 8 7043 36.20 2.47 36.21 36.06
Longitude 9 7043 -119.76 2.15 -119.60 -119.73
Number.of.Referrals 10 7043 1.95 3.00 0.00 1.32
Tenure.in.Months 11 7043 32.39 24.54 29.00 31.44
Offer* 12 7043 2.40 1.83 1.00 2.12
Phone.Service* 13 7043 1.90 0.30 2.00 2.00
Avg.Monthly.Long.Distance.Charges 14 6361 25.42 14.20 25.69 25.41
Multiple.Lines* 15 7043 2.33 0.64 2.00 2.40
Internet.Service* 16 7043 1.78 0.41 2.00 1.85
Internet.Type* 17 7043 2.88 1.18 3.00 2.97
Avg.Monthly.GB.Download 18 5517 26.19 19.59 21.00 23.57
Online.Security* 19 7043 2.07 0.71 2.00 2.09
Online.Backup* 20 7043 2.13 0.74 2.00 2.16
Device.Protection.Plan* 21 7043 2.13 0.74 2.00 2.16
Premium.Tech.Support* 22 7043 2.07 0.71 2.00 2.09
Streaming.TV* 23 7043 2.17 0.76 2.00 2.21
Streaming.Movies* 24 7043 2.17 0.76 2.00 2.21
Streaming.Music* 25 7043 2.14 0.74 2.00 2.17
Unlimited.Data* 26 7043 2.46 0.83 3.00 2.57
Contract* 27 7043 1.75 0.85 1.00 1.69
Paperless.Billing* 28 7043 1.59 0.49 2.00 1.62
Payment.Method* 29 7043 1.50 0.60 1.00 1.43
Monthly.Charge 30 7043 63.60 31.20 70.05 64.06
Total.Charges 31 7043 2280.38 2266.22 1394.55 1966.73
Total.Refunds 32 7043 1.96 7.90 0.00 0.00
Total.Extra.Data.Charges 33 7043 6.86 25.10 0.00 0.04
Total.Long.Distance.Charges 34 7043 749.10 846.66 401.44 605.63
Total.Revenue 35 7043 3034.38 2865.20 2108.64 2660.75
Customer.Status* 36 7043 2.40 0.88 3.00 2.51
Churn.Category* 37 7043 1.67 1.28 1.00 1.36
Churn.Reason* 38 7043 2.96 4.42 1.00 1.71
Population 39 7043 22139.60 21152.39 17554.00 19503.96
mad min max range skew
Zip.code 2379.57 90001.00 96150.00 6149.00 -0.21
Customer.ID* 2610.86 1.00 7043.00 7042.00 0.00
Gender* 0.00 1.00 2.00 1.00 -0.02
Age 20.76 19.00 80.00 61.00 0.16
Married* 0.00 1.00 2.00 1.00 0.07
Number.of.Dependents 0.00 0.00 9.00 9.00 2.11
City* 375.10 1.00 1106.00 1105.00 -0.21
Latitude 3.22 32.56 41.96 9.41 0.31
Longitude 2.74 -124.30 -114.19 10.11 -0.09
Number.of.Referrals 0.00 0.00 11.00 11.00 1.45
Tenure.in.Months 32.62 1.00 72.00 71.00 0.24
Offer* 0.00 1.00 6.00 5.00 0.92
Phone.Service* 0.00 1.00 2.00 1.00 -2.73
Avg.Monthly.Long.Distance.Charges 18.22 1.01 49.99 48.98 0.00
Multiple.Lines* 1.48 1.00 3.00 2.00 -0.42
Internet.Service* 0.00 1.00 2.00 1.00 -1.38
Internet.Type* 1.48 1.00 4.00 3.00 -0.55
Avg.Monthly.GB.Download 13.34 2.00 85.00 83.00 1.18
Online.Security* 1.48 1.00 3.00 2.00 -0.10
Online.Backup* 1.48 1.00 3.00 2.00 -0.21
Device.Protection.Plan* 1.48 1.00 3.00 2.00 -0.21
Premium.Tech.Support* 1.48 1.00 3.00 2.00 -0.11
Streaming.TV* 1.48 1.00 3.00 2.00 -0.29
Streaming.Movies* 1.48 1.00 3.00 2.00 -0.30
Streaming.Music* 1.48 1.00 3.00 2.00 -0.22
Unlimited.Data* 0.00 1.00 3.00 2.00 -1.02
Contract* 0.00 1.00 3.00 2.00 0.49
Paperless.Billing* 0.00 1.00 2.00 1.00 -0.38
Payment.Method* 0.00 1.00 3.00 2.00 0.76
Monthly.Charge 36.32 -10.00 118.75 128.75 -0.28
Total.Charges 1808.40 18.80 8684.80 8666.00 0.96
Total.Refunds 0.00 0.00 49.79 49.79 4.33
Total.Extra.Data.Charges 0.00 0.00 150.00 150.00 4.09
Total.Long.Distance.Charges 566.53 0.00 3564.72 3564.72 1.24
Total.Revenue 2620.66 21.36 11979.34 11957.98 0.92
Customer.Status* 0.00 1.00 3.00 2.00 -0.88
Churn.Category* 0.00 1.00 6.00 5.00 1.91
Churn.Reason* 0.00 1.00 21.00 20.00 2.65
Population 23471.04 11.00 105285.00 105274.00 0.91
kurtosis se
Zip.code -1.17 22.12
Customer.ID* -1.20 24.23
Gender* -2.00 0.01
Age -1.00 0.20
Married* -2.00 0.01
Number.of.Dependents 4.44 0.01
City* -1.13 3.67
Latitude -1.16 0.03
Longitude -1.19 0.03
Number.of.Referrals 0.72 0.04
Tenure.in.Months -1.39 0.29
Offer* -0.70 0.02
Phone.Service* 5.43 0.00
Avg.Monthly.Long.Distance.Charges -1.21 0.18
Multiple.Lines* -0.71 0.01
Internet.Service* -0.11 0.00
Internet.Type* -1.24 0.01
Avg.Monthly.GB.Download 0.63 0.26
Online.Security* -0.99 0.01
Online.Backup* -1.15 0.01
Device.Protection.Plan* -1.15 0.01
Premium.Tech.Support* -1.01 0.01
Streaming.TV* -1.21 0.01
Streaming.Movies* -1.21 0.01
Streaming.Music* -1.16 0.01
Unlimited.Data* -0.76 0.01
Contract* -1.44 0.01
Paperless.Billing* -1.86 0.01
Payment.Method* -0.40 0.01
Monthly.Charge -1.13 0.37
Total.Charges -0.23 27.00
Total.Refunds 18.33 0.09
Total.Extra.Data.Charges 16.44 0.30
Total.Long.Distance.Charges 0.64 10.09
Total.Revenue -0.20 34.14
Customer.Status* -1.12 0.01
Churn.Category* 2.76 0.02
Churn.Reason* 6.36 0.05
Population 0.32 252.05
3 Variable Table:
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
?group_by
df <- telecom_zipcode__customer |>
group_by(Zip.code) |>
summarise(Avg_Pop = mean(Population),
Avg_Age = mean(Age))
df# A tibble: 1,626 × 3
Zip.code Avg_Pop Avg_Age
<int> <dbl> <dbl>
1 90001 54492 52.2
2 90002 44586 50.2
3 90003 58198 46.2
4 90004 67852 39.4
5 90005 43019 42.2
6 90006 62784 37
7 90007 45025 43.8
8 90008 30852 45.6
9 90010 1957 30
10 90011 101215 38.4
# ℹ 1,616 more rows