Day_4_HW

Import Data

  • 3 Datasets
rm(list=ls())

telecom_zipcode_population <- read.csv("C:/Users/aarav/Downloads/telecom_zipcode_population.csv")

telecom_customer_churn <- read.csv("C:/Users/aarav/Downloads/telecom_customer_churn.csv")

telecom_data_dictionary <- read.csv("C:/Users/aarav/Downloads/telecom_data_dictionary.csv", comment.char="#")

Dataset Information

telecom_zipcode_population:

names(telecom_zipcode_population)
[1] "Zip.Code"   "Population"
length(unique(telecom_zipcode_population$Zip.Code))
[1] 1671

telecom_customer_churn:

names(telecom_customer_churn)
 [1] "Customer.ID"                       "Gender"                           
 [3] "Age"                               "Married"                          
 [5] "Number.of.Dependents"              "City"                             
 [7] "Zip.code"                          "Latitude"                         
 [9] "Longitude"                         "Number.of.Referrals"              
[11] "Tenure.in.Months"                  "Offer"                            
[13] "Phone.Service"                     "Avg.Monthly.Long.Distance.Charges"
[15] "Multiple.Lines"                    "Internet.Service"                 
[17] "Internet.Type"                     "Avg.Monthly.GB.Download"          
[19] "Online.Security"                   "Online.Backup"                    
[21] "Device.Protection.Plan"            "Premium.Tech.Support"             
[23] "Streaming.TV"                      "Streaming.Movies"                 
[25] "Streaming.Music"                   "Unlimited.Data"                   
[27] "Contract"                          "Paperless.Billing"                
[29] "Payment.Method"                    "Monthly.Charge"                   
[31] "Total.Charges"                     "Total.Refunds"                    
[33] "Total.Extra.Data.Charges"          "Total.Long.Distance.Charges"      
[35] "Total.Revenue"                     "Customer.Status"                  
[37] "Churn.Category"                    "Churn.Reason"                     
length(unique(telecom_customer_churn$Customer.ID))
[1] 7043
length(unique(telecom_customer_churn$Zip.code))
[1] 1626

telecom_data_dictionary:

names(telecom_data_dictionary)
[1] "Table"       "Field"       "Description"

Unique and Common Variables:

  • telecom_customer_churn is the main dataset, because it contains the most variables and objects. Customer.ID is the unique variable in this dataset

  • telecom_zipcode_population has Zip.code as the its unique variable, and shares variable Zip.code with telecom_customer_churn

  • If telecom_zipcode_population was to be merged with telecom_customer_churn, it would be a one to many merge, because telecom_zipcode_population has 1671 unique objects for variable Zip.code, and telecom_customer_churn has 1626 unique objects for variable Zip.code.

Merging the Datasets

?merge
starting httpd help server ... done
telecom_zipcode__customer <-
  merge(x = telecom_customer_churn, 
        y = telecom_zipcode_population,
        by.x = "Zip.code",
        by.y = "Zip.Code"
        )

Summary Statistics

library(psych)
describe(telecom_zipcode__customer)
                                  vars    n     mean       sd   median  trimmed
Zip.code                             1 7043 93486.07  1856.77 93518.00 93571.63
Customer.ID*                         2 7043  3522.00  2033.28  3522.00  3522.00
Gender*                              3 7043     1.50     0.50     2.00     1.51
Age                                  4 7043    46.51    16.75    46.00    46.05
Married*                             5 7043     1.48     0.50     1.00     1.48
Number.of.Dependents                 6 7043     0.47     0.96     0.00     0.23
City*                                7 7043   589.04   307.80   605.00   597.55
Latitude                             8 7043    36.20     2.47    36.21    36.06
Longitude                            9 7043  -119.76     2.15  -119.60  -119.73
Number.of.Referrals                 10 7043     1.95     3.00     0.00     1.32
Tenure.in.Months                    11 7043    32.39    24.54    29.00    31.44
Offer*                              12 7043     2.40     1.83     1.00     2.12
Phone.Service*                      13 7043     1.90     0.30     2.00     2.00
Avg.Monthly.Long.Distance.Charges   14 6361    25.42    14.20    25.69    25.41
Multiple.Lines*                     15 7043     2.33     0.64     2.00     2.40
Internet.Service*                   16 7043     1.78     0.41     2.00     1.85
Internet.Type*                      17 7043     2.88     1.18     3.00     2.97
Avg.Monthly.GB.Download             18 5517    26.19    19.59    21.00    23.57
Online.Security*                    19 7043     2.07     0.71     2.00     2.09
Online.Backup*                      20 7043     2.13     0.74     2.00     2.16
Device.Protection.Plan*             21 7043     2.13     0.74     2.00     2.16
Premium.Tech.Support*               22 7043     2.07     0.71     2.00     2.09
Streaming.TV*                       23 7043     2.17     0.76     2.00     2.21
Streaming.Movies*                   24 7043     2.17     0.76     2.00     2.21
Streaming.Music*                    25 7043     2.14     0.74     2.00     2.17
Unlimited.Data*                     26 7043     2.46     0.83     3.00     2.57
Contract*                           27 7043     1.75     0.85     1.00     1.69
Paperless.Billing*                  28 7043     1.59     0.49     2.00     1.62
Payment.Method*                     29 7043     1.50     0.60     1.00     1.43
Monthly.Charge                      30 7043    63.60    31.20    70.05    64.06
Total.Charges                       31 7043  2280.38  2266.22  1394.55  1966.73
Total.Refunds                       32 7043     1.96     7.90     0.00     0.00
Total.Extra.Data.Charges            33 7043     6.86    25.10     0.00     0.04
Total.Long.Distance.Charges         34 7043   749.10   846.66   401.44   605.63
Total.Revenue                       35 7043  3034.38  2865.20  2108.64  2660.75
Customer.Status*                    36 7043     2.40     0.88     3.00     2.51
Churn.Category*                     37 7043     1.67     1.28     1.00     1.36
Churn.Reason*                       38 7043     2.96     4.42     1.00     1.71
Population                          39 7043 22139.60 21152.39 17554.00 19503.96
                                       mad      min       max     range  skew
Zip.code                           2379.57 90001.00  96150.00   6149.00 -0.21
Customer.ID*                       2610.86     1.00   7043.00   7042.00  0.00
Gender*                               0.00     1.00      2.00      1.00 -0.02
Age                                  20.76    19.00     80.00     61.00  0.16
Married*                              0.00     1.00      2.00      1.00  0.07
Number.of.Dependents                  0.00     0.00      9.00      9.00  2.11
City*                               375.10     1.00   1106.00   1105.00 -0.21
Latitude                              3.22    32.56     41.96      9.41  0.31
Longitude                             2.74  -124.30   -114.19     10.11 -0.09
Number.of.Referrals                   0.00     0.00     11.00     11.00  1.45
Tenure.in.Months                     32.62     1.00     72.00     71.00  0.24
Offer*                                0.00     1.00      6.00      5.00  0.92
Phone.Service*                        0.00     1.00      2.00      1.00 -2.73
Avg.Monthly.Long.Distance.Charges    18.22     1.01     49.99     48.98  0.00
Multiple.Lines*                       1.48     1.00      3.00      2.00 -0.42
Internet.Service*                     0.00     1.00      2.00      1.00 -1.38
Internet.Type*                        1.48     1.00      4.00      3.00 -0.55
Avg.Monthly.GB.Download              13.34     2.00     85.00     83.00  1.18
Online.Security*                      1.48     1.00      3.00      2.00 -0.10
Online.Backup*                        1.48     1.00      3.00      2.00 -0.21
Device.Protection.Plan*               1.48     1.00      3.00      2.00 -0.21
Premium.Tech.Support*                 1.48     1.00      3.00      2.00 -0.11
Streaming.TV*                         1.48     1.00      3.00      2.00 -0.29
Streaming.Movies*                     1.48     1.00      3.00      2.00 -0.30
Streaming.Music*                      1.48     1.00      3.00      2.00 -0.22
Unlimited.Data*                       0.00     1.00      3.00      2.00 -1.02
Contract*                             0.00     1.00      3.00      2.00  0.49
Paperless.Billing*                    0.00     1.00      2.00      1.00 -0.38
Payment.Method*                       0.00     1.00      3.00      2.00  0.76
Monthly.Charge                       36.32   -10.00    118.75    128.75 -0.28
Total.Charges                      1808.40    18.80   8684.80   8666.00  0.96
Total.Refunds                         0.00     0.00     49.79     49.79  4.33
Total.Extra.Data.Charges              0.00     0.00    150.00    150.00  4.09
Total.Long.Distance.Charges         566.53     0.00   3564.72   3564.72  1.24
Total.Revenue                      2620.66    21.36  11979.34  11957.98  0.92
Customer.Status*                      0.00     1.00      3.00      2.00 -0.88
Churn.Category*                       0.00     1.00      6.00      5.00  1.91
Churn.Reason*                         0.00     1.00     21.00     20.00  2.65
Population                        23471.04    11.00 105285.00 105274.00  0.91
                                  kurtosis     se
Zip.code                             -1.17  22.12
Customer.ID*                         -1.20  24.23
Gender*                              -2.00   0.01
Age                                  -1.00   0.20
Married*                             -2.00   0.01
Number.of.Dependents                  4.44   0.01
City*                                -1.13   3.67
Latitude                             -1.16   0.03
Longitude                            -1.19   0.03
Number.of.Referrals                   0.72   0.04
Tenure.in.Months                     -1.39   0.29
Offer*                               -0.70   0.02
Phone.Service*                        5.43   0.00
Avg.Monthly.Long.Distance.Charges    -1.21   0.18
Multiple.Lines*                      -0.71   0.01
Internet.Service*                    -0.11   0.00
Internet.Type*                       -1.24   0.01
Avg.Monthly.GB.Download               0.63   0.26
Online.Security*                     -0.99   0.01
Online.Backup*                       -1.15   0.01
Device.Protection.Plan*              -1.15   0.01
Premium.Tech.Support*                -1.01   0.01
Streaming.TV*                        -1.21   0.01
Streaming.Movies*                    -1.21   0.01
Streaming.Music*                     -1.16   0.01
Unlimited.Data*                      -0.76   0.01
Contract*                            -1.44   0.01
Paperless.Billing*                   -1.86   0.01
Payment.Method*                      -0.40   0.01
Monthly.Charge                       -1.13   0.37
Total.Charges                        -0.23  27.00
Total.Refunds                        18.33   0.09
Total.Extra.Data.Charges             16.44   0.30
Total.Long.Distance.Charges           0.64  10.09
Total.Revenue                        -0.20  34.14
Customer.Status*                     -1.12   0.01
Churn.Category*                       2.76   0.02
Churn.Reason*                         6.36   0.05
Population                            0.32 252.05

3 Variable Table:

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
?group_by

df <-  telecom_zipcode__customer |>
  group_by(Zip.code) |>
  summarise(Avg_Pop = mean(Population),
            Avg_Age = mean(Age))

df
# A tibble: 1,626 × 3
   Zip.code Avg_Pop Avg_Age
      <int>   <dbl>   <dbl>
 1    90001   54492    52.2
 2    90002   44586    50.2
 3    90003   58198    46.2
 4    90004   67852    39.4
 5    90005   43019    42.2
 6    90006   62784    37  
 7    90007   45025    43.8
 8    90008   30852    45.6
 9    90010    1957    30  
10    90011  101215    38.4
# ℹ 1,616 more rows