telecom_zipcode_population <- read.csv("C:/Users/Adi/Downloads/telecom_zipcode_population.csv")

telecom_customer_churn <- read.csv("C:/Users/Adi/Downloads/telecom_customer_churn.csv")
customer_loss <- telecom_customer_churn

zipcode_popularity <- telecom_zipcode_population
length(unique(customer_loss$Customer.ID))
## [1] 7043
length(unique(zipcode_popularity$Zip.Code))
## [1] 1671

The Unique Variables

For Customer_loss, the unique variable is customer id and the unique variable for zip code popularity is Zip code. However since since customer id isn’t present in both, we find out that

merged_telecom <- merge(x = customer_loss, y = zipcode_popularity, by.x = "Zip.code", by.y = "Zip.Code")

This would be a one to many merge

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(merged_telecom, type = "text", title = "Telecom Summary Statistics")
## 
## Telecom Summary Statistics
## =================================================================================
## Statistic                           N      Mean     St. Dev.    Min       Max    
## ---------------------------------------------------------------------------------
## Zip.code                          7,043 93,486.070 1,856.768   90,001    96,150  
## Age                               7,043   46.510     16.750      19        80    
## Number.of.Dependents              7,043   0.469      0.963       0         9     
## Latitude                          7,043   36.197     2.469     32.556    41.962  
## Longitude                         7,043  -119.757    2.154    -124.301  -114.193 
## Number.of.Referrals               7,043   1.952      3.001       0         11    
## Tenure.in.Months                  7,043   32.387     24.542      1         72    
## Avg.Monthly.Long.Distance.Charges 6,361   25.421     14.200    1.010     49.990  
## Avg.Monthly.GB.Download           5,517   26.190     19.587      2         85    
## Monthly.Charge                    7,043   63.596     31.205   -10.000   118.750  
## Total.Charges                     7,043 2,280.381  2,266.220   18.800  8,684.800 
## Total.Refunds                     7,043   1.962      7.903     0.000     49.790  
## Total.Extra.Data.Charges          7,043   6.861      25.105      0        150    
## Total.Long.Distance.Charges       7,043  749.099    846.660    0.000   3,564.720 
## Total.Revenue                     7,043 3,034.379  2,865.205   21.360  11,979.340
## Population                        7,043 22,139.600 21,152.390    11     105,285  
## ---------------------------------------------------------------------------------
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
merged_telecom$calculated_age <- 
dplyr::mutate(merged_telecom, age = mean(Age))
custom_table <- merged_telecom |> group_by(Zip.code) |> 
  dplyr::mutate(average_age = mean(Age))

custom_table <-
dplyr::select(custom_table, Zip.code, average_age, Population)


custom_table <- 
dplyr::rename(custom_table, average_population = Population)

conflicted::conflict_prefer(name = "distinct", winner = "dplyr")
## [conflicted] Will prefer dplyr::distinct over any other package.
custom_table <- distinct(custom_table)

tail(custom_table)
## # A tibble: 6 × 3
## # Groups:   Zip.code [6]
##   Zip.code average_age average_population
##      <int>       <dbl>              <int>
## 1    96142        40.3               1291
## 2    96143        49                 4806
## 3    96145        53.3               4002
## 4    96146        44                  942
## 5    96148        49                  678
## 6    96150        50                33038