telecom_zipcode_population <- read.csv("C:/Users/Adi/Downloads/telecom_zipcode_population.csv")
telecom_customer_churn <- read.csv("C:/Users/Adi/Downloads/telecom_customer_churn.csv")
customer_loss <- telecom_customer_churn
zipcode_popularity <- telecom_zipcode_population
length(unique(customer_loss$Customer.ID))
## [1] 7043
length(unique(zipcode_popularity$Zip.Code))
## [1] 1671
For Customer_loss, the unique variable is customer id and the unique variable for zip code popularity is Zip code. However since since customer id isn’t present in both, we find out that
merged_telecom <- merge(x = customer_loss, y = zipcode_popularity, by.x = "Zip.code", by.y = "Zip.Code")
This would be a one to many merge
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(merged_telecom, type = "text", title = "Telecom Summary Statistics")
##
## Telecom Summary Statistics
## =================================================================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------------------------------------------
## Zip.code 7,043 93,486.070 1,856.768 90,001 96,150
## Age 7,043 46.510 16.750 19 80
## Number.of.Dependents 7,043 0.469 0.963 0 9
## Latitude 7,043 36.197 2.469 32.556 41.962
## Longitude 7,043 -119.757 2.154 -124.301 -114.193
## Number.of.Referrals 7,043 1.952 3.001 0 11
## Tenure.in.Months 7,043 32.387 24.542 1 72
## Avg.Monthly.Long.Distance.Charges 6,361 25.421 14.200 1.010 49.990
## Avg.Monthly.GB.Download 5,517 26.190 19.587 2 85
## Monthly.Charge 7,043 63.596 31.205 -10.000 118.750
## Total.Charges 7,043 2,280.381 2,266.220 18.800 8,684.800
## Total.Refunds 7,043 1.962 7.903 0.000 49.790
## Total.Extra.Data.Charges 7,043 6.861 25.105 0 150
## Total.Long.Distance.Charges 7,043 749.099 846.660 0.000 3,564.720
## Total.Revenue 7,043 3,034.379 2,865.205 21.360 11,979.340
## Population 7,043 22,139.600 21,152.390 11 105,285
## ---------------------------------------------------------------------------------
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
merged_telecom$calculated_age <-
dplyr::mutate(merged_telecom, age = mean(Age))
custom_table <- merged_telecom |> group_by(Zip.code) |>
dplyr::mutate(average_age = mean(Age))
custom_table <-
dplyr::select(custom_table, Zip.code, average_age, Population)
custom_table <-
dplyr::rename(custom_table, average_population = Population)
conflicted::conflict_prefer(name = "distinct", winner = "dplyr")
## [conflicted] Will prefer dplyr::distinct over any other package.
custom_table <- distinct(custom_table)
tail(custom_table)
## # A tibble: 6 × 3
## # Groups: Zip.code [6]
## Zip.code average_age average_population
## <int> <dbl> <int>
## 1 96142 40.3 1291
## 2 96143 49 4806
## 3 96145 53.3 4002
## 4 96146 44 942
## 5 96148 49 678
## 6 96150 50 33038