Day4 HW

Import

{remove(list=ls())}
library(readxl)

?setwd()
starting httpd help server ... done
# setwd("C:\Users\HP\Documents\ClassIn Files\ClassIn\Day4")
# getwd()

dd <- read.csv("C:/Users/HP/Downloads/telecom_data_dictionary.csv")
cc <- read.csv("C:/Users/HP/Downloads/telecom_customer_churn.csv")
zp <- read.csv("C:/Users/HP/Downloads/telecom_zipcode_population.csv")
library(dplyr)

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
zp <- zp %>%
  rename(Zip.code = Zip.Code)

Merge

A. Unique ID in each dataset

length(unique(cc$Zip.code))
[1] 1626
length(unique(zp$Zip.code))
[1] 1671

B. One to Many

C. Summary Stats on the merged data.

cc_ID <- as.character(cc$Zip.code)
zp_ID <- as.character(zp$Zip.Code) 
test <- cbind(cc_ID , zp_ID) 
# combined vector.

combined <-
  merge(x = cc,
        y = zp,
        by = c("Zip.code") 
        )
?gsub

# removing dollar sign from string 
cc$Tenure.in.Months <- gsub(pattern = "\\$", 
                        replacement = "", 
                                  x = cc$Tenure.in.Months
                            )

?as.numeric
cc$Tenure.in.Months <- as.numeric(cc$Tenure.in.Months)

# View the structure of the combined dataset to confirm changes
str(combined)
'data.frame':   7043 obs. of  39 variables:
 $ Zip.code                         : int  90001 90001 90001 90001 90002 90002 90002 90002 90003 90003 ...
 $ Customer.ID                      : chr  "7590-VHVEG" "3307-TLCUD" "0136-IFMYD" "3217-FZDMN" ...
 $ Gender                           : chr  "Female" "Male" "Male" "Female" ...
 $ Age                              : int  36 36 71 66 31 46 56 68 60 49 ...
 $ Married                          : chr  "Yes" "Yes" "Yes" "No" ...
 $ Number.of.Dependents             : int  0 0 0 0 0 0 0 0 0 3 ...
 $ City                             : chr  "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
 $ Latitude                         : num  34 34 34 34 33.9 ...
 $ Longitude                        : num  -118 -118 -118 -118 -118 ...
 $ Number.of.Referrals              : int  0 1 1 0 9 0 0 0 4 2 ...
 $ Tenure.in.Months                 : int  1 17 69 8 58 34 13 38 59 3 ...
 $ Offer                            : chr  "Offer E" "None" "None" "Offer E" ...
 $ Phone.Service                    : chr  "No" "No" "Yes" "Yes" ...
 $ Avg.Monthly.Long.Distance.Charges: num  NA NA 18.41 5.21 18.68 ...
 $ Multiple.Lines                   : chr  "" "" "Yes" "No" ...
 $ Internet.Service                 : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ Internet.Type                    : chr  "DSL" "Cable" "Fiber Optic" "Fiber Optic" ...
 $ Avg.Monthly.GB.Download          : int  10 10 17 8 NA 16 NA 13 14 22 ...
 $ Online.Security                  : chr  "No" "Yes" "No" "No" ...
 $ Online.Backup                    : chr  "Yes" "No" "Yes" "No" ...
 $ Device.Protection.Plan           : chr  "No" "Yes" "Yes" "Yes" ...
 $ Premium.Tech.Support             : chr  "No" "No" "Yes" "No" ...
 $ Streaming.TV                     : chr  "No" "No" "Yes" "Yes" ...
 $ Streaming.Movies                 : chr  "No" "No" "Yes" "Yes" ...
 $ Streaming.Music                  : chr  "No" "No" "No" "No" ...
 $ Unlimited.Data                   : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ Contract                         : chr  "Month-to-Month" "Month-to-Month" "Two Year" "Month-to-Month" ...
 $ Paperless.Billing                : chr  "Yes" "No" "Yes" "Yes" ...
 $ Payment.Method                   : chr  "Bank Withdrawal" "Mailed Check" "Bank Withdrawal" "Credit Card" ...
 $ Monthly.Charge                   : num  29.9 34.4 110 94.5 -4 ...
 $ Total.Charges                    : num  29.9 592.8 7634.2 743 1186 ...
 $ Total.Refunds                    : num  0 0 0 0 0 ...
 $ Total.Extra.Data.Charges         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Total.Long.Distance.Charges      : num  0 0 1270.3 41.7 1083.4 ...
 $ Total.Revenue                    : num  29.9 592.8 8904.5 784.6 2269.4 ...
 $ Customer.Status                  : chr  "Joined" "Stayed" "Stayed" "Churned" ...
 $ Churn.Category                   : chr  "" "" "" "Competitor" ...
 $ Churn.Reason                     : chr  "" "" "" "Competitor had better devices" ...
 $ Population                       : int  54492 54492 54492 54492 44586 44586 44586 44586 58198 58198 ...
library("psych")
describe(cc$Tenure.in.Months)
   vars    n  mean    sd median trimmed   mad min max range skew kurtosis   se
X1    1 7043 32.39 24.54     29   31.44 32.62   1  72    71 0.24    -1.39 0.29

Avg age and Avg population

combined |> 
group_by(Zip.code) |>
  summarise(Avg_Population = mean(Population, na.rm = TRUE),
            Avg_Age = median(Age, na.rm = TRUE)
            )
# A tibble: 1,626 × 3
   Zip.code Avg_Population Avg_Age
      <int>          <dbl>   <dbl>
 1    90001          54492    51  
 2    90002          44586    51  
 3    90003          58198    49  
 4    90004          67852    44  
 5    90005          43019    42.5
 6    90006          62784    41  
 7    90007          45025    42  
 8    90008          30852    42  
 9    90010           1957    26  
10    90011         101215    30  
# ℹ 1,616 more rows
library("openxlsx")

# setwd("C:\Users\HP\Documents\ClassIn Files\ClassIn\Day4")

write.xlsx(combined,
           file = "final_telecom_merged_data.xlsx"
          )