library(latexpdf)

library(latexpdf)
## Warning: package 'latexpdf' was built under R version 3.4.2

library(readr)

library(readr)

Read in train.csv and train.demographics.csv files

convert to dataframes

custchurn <- read.csv(“Train.csv”) df.churn <- custchurn custdemo <- read.csv(“Train_Demographics.csv”) df.demo <- custdemo

custchurn <- read.csv("Train (3).csv")
df.churn <- custchurn
custdemo <- read.csv("Train_Demographics.csv")
df.demo <- custdemo

Merging both training datasets

library(sqldf) #Manipulate R dataframes using SQL

library(sqldf)  #Manipulate R dataframes using SQL
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.4
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.4

Using merge() function

M1 <- merge.data.frame(df.churn,df.demo,by.x=“CustomerID”,by.y=“HouseholdID”)

M1 <- merge.data.frame(df.churn,df.demo,by.x="CustomerID",by.y="HouseholdID")

Using sqldf library

require(sqldf) M2 <- sqldf(“SELECT * FROM custchurn JOIN custdemo ON custchurn.CustomerID = custdemo.HouseholdID ORDER BY CustomerID”)

require(sqldf)
M2 <- sqldf("SELECT * FROM custchurn JOIN custdemo ON custchurn.CustomerID = custdemo.HouseholdID ORDER BY CustomerID")

Keep both columns

M2 <- M2[,-3]

M2 <- M2[,-3]

Ensuring dataframes are identical

identical(M1,M2) #boolean value of TRUE

identical(M1,M2)  #boolean value of TRUE
## [1] TRUE

Structure of Data

str(M2) colnames(M2) head(M2)

str(M2)
## 'data.frame':    5298 obs. of  9 variables:
##  $ CustomerID   : Factor w/ 5298 levels "Cust1201","Cust1202",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Churn        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ Country      : Factor w/ 2 levels "?","India": 2 2 2 2 2 2 2 2 2 2 ...
##  $ State        : Factor w/ 2 levels "?","Maharashtra": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Retired      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HasPartner   : int  2 2 2 2 2 2 2 2 2 1 ...
##  $ HasDependents: int  2 2 2 1 2 2 2 2 2 1 ...
##  $ Education    : Factor w/ 6 levels "","Graduation",..: 4 6 6 6 6 3 2 6 3 6 ...
##  $ Gender       : Factor w/ 3 levels "","Female","Male": 3 3 3 3 2 3 3 2 2 2 ...
colnames(M2)
## [1] "CustomerID"    "Churn"         "Country"       "State"        
## [5] "Retired"       "HasPartner"    "HasDependents" "Education"    
## [9] "Gender"
head(M2)
##   CustomerID Churn Country       State Retired HasPartner HasDependents
## 1   Cust1201    No   India Maharashtra       0          2             2
## 2   Cust1202    No   India Maharashtra       0          2             2
## 3   Cust1203    No   India Maharashtra       0          2             2
## 4   Cust1204    No   India Maharashtra       0          2             1
## 5   Cust1205   Yes   India Maharashtra       0          2             2
## 6   Cust1206    No   India Maharashtra       0          2             2
##                  Education Gender
## 1                  Masters   Male
## 2 ProfessionalQalification   Male
## 3 ProfessionalQalification   Male
## 4 ProfessionalQalification   Male
## 5 ProfessionalQalification Female
## 6      Highschool or below   Male