travel_churn

# Load readr for read_csv().
library(readr)
# Read the customer travel data set from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the file exists at exactly this location.
Customertravel <- read_csv("C:/Users/USER/Desktop/Customertravel.csv")

## Rows: 954 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): FrequentFlyer, AnnualIncomeClass, AccountSyncedToSocialMedia, Booke...
## dbl (3): Age, ServicesOpted, Target
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Keep one copy of each fully identical row, then report rows x columns.
# NOTE(review): the column specification lists no customer identifier, so
# exact duplicate rows may be different customers -- confirm that dropping
# them is intended.
customer <- distinct(Customertravel)
dim(customer)

## [1] 447   7

# Give columns 2-7 short names in a single positional assignment (column 1
# keeps its name), then title-case every name so they share one convention.
names(customer)[2:7] <- c(
  "Flies", "income", "Service", "Social_media", "Booked_hotel", "churn"
)
names(customer) <- str_to_title(names(customer))
names(customer)

## [1] "Age"          "Flies"        "Income"       "Service"      "Social_media"
## [6] "Booked_hotel" "Churn"

CHANGE TO CATEGORICAL

# Convert every column except Age to a factor so it is treated as categorical,
# then inspect the resulting structure.
customer <- customer %>%
  mutate(
    across(
      c(Churn, Booked_hotel, Social_media, Income, Flies, Service),
      as.factor
    )
  )
str(customer)

## tibble [447 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Age         : num [1:447] 34 34 37 30 30 27 34 34 30 36 ...
##  $ Flies       : Factor w/ 3 levels "No","No Record",..: 1 3 1 1 1 3 1 1 1 3 ...
##  $ Income      : Factor w/ 3 levels "High Income",..: 3 2 3 3 2 1 3 2 2 1 ...
##  $ Service     : Factor w/ 6 levels "1","2","3","4",..: 6 5 3 2 1 1 4 2 3 1 ...
##  $ Social_media: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 1 1 ...
##  $ Booked_hotel: Factor w/ 2 levels "No","Yes": 2 1 1 1 1 2 2 1 2 1 ...
##  $ Churn       : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...

# Per-column summary: quartiles for the numeric Age, level counts for factors.
summary(customer)

##       Age              Flies               Income    Service Social_media
##  Min.   :27.00   No       :250   High Income  : 69   1:150   No :261     
##  1st Qu.:29.00   No Record: 53   Low Income   :205   2: 71   Yes:186     
##  Median :31.00   Yes      :144   Middle Income:173   3: 63               
##  Mean   :32.23                                       4: 80               
##  3rd Qu.:35.00                                       5: 55               
##  Max.   :38.00                                       6: 28               
##  Booked_hotel Churn  
##  No :258      0:325  
##  Yes:189      1:122  
##                      
##                      
##                      
##

# Standard deviation of the logical indicator `Flies == "Yes"`, computed as
# the square root of its variance.
sqrt(var(customer$Flies == "Yes"))

## [1] 0.4678228

# The same quantity obtained directly from sd().
sd(customer$Flies == "Yes")

## [1] 0.4678228

# Proportion of rows whose Flies value is "Yes" (mean of a logical vector).
mean(customer$Flies == "Yes")

## [1] 0.3221477

Note that the square root of the variance equals the standard deviation: both calculations above give 0.4678228.

model1

# Model 1: logistic regression of Churn on Income alone.
model1 <- glm(
  Churn ~ Income,
  family = "binomial",
  data = customer
)
summary(model1)

## 
## Call:
## glm(formula = Churn ~ Income, family = "binomial", data = customer)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.3216     0.2439   1.319 0.187318    
## IncomeLow Income     -1.1115     0.2867  -3.877 0.000106 ***
## IncomeMiddle Income  -2.4746     0.3485  -7.100 1.25e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 524.02  on 446  degrees of freedom
## Residual deviance: 463.96  on 444  degrees of freedom
## AIC: 469.96
## 
## Number of Fisher Scoring iterations: 4

Relative to the high-income reference group, low-income customers have log odds of churning that are 1.1115 lower, and middle-income customers have log odds that are 2.4746 lower. Both income coefficients are statistically significant, with p-values below 0.05.

model2

# Model 2: adds Booked_hotel to the Income-only model.
model2 <- glm(
  Churn ~ Income + Booked_hotel,
  family = "binomial",
  data = customer
)
summary(model2)

## 
## Call:
## glm(formula = Churn ~ Income + Booked_hotel, family = "binomial", 
##     data = customer)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.4532     0.2533   1.789 0.073555 .  
## IncomeLow Income     -1.0569     0.2891  -3.655 0.000257 ***
## IncomeMiddle Income  -2.3494     0.3531  -6.654 2.85e-11 ***
## Booked_hotelYes      -0.5221     0.2444  -2.137 0.032630 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 524.02  on 446  degrees of freedom
## Residual deviance: 459.31  on 443  degrees of freedom
## AIC: 467.31
## 
## Number of Fisher Scoring iterations: 4

Holding income class constant, customers who booked a hotel have log odds of churning that are 0.5221 lower than those who did not. In odds terms, exp(-0.5221) ≈ 0.59, so their odds of churning are about 41 percent lower.

model3

# Model 3: adds Flies to the Income + Booked_hotel model.
model3 <- glm(
  Churn ~ Income + Booked_hotel + Flies,
  family = "binomial",
  data = customer
)
summary(model3)

## 
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies, family = "binomial", 
##     data = customer)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -0.65692    0.40945  -1.604 0.108625    
## IncomeLow Income    -0.40179    0.33935  -1.184 0.236415    
## IncomeMiddle Income -1.23658    0.47419  -2.608 0.009113 ** 
## Booked_hotelYes     -0.52032    0.24895  -2.090 0.036611 *  
## FliesNo Record      -0.01837    0.42987  -0.043 0.965917    
## FliesYes             1.10966    0.32214   3.445 0.000572 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 524.02  on 446  degrees of freedom
## Residual deviance: 446.61  on 441  degrees of freedom
## AIC: 458.61
## 
## Number of Fisher Scoring iterations: 4

model4

# Model 4: adds Service (a factor, so one coefficient per non-reference level).
model4 <- glm(
  Churn ~ Income + Booked_hotel + Flies + Service,
  family = "binomial",
  data = customer
)
summary(model4)

## 
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies + Service, 
##     family = "binomial", data = customer)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.83128    0.50498  -3.626 0.000287 ***
## IncomeLow Income    -0.40676    0.37457  -1.086 0.277504    
## IncomeMiddle Income -1.47648    0.52059  -2.836 0.004566 ** 
## Booked_hotelYes     -0.67091    0.29838  -2.249 0.024543 *  
## FliesNo Record       0.03698    0.45311   0.082 0.934949    
## FliesYes             1.69483    0.37764   4.488 7.19e-06 ***
## Service2             2.02964    0.41803   4.855 1.20e-06 ***
## Service3             0.97177    0.48173   2.017 0.043670 *  
## Service4             0.73993    0.38234   1.935 0.052954 .  
## Service5             1.78707    0.43487   4.109 3.97e-05 ***
## Service6             2.65206    0.59383   4.466 7.97e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 524.02  on 446  degrees of freedom
## Residual deviance: 402.61  on 436  degrees of freedom
## AIC: 424.61
## 
## Number of Fisher Scoring iterations: 5

model5

# Model 5: adds the numeric Age to the previous predictors.
model5 <- glm(
  Churn ~ Income + Booked_hotel + Flies + Service + Age,
  family = "binomial",
  data = customer
)
summary(model5)

## 
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies + Service + 
##     Age, family = "binomial", data = customer)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.90897    1.26461   1.510 0.131163    
## IncomeLow Income    -0.25851    0.38131  -0.678 0.497808    
## IncomeMiddle Income -1.30562    0.52707  -2.477 0.013245 *  
## Booked_hotelYes     -0.69388    0.30201  -2.298 0.021586 *  
## FliesNo Record      -0.02253    0.45743  -0.049 0.960708    
## FliesYes             1.95858    0.39533   4.954 7.26e-07 ***
## Service2             2.19100    0.42917   5.105 3.31e-07 ***
## Service3             1.09997    0.48469   2.269 0.023241 *  
## Service4             0.72587    0.39035   1.860 0.062954 .  
## Service5             1.68633    0.44893   3.756 0.000172 ***
## Service6             2.74710    0.59664   4.604 4.14e-06 ***
## Age                 -0.12517    0.03933  -3.182 0.001460 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 524.02  on 446  degrees of freedom
## Residual deviance: 391.96  on 435  degrees of freedom
## AIC: 415.96
## 
## Number of Fisher Scoring iterations: 5

Do churners and non-churners have the same mean age? (t-test)

# Two-sample t-test comparing mean Age between the two Churn groups.
# The hypothesised difference defaults to 0, so `mu` is left unspecified.
t <- t.test(customer$Age ~ customer$Churn)
t

## 
##  Welch Two Sample t-test
## 
## data:  customer$Age by customer$Churn
## t = 2.1814, df = 197.73, p-value = 0.03033
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.08068199 1.60057905
## sample estimates:
## mean in group 0 mean in group 1 
##        32.45538        31.61475

H0: the mean age of churners equals the mean age of non-churners. We reject the null hypothesis (p = 0.03033 < 0.05) and conclude that there is a statistically significant difference in mean age between churners and non-churners.

Is there a difference in mean age between those who are on social media and those who are not?
t.test

# Two-sample t-test comparing mean Age between the Social_media groups.
# The hypothesised difference defaults to 0, so `mu` is left unspecified.
t <- t.test(customer$Age ~ customer$Social_media)
t

## 
##  Welch Two Sample t-test
## 
## data:  customer$Age by customer$Social_media
## t = -0.66816, df = 408.26, p-value = 0.5044
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -0.8701725  0.4286944
## sample estimates:
##  mean in group No mean in group Yes 
##          32.13410          32.35484

H0: there is no difference in mean age between those who are on social media and those who are not. We fail to reject the null hypothesis (p = 0.5044 > 0.05) and conclude that the mean age of those on social media is not significantly different from that of those who are not.

ustomer$Flies =="Yes")

CHI-SQUARE TEST

# Cross-tabulate Churn (rows) against Social_media (columns) and plot the
# counts as grouped bars, with a legend for the row (Churn) levels.
# `legend.text` is spelled out in full rather than relying on partial
# argument matching, and TRUE is used because `T` can be reassigned.
tab <- table(customer$Churn, customer$Social_media)
barplot(tab, beside = TRUE, legend.text = TRUE)

# Chi-squared test of independence on the 2 x 2 table; `correct = TRUE`
# applies Yates' continuity correction.
CHI <- chisq.test(tab, correct = TRUE)
CHI

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 3.5403, df = 1, p-value = 0.05989

# List the components stored in the chi-squared test result object.
attributes(CHI)

## $names
## [1] "statistic" "parameter" "p.value"   "method"    "data.name" "observed" 
## [7] "expected"  "residuals" "stdres"   
## 
## $class
## [1] "htest"

FISHERS EXACT TEST

# Fisher's exact test on the current `tab`, reporting a 99% confidence
# interval for the odds ratio. TRUE is used because `T` can be reassigned.
fish <- fisher.test(tab, conf.int = TRUE, conf.level = 0.99)
fish

## 
##  Fisher's Exact Test for Count Data
## 
## data:  tab
## p-value = 0.05268
## alternative hypothesis: true odds ratio is not equal to 1
## 99 percent confidence interval:
##  0.860886 2.710787
## sample estimates:
## odds ratio 
##    1.52689

# List the components stored in the Fisher test result object.
attributes(fish)

## $names
## [1] "p.value"     "conf.int"    "estimate"    "null.value"  "alternative"
## [6] "method"      "data.name"  
## 
## $class
## [1] "htest"

CHI-SQUARE TEST

# Cross-tabulate Flies (rows) against Income (columns) and plot the counts as
# grouped bars, with a legend for the row (Flies) levels.
# `legend.text` is spelled out in full rather than relying on partial
# argument matching, and TRUE is used because `T` can be reassigned.
tab <- table(customer$Flies, customer$Income)
barplot(tab, beside = TRUE, legend.text = TRUE)

# Chi-squared test of independence. The continuity correction only applies
# to 2 x 2 tables, so `correct` has no effect on this 3 x 3 table.
CHI <- chisq.test(tab, correct = TRUE)
CHI

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 230.02, df = 4, p-value < 2.2e-16

# List the components stored in the chi-squared test result object.
attributes(CHI)

## $names
## [1] "statistic" "parameter" "p.value"   "method"    "data.name" "observed" 
## [7] "expected"  "residuals" "stdres"   
## 
## $class
## [1] "htest"

travel_churn

mugo

12 November 2023