library(readr)
# Load the raw customer travel data from a local CSV file.
# NOTE(review): absolute, machine-specific Windows path; adjust it before
# running this script on another machine.
Customertravel <- read_csv("C:/Users/USER/Desktop/Customertravel.csv")
## Rows: 954 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): FrequentFlyer, AnnualIncomeClass, AccountSyncedToSocialMedia, Booke...
## dbl (3): Age, ServicesOpted, Target
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Drop exact duplicate rows, then report the remaining rows x columns.
customer <- distinct(Customertravel)
dim(customer)
## [1] 447 7
# Replace the source headers in columns 2-7 with short names (by position),
# then title-case every column name so the casing is uniform.
colnames(customer)[2:7] <- c(
  "Flies", "income", "Service", "Social_media", "Booked_hotel", "churn"
)
names(customer) <- str_to_title(names(customer))
names(customer)
## [1] "Age" "Flies" "Income" "Service" "Social_media"
## [6] "Booked_hotel" "Churn"
CHANGE TO CATEGORICAL
# Treat every column except Age as categorical: convert each to a factor,
# then inspect the resulting structure.
customer <- customer %>%
  mutate(
    across(
      c(Churn, Booked_hotel, Social_media, Income, Flies, Service),
      as.factor
    )
  )
str(customer)
## tibble [447 × 7] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:447] 34 34 37 30 30 27 34 34 30 36 ...
## $ Flies : Factor w/ 3 levels "No","No Record",..: 1 3 1 1 1 3 1 1 1 3 ...
## $ Income : Factor w/ 3 levels "High Income",..: 3 2 3 3 2 1 3 2 2 1 ...
## $ Service : Factor w/ 6 levels "1","2","3","4",..: 6 5 3 2 1 1 4 2 3 1 ...
## $ Social_media: Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 2 2 1 1 ...
## $ Booked_hotel: Factor w/ 2 levels "No","Yes": 2 1 1 1 1 2 2 1 2 1 ...
## $ Churn : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...
# Per-column overview: quartiles and mean for Age, level counts for factors.
summary(customer)
## Age Flies Income Service Social_media
## Min. :27.00 No :250 High Income : 69 1:150 No :261
## 1st Qu.:29.00 No Record: 53 Low Income :205 2: 71 Yes:186
## Median :31.00 Yes :144 Middle Income:173 3: 63
## Mean :32.23 4: 80
## 3rd Qu.:35.00 5: 55
## Max. :38.00 6: 28
## Booked_hotel Churn
## No :258 0:325
## Yes:189 1:122
##
##
##
##
# Spread and mean of the logical indicator "customer is a frequent flyer".
# The square root of the variance and sd() should agree.
sqrt(var(customer$Flies == "Yes"))
## [1] 0.4678228
sd(customer$Flies == "Yes")
## [1] 0.4678228
# Mean of a logical vector = proportion of TRUE values.
mean(customer$Flies == "Yes")
## [1] 0.3221477
Note that the square root of the variance equals the standard deviation: both give 0.4678 for the indicator `Flies == "Yes"`.
model1
# Model 1: logistic regression of churn on income class only.
model1 <- glm(
  Churn ~ Income,
  data = customer,
  family = "binomial"
)
summary(model1)
##
## Call:
## glm(formula = Churn ~ Income, family = "binomial", data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3216 0.2439 1.319 0.187318
## IncomeLow Income -1.1115 0.2867 -3.877 0.000106 ***
## IncomeMiddle Income -2.4746 0.3485 -7.100 1.25e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 524.02 on 446 degrees of freedom
## Residual deviance: 463.96 on 444 degrees of freedom
## AIC: 469.96
##
## Number of Fisher Scoring iterations: 4
Relative to the high-income group (the reference level), low-income customers have log-odds of churning that are 1.1115 lower, and middle-income customers have log-odds that are 2.4746 lower. Both income coefficients are statistically significant, with p-values below 0.05.
model2
# Model 2: add hotel-booking status to the income-only model.
model2 <- glm(
  Churn ~ Income + Booked_hotel,
  data = customer,
  family = "binomial"
)
summary(model2)
##
## Call:
## glm(formula = Churn ~ Income + Booked_hotel, family = "binomial",
## data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4532 0.2533 1.789 0.073555 .
## IncomeLow Income -1.0569 0.2891 -3.655 0.000257 ***
## IncomeMiddle Income -2.3494 0.3531 -6.654 2.85e-11 ***
## Booked_hotelYes -0.5221 0.2444 -2.137 0.032630 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 524.02 on 446 degrees of freedom
## Residual deviance: 459.31 on 443 degrees of freedom
## AIC: 467.31
##
## Number of Fisher Scoring iterations: 4
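Holding income constant, customers who booked a hotel have log-odds of churning that are 0.5221 lower than those who did not; this corresponds to an odds ratio of exp(-0.5221) ≈ 0.59, i.e. about 41 percent lower odds of churning.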
model3
# Model 3: add frequent-flyer status to model 2.
model3 <- glm(
  Churn ~ Income + Booked_hotel + Flies,
  data = customer,
  family = "binomial"
)
summary(model3)
##
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies, family = "binomial",
## data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.65692 0.40945 -1.604 0.108625
## IncomeLow Income -0.40179 0.33935 -1.184 0.236415
## IncomeMiddle Income -1.23658 0.47419 -2.608 0.009113 **
## Booked_hotelYes -0.52032 0.24895 -2.090 0.036611 *
## FliesNo Record -0.01837 0.42987 -0.043 0.965917
## FliesYes 1.10966 0.32214 3.445 0.000572 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 524.02 on 446 degrees of freedom
## Residual deviance: 446.61 on 441 degrees of freedom
## AIC: 458.61
##
## Number of Fisher Scoring iterations: 4
model4
# Model 4: add the number of services opted (as a factor) to model 3.
model4 <- glm(
  Churn ~ Income + Booked_hotel + Flies + Service,
  data = customer,
  family = "binomial"
)
summary(model4)
##
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies + Service,
## family = "binomial", data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.83128 0.50498 -3.626 0.000287 ***
## IncomeLow Income -0.40676 0.37457 -1.086 0.277504
## IncomeMiddle Income -1.47648 0.52059 -2.836 0.004566 **
## Booked_hotelYes -0.67091 0.29838 -2.249 0.024543 *
## FliesNo Record 0.03698 0.45311 0.082 0.934949
## FliesYes 1.69483 0.37764 4.488 7.19e-06 ***
## Service2 2.02964 0.41803 4.855 1.20e-06 ***
## Service3 0.97177 0.48173 2.017 0.043670 *
## Service4 0.73993 0.38234 1.935 0.052954 .
## Service5 1.78707 0.43487 4.109 3.97e-05 ***
## Service6 2.65206 0.59383 4.466 7.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 524.02 on 446 degrees of freedom
## Residual deviance: 402.61 on 436 degrees of freedom
## AIC: 424.61
##
## Number of Fisher Scoring iterations: 5
model5
# Model 5: add Age (numeric) to model 4.
model5 <- glm(
  Churn ~ Income + Booked_hotel + Flies + Service + Age,
  data = customer,
  family = "binomial"
)
summary(model5)
##
## Call:
## glm(formula = Churn ~ Income + Booked_hotel + Flies + Service +
## Age, family = "binomial", data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.90897 1.26461 1.510 0.131163
## IncomeLow Income -0.25851 0.38131 -0.678 0.497808
## IncomeMiddle Income -1.30562 0.52707 -2.477 0.013245 *
## Booked_hotelYes -0.69388 0.30201 -2.298 0.021586 *
## FliesNo Record -0.02253 0.45743 -0.049 0.960708
## FliesYes 1.95858 0.39533 4.954 7.26e-07 ***
## Service2 2.19100 0.42917 5.105 3.31e-07 ***
## Service3 1.09997 0.48469 2.269 0.023241 *
## Service4 0.72587 0.39035 1.860 0.062954 .
## Service5 1.68633 0.44893 3.756 0.000172 ***
## Service6 2.74710 0.59664 4.604 4.14e-06 ***
## Age -0.12517 0.03933 -3.182 0.001460 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 524.02 on 446 degrees of freedom
## Residual deviance: 391.96 on 435 degrees of freedom
## AIC: 415.96
##
## Number of Fisher Scoring iterations: 5
Do churners and non-churners have the same mean age? (two-sample t-test)
# Two-sample t-test of Age between the two Churn groups
# (null hypothesis: difference in means is 0).
t <- t.test(
  customer$Age ~ customer$Churn,
  mu = 0
)
t
##
## Welch Two Sample t-test
##
## data: customer$Age by customer$Churn
## t = 2.1814, df = 197.73, p-value = 0.03033
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.08068199 1.60057905
## sample estimates:
## mean in group 0 mean in group 1
## 32.45538 31.61475
H0: mean age of churners = mean age of non-churners. With p = 0.030 < 0.05 we reject the null hypothesis and conclude that there is a statistically significant difference in mean age between churners and non-churners.
Is there a difference in age between customers who are on social media and those who are not?
t.test
```r
# Two-sample t-test of Age between the Social_media groups ("No" vs "Yes");
# null hypothesis: difference in means is 0.
t <- t.test(
  customer$Age ~ customer$Social_media,
  mu = 0
)
t
##
## Welch Two Sample t-test
##
## data: customer$Age by customer$Social_media
## t = -0.66816, df = 408.26, p-value = 0.5044
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -0.8701725 0.4286944
## sample estimates:
## mean in group No mean in group Yes
## 32.13410 32.35484
H0: there is no difference in mean age between customers who are on social media and those who are not. With p = 0.504 > 0.05 we fail to reject the null hypothesis; there is no evidence that the mean age differs between the two groups.
ustomer$Flies =="Yes")
CHI-SQUARE TEST
# Cross-tabulate churn status (rows) against social-media status (columns).
tab <- table(customer$Churn, customer$Social_media)
# Grouped bar chart of the counts. `legend.text = TRUE` is spelled out in full
# (the original relied on partial matching of `legend`) and labels the bars
# with the table's row names.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Chi-squared test of independence; `correct = TRUE` requests the continuity
# correction for this 2x2 table. TRUE is used instead of the reassignable T.
CHI <- chisq.test(tab, correct = TRUE)
CHI
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab
## X-squared = 3.5403, df = 1, p-value = 0.05989
# List the components stored in the test result object and its class.
attributes(CHI)
## $names
## [1] "statistic" "parameter" "p.value" "method" "data.name" "observed"
## [7] "expected" "residuals" "stdres"
##
## $class
## [1] "htest"
FISHERS EXACT TEST
# Fisher's exact test on the same contingency table, with a 99% confidence
# interval for the odds ratio. TRUE is used instead of the reassignable T.
fish <- fisher.test(tab, conf.int = TRUE, conf.level = 0.99)
fish
##
## Fisher's Exact Test for Count Data
##
## data: tab
## p-value = 0.05268
## alternative hypothesis: true odds ratio is not equal to 1
## 99 percent confidence interval:
## 0.860886 2.710787
## sample estimates:
## odds ratio
## 1.52689
# List the components stored in the Fisher test result and its class.
attributes(fish)
## $names
## [1] "p.value" "conf.int" "estimate" "null.value" "alternative"
## [6] "method" "data.name"
##
## $class
## [1] "htest"
CHI-SQUARE TEST
# Cross-tabulate frequent-flyer status (rows) against income class (columns).
# Note: this overwrites the earlier `tab` and `CHI` objects.
tab <- table(customer$Flies, customer$Income)
# Grouped bar chart of the counts. `legend.text = TRUE` is spelled out in full
# (the original relied on partial matching of `legend`) and labels the bars
# with the table's row names.
barplot(tab, beside = TRUE, legend.text = TRUE)
# Chi-squared test of independence. `correct` only affects 2x2 tables, so it
# has no effect on this 3x3 table; kept for parity with the original call.
# TRUE is used instead of the reassignable T.
CHI <- chisq.test(tab, correct = TRUE)
CHI
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 230.02, df = 4, p-value < 2.2e-16
# List the components stored in the test result object and its class.
attributes(CHI)
## $names
## [1] "statistic" "parameter" "p.value" "method" "data.name" "observed"
## [7] "expected" "residuals" "stdres"
##
## $class
## [1] "htest"