## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: airports
##
## Loading required package: cherryblossom
##
## Loading required package: usdata
##
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
babiesdata1<- filter(babies, !is.na(weight) & !is.na(gestation))
cor(data1$weight,data1$gestation)
## [1] 0.02236553
ggplot(data1, mapping = aes(weight, gestation)) + geom_smooth() + geom_point() +
labs(title = "Weight vs Gestation length", x = "Weight in pounds",y = "Gestation length in days") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Answer: There is no correlation between the two variables from the graph and the correlation value.
my_data <- select(babies,gestation,parity,age,height,weight,smoke)
ggpairs(my_data)
data1 <- babies$gestation[babies$smoke == 0]
data2 <-babies$gestation[babies$smoke == 1]
t.test(data1, data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = 2.3939, df = 1092.6, p-value = 0.01684
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.3981501 4.0173229
## sample estimates:
## mean of x mean of y
## 280.1869 277.9792
Answer: The variable “smoke” is non-negligible variable with the gestation length according to the t test, and all other variables(numerical) show no non-neglibible correlation with the gestation length according to the graph.
my_data <- select(babies,bwt, gestation, parity,age,height,weight,smoke)
ggpairs(my_data)
data1 <- babies$bwt[babies$smoke == 0]
data2 <-babies$bwt[babies$smoke == 1]
t.test(data1, data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = 8.5813, df = 1003.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 6.89385 10.98148
## sample estimates:
## mean of x mean of y
## 123.0472 114.1095
Answer: The variable “smoke” has non-negligible correlation with the birthweight according to the t test, gestation length also has a non-negligible factor to the birthweight,all other variable didn’t show strong correlation with birthweight according to the graph.
bank_datalibrary(tidyverse)
library(openintro)
library(GGally)
bank_data <- read_csv("BankChurners.csv")
my_data <- select(bank_data,Gender,Education_Level,Marital_Status,Income_Category ,Card_Category )
ggpairs(my_data)
chisq.test(table(bank_data$Education_Level, bank_data$Income_Category))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Education_Level, bank_data$Income_Category)
## X-squared = 45.254, df = 30, p-value = 0.03655
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill =Education_Level ), position = "dodge") + coord_flip() +
labs(title = "Education level by income cagegory", x = "Income_Category",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Marital_Status, bank_data$Card_Category))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Marital_Status, bank_data$Card_Category)
## X-squared = 32.247, df = 9, p-value = 0.0001805
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Marital_Status), position = "dodge") +
labs(title = "Card category by Marital status", x = "Card category level",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Income_Category, bank_data$Gender))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Income_Category, bank_data$Gender)
## X-squared = 7138.4, df = 5, p-value < 2.2e-16
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill = Gender), position = "dodge") +
labs(title = "Income category by Gender", x = "Income category",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Income_Category, bank_data$Card_Category))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Income_Category, bank_data$Card_Category)
## X-squared = 100.17, df = 15, p-value = 1.211e-14
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill = Card_Category), position = "dodge") +
labs(title = "Income category by Card category", x = "Income category",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Card_Category, bank_data$Gender))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Card_Category, bank_data$Gender)
## X-squared = 75.01, df = 3, p-value = 3.605e-16
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Gender), position = "dodge") +
labs(title = "Card category by gender", x = "Card category",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Card_Category, bank_data$Marital_Status))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Card_Category, bank_data$Marital_Status)
## X-squared = 32.247, df = 9, p-value = 0.0001805
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Marital_Status), position = "dodge") +
labs(title = "Card category by Marital status", x = "Card category",y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
chisq.test(table(bank_data$Attrition_Flag, bank_data$Gender))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(bank_data$Attrition_Flag, bank_data$Gender)
## X-squared = 13.866, df = 1, p-value = 0.0001964
chisq.test(table(bank_data$Attrition_Flag, bank_data$Income_Category ))
##
## Pearson's Chi-squared test
##
## data: table(bank_data$Attrition_Flag, bank_data$Income_Category)
## X-squared = 12.832, df = 5, p-value = 0.025
data1 <- bank_data$Contacts_Count_12_mon[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Contacts_Count_12_mon[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = 20.901, df = 2280.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.5581957 0.6737818
## sample estimates:
## mean of x mean of y
## 2.972342 2.356353
data1 <- bank_data$Credit_Limit[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Credit_Limit[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -2.401, df = 2290.4, p-value = 0.01643
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1073.4010 -108.2751
## sample estimates:
## mean of x mean of y
## 8136.039 8726.878
data1 <- bank_data$Total_Revolving_Bal[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Revolving_Bal[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -24.047, df = 2067.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -631.3897 -536.1725
## sample estimates:
## mean of x mean of y
## 672.823 1256.604
data1 <- bank_data$Total_Amt_Chng_Q4_Q1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Amt_Chng_Q4_Q1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -13.423, df = 2310.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08966254 -0.06680336
## sample estimates:
## mean of x mean of y
## 0.6942766 0.7725095
data1 <- bank_data$Total_Trans_Amt[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Trans_Amt[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -22.686, df = 3264.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1694.425 -1424.835
## sample estimates:
## mean of x mean of y
## 3095.026 4654.656
data1 <- bank_data$Total_Trans_Ct[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Trans_Ct[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -54.142, df = 3386.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -24.59864 -22.87930
## sample estimates:
## mean of x mean of y
## 44.93362 68.67259
data1 <- bank_data$Total_Ct_Chng_Q4_Q1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Ct_Chng_Q4_Q1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -30.607, df = 2299.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2000965 -0.1759997
## sample estimates:
## mean of x mean of y
## 0.5543860 0.7424341
data1 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1480402 -0.1198331
## sample estimates:
## mean of x mean of y
## 0.1624751 0.2964118
data1 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = 9322.9, df = 1626.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.9945137 0.9949322
## sample estimates:
## mean of x mean of y
## 0.9949085986 0.0001856503
data1 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
##
## Welch Two Sample t-test
##
## data: data1 and data2
## t = -9322.9, df = 1626.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9949323 -0.9945137
## sample estimates:
## mean of x mean of y
## 0.00509138 0.99981440
Answer:There are total 12 variables that have non-negligible correlation or dependence with Attrition_Flag, they are: Gender,Income_Category,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2.