## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: airports
## 
## Loading required package: cherryblossom
## 
## Loading required package: usdata
## 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

1. Data set babies


** Question 1**: With the babies data set, investigate whether there is a correlation between mother’s weight and the gestation length.

data1<- filter(babies, !is.na(weight) & !is.na(gestation))
 cor(data1$weight,data1$gestation)
## [1] 0.02236553
  ggplot(data1, mapping = aes(weight, gestation)) + geom_smooth() + geom_point() +
    labs(title = "Weight vs Gestation length", x = "Weight in pounds",y = "Gestation length in days") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Answer: There is no correlation between the two variables from the graph and the correlation value.


** Question 2**: Investigate all factors of mother in the data set to see which factor may have an non-negligible correlation with the gestation length.

my_data <- select(babies,gestation,parity,age,height,weight,smoke)
ggpairs(my_data)

data1 <- babies$gestation[babies$smoke == 0]
data2  <-babies$gestation[babies$smoke == 1]
t.test(data1, data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = 2.3939, df = 1092.6, p-value = 0.01684
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.3981501 4.0173229
## sample estimates:
## mean of x mean of y 
##  280.1869  277.9792

Answer: The variable “smoke” is non-negligible variable with the gestation length according to the t test, and all other variables(numerical) show no non-neglibible correlation with the gestation length according to the graph.


** Question 3**: Investigate all factors in the data set to see which factor may have an non-negligible correlation with the birthweight of babies.

my_data <- select(babies,bwt, gestation, parity,age,height,weight,smoke)
ggpairs(my_data)

data1 <- babies$bwt[babies$smoke == 0]
data2  <-babies$bwt[babies$smoke == 1]
t.test(data1, data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = 8.5813, df = 1003.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   6.89385 10.98148
## sample estimates:
## mean of x mean of y 
##  123.0472  114.1095

Answer: The variable “smoke” has non-negligible correlation with the birthweight according to the t test, gestation length also has a non-negligible factor to the birthweight,all other variable didn’t show strong correlation with birthweight according to the graph.


2. Data set bank_data

library(tidyverse)
library(openintro)
library(GGally)
bank_data <- read_csv("BankChurners.csv")


** Question 1**: Find two numeric variables that are highly correlated by checking the correlation coefficient. Then create a graph to illustrate that.

my_data <- select(bank_data, Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon, Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal, Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1 ,Total_Trans_Amt ,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio ,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 , Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2)
ggpairs(my_data)    

ggplot(bank_data, mapping= aes(Customer_Age, Months_on_book)) + geom_point() + geom_smooth() + labs(title = "Customer age vs Relation with bank by month", x = "Customer age",y = "Relationship length in month") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

ggplot(bank_data, mapping= aes(Credit_Limit, Avg_Open_To_Buy)) + geom_point() + geom_smooth() + labs(title = "Credit Line vs Available line of credit", x = "Credit limit",y = "Available credit line") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

ggplot(bank_data, mapping= aes(Total_Trans_Amt , Total_Trans_Ct )) + geom_point() + geom_smooth() + labs(title = "Total_Trans_Amt vs Total_Trans_Ct", x = "Total Transaction amount",y = "Total Transaction count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

ggplot(bank_data, mapping= aes(Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 , Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 )) + geom_point()  + labs(title = "Total_Trans_Amt vs Total_Trans_Ct", x = "Total Transaction amount",y = "Total Transaction count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))


** Question 2**: Find two categorical variables (other than Attrition_Flag) that are strongly dependent of each other. Then create a graph to illustrate that.

my_data <- select(bank_data,Gender,Education_Level,Marital_Status,Income_Category ,Card_Category  )
ggpairs(my_data)

chisq.test(table(bank_data$Education_Level, bank_data$Income_Category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Education_Level, bank_data$Income_Category)
## X-squared = 45.254, df = 30, p-value = 0.03655
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill =Education_Level  ), position = "dodge") + coord_flip() +
  labs(title = "Education level by income cagegory", x = "Income_Category",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

chisq.test(table(bank_data$Marital_Status, bank_data$Card_Category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Marital_Status, bank_data$Card_Category)
## X-squared = 32.247, df = 9, p-value = 0.0001805
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Marital_Status), position = "dodge") +
  labs(title = "Card category by Marital status", x = "Card category level",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

chisq.test(table(bank_data$Income_Category, bank_data$Gender))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Income_Category, bank_data$Gender)
## X-squared = 7138.4, df = 5, p-value < 2.2e-16
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill = Gender), position = "dodge") +
  labs(title = "Income category by Gender", x = "Income category",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

chisq.test(table(bank_data$Income_Category, bank_data$Card_Category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Income_Category, bank_data$Card_Category)
## X-squared = 100.17, df = 15, p-value = 1.211e-14
ggplot(bank_data) + geom_bar(mapping = aes(x = Income_Category, fill = Card_Category), position = "dodge") +
  labs(title = "Income category by Card category", x = "Income category",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

chisq.test(table(bank_data$Card_Category, bank_data$Gender))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Card_Category, bank_data$Gender)
## X-squared = 75.01, df = 3, p-value = 3.605e-16
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Gender), position = "dodge") +
  labs(title = "Card category by gender", x = "Card category",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))

chisq.test(table(bank_data$Card_Category, bank_data$Marital_Status))
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Card_Category, bank_data$Marital_Status)
## X-squared = 32.247, df = 9, p-value = 0.0001805
ggplot(bank_data) + geom_bar(mapping = aes(x = Card_Category, fill = Marital_Status), position = "dodge") +
  labs(title = "Card category by Marital status", x = "Card category",y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)), axis.title= element_text(size = rel(1.1)))


** Question 3**: Find all variables that have non-negligible correlation or dependence with Attrition_Flag.

chisq.test(table(bank_data$Attrition_Flag, bank_data$Gender)) 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(bank_data$Attrition_Flag, bank_data$Gender)
## X-squared = 13.866, df = 1, p-value = 0.0001964
chisq.test(table(bank_data$Attrition_Flag, bank_data$Income_Category )) 
## 
##  Pearson's Chi-squared test
## 
## data:  table(bank_data$Attrition_Flag, bank_data$Income_Category)
## X-squared = 12.832, df = 5, p-value = 0.025
data1 <- bank_data$Contacts_Count_12_mon[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Contacts_Count_12_mon[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = 20.901, df = 2280.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.5581957 0.6737818
## sample estimates:
## mean of x mean of y 
##  2.972342  2.356353
data1 <- bank_data$Credit_Limit[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Credit_Limit[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -2.401, df = 2290.4, p-value = 0.01643
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1073.4010  -108.2751
## sample estimates:
## mean of x mean of y 
##  8136.039  8726.878
data1 <- bank_data$Total_Revolving_Bal[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Revolving_Bal[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -24.047, df = 2067.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -631.3897 -536.1725
## sample estimates:
## mean of x mean of y 
##   672.823  1256.604
data1 <- bank_data$Total_Amt_Chng_Q4_Q1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Amt_Chng_Q4_Q1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -13.423, df = 2310.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08966254 -0.06680336
## sample estimates:
## mean of x mean of y 
## 0.6942766 0.7725095
data1 <- bank_data$Total_Trans_Amt[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Trans_Amt[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -22.686, df = 3264.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1694.425 -1424.835
## sample estimates:
## mean of x mean of y 
##  3095.026  4654.656
data1 <- bank_data$Total_Trans_Ct[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Trans_Ct[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -54.142, df = 3386.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -24.59864 -22.87930
## sample estimates:
## mean of x mean of y 
##  44.93362  68.67259
data1 <- bank_data$Total_Ct_Chng_Q4_Q1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Total_Ct_Chng_Q4_Q1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -30.607, df = 2299.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2000965 -0.1759997
## sample estimates:
## mean of x mean of y 
## 0.5543860 0.7424341
data1 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Avg_Utilization_Ratio[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -18.623, df = 2336, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1480402 -0.1198331
## sample estimates:
## mean of x mean of y 
## 0.1624751 0.2964118
data1 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = 9322.9, df = 1626.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.9945137 0.9949322
## sample estimates:
##    mean of x    mean of y 
## 0.9949085986 0.0001856503
data1 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2[bank_data$Attrition_Flag != "Existing Customer"]
data2 <- bank_data$Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2[bank_data$Attrition_Flag == "Existing Customer"]
t.test(data1,data2)
## 
##  Welch Two Sample t-test
## 
## data:  data1 and data2
## t = -9322.9, df = 1626.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9949323 -0.9945137
## sample estimates:
##  mean of x  mean of y 
## 0.00509138 0.99981440

Answer:There are total 12 variables that have non-negligible correlation or dependence with Attrition_Flag, they are: Gender,Income_Category,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2.