library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.1
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## 载入需要的程序包:airports
## 载入需要的程序包:cherryblossom
## 载入需要的程序包:usdata
# Load the BankChurners customer data ----
# The CSV location defaults to the path this analysis was originally run with,
# but can be overridden by setting the BANK_CHURNERS_CSV environment variable,
# so the script does not depend on one machine's drive layout.
bank_data_path <- Sys.getenv(
  "BANK_CHURNERS_CSV",
  unset = "D:/lilith/BankChurners.csv"
)
# Fail early with an explicit message when the file is missing.
if (!file.exists(bank_data_path)) {
  stop("BankChurners CSV not found at: ", bank_data_path, call. = FALSE)
}
# show_col_types = FALSE quiets the column-specification message that readr
# prints by default; call spec(bank_data) to retrieve the full specification.
bank_data <- read_csv(bank_data_path, show_col_types = FALSE)
# Recorded from an earlier run with the default path: 10127 rows, 20 columns,
# delimiter ","; 6 character columns (Attrition_Flag, Gender, Education_Level,
# Marital_Status, ...) and 14 double columns (Customer_Age, Dependent_count,
# Months_on_book, ...).
# Relationship between Avg_Open_To_Buy and Credit_Limit ----
# Scatter plot of the two columns with ggplot2's default smoother overlaid.
bank_data |>
  ggplot(mapping = aes(x = Avg_Open_To_Buy, y = Credit_Limit)) +
  geom_point() +
  geom_smooth()
# Recorded message:
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Correlation between the same two columns (cor() defaults to Pearson).
with(bank_data, cor(Avg_Open_To_Buy, Credit_Limit))
# Recorded result: [1] 0.9959805
# Association between card category and income category ----
# An earlier run of the plain asymptotic test,
#   chisq.test(table(bank_data$Card_Category, bank_data$Income_Category)),
# reported X-squared = 100.17, df = 15, p-value = 1.211e-14, but R warned that
# the chi-squared approximation may be incorrect. To avoid relying on that
# approximation, the p-value is now computed by Monte Carlo simulation.
card_by_income <- table(bank_data$Card_Category, bank_data$Income_Category)

# Fixed seed so the simulated p-value is reproducible between runs.
# NOTE: this resets the global RNG state for any code that follows.
set.seed(1)

# With simulate.p.value = TRUE the test statistic is unchanged, no degrees of
# freedom are reported, and the p-value cannot be smaller than 1 / (B + 1).
chisq.test(card_by_income, simulate.p.value = TRUE, B = 10000)

# Stacked bars scaled to proportions: the share of each card category within
# every income category.
ggplot(bank_data) +
  geom_bar(aes(y = Income_Category, fill = Card_Category), position = "fill")
# Numeric columns versus attrition ----
# Encode attrition as 1 when Attrition_Flag is "Attrited Customer" and 0
# otherwise, so it can be correlated with numeric columns.
bank_data$Attrition_binary <- as.numeric(
  bank_data$Attrition_Flag == "Attrited Customer"
)

# Each call below is a two-sided Pearson product-moment correlation test; the
# comment after it records the output of an earlier run (df = 10125 in all).

# Total transaction amount.
cor.test(
  x = bank_data$Total_Trans_Amt,
  y = bank_data$Attrition_binary,
  method = "pearson"
)
# Recorded: cor = -0.1685984, 95% CI [-0.1874596, -0.1496129],
#           t = -17.211, p-value < 2.2e-16

# Total transaction count.
cor.test(
  x = bank_data$Total_Trans_Ct,
  y = bank_data$Attrition_binary,
  method = "pearson"
)
# Recorded: cor = -0.3714027, 95% CI [-0.3880723, -0.3544902],
#           t = -40.251, p-value < 2.2e-16

# Months inactive in the last 12 months.
cor.test(
  x = bank_data$Months_Inactive_12_mon,
  y = bank_data$Attrition_binary,
  method = "pearson"
)
# Recorded: cor = 0.1524488, 95% CI [0.1333680, 0.1714166],
#           t = 15.521, p-value < 2.2e-16

# Contact count in the last 12 months.
cor.test(
  x = bank_data$Contacts_Count_12_mon,
  y = bank_data$Attrition_binary,
  method = "pearson"
)
# Recorded: cor = 0.2044905, 95% CI [0.1857536, 0.2230788],
#           t = 21.021, p-value < 2.2e-16
# Categorical columns versus attrition ----
# Pearson chi-squared tests of independence between Attrition_Flag and each
# categorical column; the comment after each call records the output of an
# earlier run.

# Gender (Yates' continuity correction was applied in the recorded run).
chisq.test(
  x = table(bank_data$Gender, bank_data$Attrition_Flag)
)
# Recorded: X-squared = 13.866, df = 1, p-value = 0.0001964

# Education level.
chisq.test(
  x = table(bank_data$Education_Level, bank_data$Attrition_Flag)
)
# Recorded: X-squared = 12.511, df = 6, p-value = 0.05149

# Marital status.
chisq.test(
  x = table(bank_data$Marital_Status, bank_data$Attrition_Flag)
)
# Recorded: X-squared = 6.0561, df = 3, p-value = 0.1089
Total_Trans_Amt Total_Trans_Ct Months_Inactive_12_mon Contacts_Count_12_mon