This dataset contains a cleaned version of this dataset from UCI machine learning repository on credit card approvals. Missing values have been filled and feature names and categorical names have been inferred, resulting in more context and it being easier to use.
# load required packages
# install.packages("plotly")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(ggplot2)
library(dplyr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# load disease and democracy data
setwd("C:/Users/wrxio/projects/Datasets")
creditcardApprove <- read_csv("clean_dataset.csv")
## Rows: 690 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): Industry, Ethnicity, Citizen, ZipCode
## dbl (12): Gender, Age, Debt, Married, BankCustomer, YearsEmployed, PriorDefa...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# check the result
head(creditcardApprove)
## # A tibble: 6 x 16
## Gender Age Debt Married BankCustomer Industry Ethnicity YearsEmployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communication~ White 2.5
## # ... with 8 more variables: PriorDefault <dbl>, Employed <dbl>,
## # CreditScore <dbl>, DriversLicense <dbl>, Citizen <chr>, ZipCode <chr>,
## # Income <dbl>, Approved <dbl>
# overall to check the dataset
summary(creditcardApprove)
## Gender Age Debt Married
## Min. :0.0000 Min. :13.75 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:22.67 1st Qu.: 1.000 1st Qu.:1.0000
## Median :1.0000 Median :28.46 Median : 2.750 Median :1.0000
## Mean :0.6957 Mean :31.51 Mean : 4.759 Mean :0.7609
## 3rd Qu.:1.0000 3rd Qu.:37.71 3rd Qu.: 7.207 3rd Qu.:1.0000
## Max. :1.0000 Max. :80.25 Max. :28.000 Max. :1.0000
## BankCustomer Industry Ethnicity YearsEmployed
## Min. :0.0000 Length:690 Length:690 Min. : 0.000
## 1st Qu.:1.0000 Class :character Class :character 1st Qu.: 0.165
## Median :1.0000 Mode :character Mode :character Median : 1.000
## Mean :0.7638 Mean : 2.223
## 3rd Qu.:1.0000 3rd Qu.: 2.625
## Max. :1.0000 Max. :28.500
## PriorDefault Employed CreditScore DriversLicense
## Min. :0.0000 Min. :0.0000 Min. : 0.0 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.0 1st Qu.:0.000
## Median :1.0000 Median :0.0000 Median : 0.0 Median :0.000
## Mean :0.5232 Mean :0.4275 Mean : 2.4 Mean :0.458
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.0 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :67.0 Max. :1.000
## Citizen ZipCode Income Approved
## Length:690 Length:690 Min. : 0.0 Min. :0.0000
## Class :character Class :character 1st Qu.: 0.0 1st Qu.:0.0000
## Mode :character Mode :character Median : 5.0 Median :0.0000
## Mean : 1017.4 Mean :0.4449
## 3rd Qu.: 395.5 3rd Qu.:1.0000
## Max. :100000.0 Max. :1.0000
creditcardApprove[,]
## # A tibble: 690 x 16
## Gender Age Debt Married BankCustomer Industry Ethnicity YearsEmployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communicatio~ White 2.5
## 7 1 33.2 1.04 1 1 Transport Black 6.5
## 8 0 22.9 11.6 1 1 InformationT~ White 0.04
## 9 1 54.4 0.5 0 0 Financials Black 3.96
## 10 1 42.5 4.92 0 0 Industrials White 3.16
## # ... with 680 more rows, and 8 more variables: PriorDefault <dbl>,
## # Employed <dbl>, CreditScore <dbl>, DriversLicense <dbl>, Citizen <chr>,
## # ZipCode <chr>, Income <dbl>, Approved <dbl>
names(creditcardApprove) <- tolower(names(creditcardApprove))
names(creditcardApprove) <- gsub(" ","",names(creditcardApprove))
names(creditcardApprove)
## [1] "gender" "age" "debt" "married"
## [5] "bankcustomer" "industry" "ethnicity" "yearsemployed"
## [9] "priordefault" "employed" "creditscore" "driverslicense"
## [13] "citizen" "zipcode" "income" "approved"
str(creditcardApprove)
## spec_tbl_df [690 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ gender : num [1:690] 1 0 0 1 1 1 1 0 1 1 ...
## $ age : num [1:690] 30.8 58.7 24.5 27.8 20.2 ...
## $ debt : num [1:690] 0 4.46 0.5 1.54 5.62 ...
## $ married : num [1:690] 1 1 1 1 1 1 1 1 0 0 ...
## $ bankcustomer : num [1:690] 1 1 1 1 1 1 1 1 0 0 ...
## $ industry : chr [1:690] "Industrials" "Materials" "Materials" "Industrials" ...
## $ ethnicity : chr [1:690] "White" "Black" "Black" "White" ...
## $ yearsemployed : num [1:690] 1.25 3.04 1.5 3.75 1.71 ...
## $ priordefault : num [1:690] 1 1 1 1 1 1 1 1 1 1 ...
## $ employed : num [1:690] 1 1 0 1 0 0 0 0 0 0 ...
## $ creditscore : num [1:690] 1 6 0 5 0 0 0 0 0 0 ...
## $ driverslicense: num [1:690] 0 0 0 1 0 1 1 0 0 1 ...
## $ citizen : chr [1:690] "ByBirth" "ByBirth" "ByBirth" "ByBirth" ...
## $ zipcode : chr [1:690] "00202" "00043" "00280" "00100" ...
## $ income : num [1:690] 0 560 824 3 0 ...
## $ approved : num [1:690] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Gender = col_double(),
## .. Age = col_double(),
## .. Debt = col_double(),
## .. Married = col_double(),
## .. BankCustomer = col_double(),
## .. Industry = col_character(),
## .. Ethnicity = col_character(),
## .. YearsEmployed = col_double(),
## .. PriorDefault = col_double(),
## .. Employed = col_double(),
## .. CreditScore = col_double(),
## .. DriversLicense = col_double(),
## .. Citizen = col_character(),
## .. ZipCode = col_character(),
## .. Income = col_double(),
## .. Approved = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# remove all the NA values for all columns if existing
creditcardApprove_nona <- creditcardApprove %>%
filter(!is.na(gender) & !is.na(age) & !is.na(debt) & !is.na(married) & !is.na(bankcustomer) & !is.na(industry) & !is.na(ethnicity) & !is.na(yearsemployed)& !is.na(priordefault) & !is.na(employed) & !is.na(creditscore) & !is.na(driverslicense) & !is.na(citizen) & !is.na(zipcode) & !is.na(income) & !is.na(approved))
head(creditcardApprove_nona)
## # A tibble: 6 x 16
## gender age debt married bankcustomer industry ethnicity yearsemployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communication~ White 2.5
## # ... with 8 more variables: priordefault <dbl>, employed <dbl>,
## # creditscore <dbl>, driverslicense <dbl>, citizen <chr>, zipcode <chr>,
## # income <dbl>, approved <dbl>
# Check the result
str(creditcardApprove_nona)
## spec_tbl_df [690 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ gender : num [1:690] 1 0 0 1 1 1 1 0 1 1 ...
## $ age : num [1:690] 30.8 58.7 24.5 27.8 20.2 ...
## $ debt : num [1:690] 0 4.46 0.5 1.54 5.62 ...
## $ married : num [1:690] 1 1 1 1 1 1 1 1 0 0 ...
## $ bankcustomer : num [1:690] 1 1 1 1 1 1 1 1 0 0 ...
## $ industry : chr [1:690] "Industrials" "Materials" "Materials" "Industrials" ...
## $ ethnicity : chr [1:690] "White" "Black" "Black" "White" ...
## $ yearsemployed : num [1:690] 1.25 3.04 1.5 3.75 1.71 ...
## $ priordefault : num [1:690] 1 1 1 1 1 1 1 1 1 1 ...
## $ employed : num [1:690] 1 1 0 1 0 0 0 0 0 0 ...
## $ creditscore : num [1:690] 1 6 0 5 0 0 0 0 0 0 ...
## $ driverslicense: num [1:690] 0 0 0 1 0 1 1 0 0 1 ...
## $ citizen : chr [1:690] "ByBirth" "ByBirth" "ByBirth" "ByBirth" ...
## $ zipcode : chr [1:690] "00202" "00043" "00280" "00100" ...
## $ income : num [1:690] 0 560 824 3 0 ...
## $ approved : num [1:690] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Gender = col_double(),
## .. Age = col_double(),
## .. Debt = col_double(),
## .. Married = col_double(),
## .. BankCustomer = col_double(),
## .. Industry = col_character(),
## .. Ethnicity = col_character(),
## .. YearsEmployed = col_double(),
## .. PriorDefault = col_double(),
## .. Employed = col_double(),
## .. CreditScore = col_double(),
## .. DriversLicense = col_double(),
## .. Citizen = col_character(),
## .. ZipCode = col_character(),
## .. Income = col_double(),
## .. Approved = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(creditcardApprove_nona)
## [1] 690 16
# Check the rows and columns, and no missing rows
creditcardApprove_nona[,]
## # A tibble: 690 x 16
## gender age debt married bankcustomer industry ethnicity yearsemployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communicatio~ White 2.5
## 7 1 33.2 1.04 1 1 Transport Black 6.5
## 8 0 22.9 11.6 1 1 InformationT~ White 0.04
## 9 1 54.4 0.5 0 0 Financials Black 3.96
## 10 1 42.5 4.92 0 0 Industrials White 3.16
## # ... with 680 more rows, and 8 more variables: priordefault <dbl>,
## # employed <dbl>, creditscore <dbl>, driverslicense <dbl>, citizen <chr>,
## # zipcode <chr>, income <dbl>, approved <dbl>
# Look at the result to check if still 690 x 16. The result no missing values
# Get the total Non-approved and Approved count
ggplot(creditcardApprove, aes(x = approved)) +
geom_bar(width=0.5, fill = "coral") + #Tried: fill = "blue" and worked
geom_text(stat='count', aes(label=stat(count)), vjust=-0.5,) +
theme_classic()+
ggtitle("Total Number for the Non-approved and Approved") +
xlab("Tatol Number for the Non-approved (0) and Approved (1)") +
ylab("Count Number") +
labs(fill = "Approved Survived by Yes (1) or No (0)")
# Get the result of Total Number of the Each Race In the Data Set
ggplot(creditcardApprove, aes(x = ethnicity, fill=ethnicity)) +
geom_bar(width=0.3,position = position_dodge()) +
#geom_bar(width=0.5, fill = "green", position = position_dodge()) +
geom_text(stat='count', aes(label=stat(count)), position=position_dodge(width=0.5), vjust=-0.5)+
theme_classic() +
ggtitle("Total Number of the Each Race In the Data Set") +
xlab("Race") +
ylab("Count Number for the Race") +
labs(fill = "Total Number of the Each Race")
# Get the result of Total Number Non-approved and Approved of the Each Sex In the Data Set
ggplot(creditcardApprove, aes(x = gender, fill=gender)) +
geom_bar(width=0.3,position = position_dodge()) +
geom_bar(width=0.3, fill = "green", position = position_dodge()) +
geom_text(stat='count', aes(label=stat(count)), position=position_dodge(width=0.5), vjust=-0.5)+
theme_classic() +
ggtitle("Total Number of the Each Sex In the Data Set") +
xlab("Sex (0-female, 1-male)") +
ylab("Count Number for the Sex Category") +
labs(fill = "Total Number of the Each Sex Category")
# Get the result of Age Density
ggplot(creditcardApprove, aes(x = age)) +
geom_density(fill='coral') +
theme_classic()+
ggtitle("Age Density") +
xlab("Age") +
ylab("Density") +
labs(fill = "Age Density")
# Check the total number of rows and columns
dim(creditcardApprove)
## [1] 690 16
# Use Side-by-Side Boxplots to show Approved filled by Age Category.
boxp1 <- creditcardApprove %>% ggplot() + geom_boxplot(aes(y=approved, group=age, fill=age)) +
ggtitle("Side-by-Side Boxplots For Approved filled by Age") +
xlab("Age") +
ylab("Approved")
boxp1
#boxp12 <- boxp1 + guides(fill=FALSE)
#boxp12
head(creditcardApprove)
## # A tibble: 6 x 16
## gender age debt married bankcustomer industry ethnicity yearsemployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communication~ White 2.5
## # ... with 8 more variables: priordefault <dbl>, employed <dbl>,
## # creditscore <dbl>, driverslicense <dbl>, citizen <chr>, zipcode <chr>,
## # income <dbl>, approved <dbl>
# Change the theme
ggplot(creditcardApprove, aes(x = income, y = approved)) +
xlab("Income") +
ylab("Approved") +
theme_minimal(base_size = 12)
# Show Income versus Approved
p1 <- ggplot(creditcardApprove, aes(x = income, y = approved)) +
labs(title = "Income versus Approved",
caption = "Source:UCI machine learning repository on credit card approvals") +
xlab("Income") +
ylab("Approved") +
theme_minimal(base_size = 12)
p1 + geom_point()
cor(creditcardApprove$income, creditcardApprove$approved)
## [1] 0.1756572
fit1 <- lm(income ~ approved, data = creditcardApprove)
summary(fit1)
##
## Call:
## lm(formula = income ~ approved, data = creditcardApprove)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2039 -1659 -199 -171 97961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 198.6 262.3 0.757 0.449
## approved 1840.3 393.2 4.680 3.45e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5133 on 688 degrees of freedom
## Multiple R-squared: 0.03086, Adjusted R-squared: 0.02945
## F-statistic: 21.9 on 1 and 688 DF, p-value: 3.452e-06
head(creditcardApprove)
## # A tibble: 6 x 16
## gender age debt married bankcustomer industry ethnicity yearsemployed
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 30.8 0 1 1 Industrials White 1.25
## 2 0 58.7 4.46 1 1 Materials Black 3.04
## 3 0 24.5 0.5 1 1 Materials Black 1.5
## 4 1 27.8 1.54 1 1 Industrials White 3.75
## 5 1 20.2 5.62 1 1 Industrials White 1.71
## 6 1 32.1 4 1 1 Communication~ White 2.5
## # ... with 8 more variables: priordefault <dbl>, employed <dbl>,
## # creditscore <dbl>, driverslicense <dbl>, citizen <chr>, zipcode <chr>,
## # income <dbl>, approved <dbl>
# Change the theme
ggplot(creditcardApprove, aes(x = age, y = approved)) +
xlab("Age") +
ylab("Approved") +
theme_minimal(base_size = 12)
# Show Age versus Approved
p1 <- ggplot(creditcardApprove, aes(x = age, y = approved)) +
labs(title = "Age versus Approved",
caption = "Source:UCI machine learning repository on credit card approvals") +
xlab("Age") +
ylab("Approved") +
theme_minimal(base_size = 12)
p1 + geom_point()
cor(creditcardApprove$age, creditcardApprove$approved)
## [1] 0.1640862
fit1 <- lm(age ~ approved, data = creditcardApprove)
summary(fit1)
##
## Call:
## lm(formula = age ~ approved, data = creditcardApprove)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.936 -8.693 -2.396 6.289 50.477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.7730 0.5983 49.767 < 2e-16 ***
## approved 3.9132 0.8969 4.363 1.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.71 on 688 degrees of freedom
## Multiple R-squared: 0.02692, Adjusted R-squared: 0.02551
## F-statistic: 19.04 on 1 and 688 DF, p-value: 1.479e-05
# Change the theme
ggplot(creditcardApprove, aes(x = debt, y = approved)) +
xlab("Debt") +
ylab("Approved") +
theme_minimal(base_size = 12)
# Show Debt versus Approved
p1 <- ggplot(creditcardApprove, aes(x = debt, y = approved)) +
labs(title = "Debt versus Approved",
caption = "Source:UCI machine learning repository on credit card approvals") +
xlab("Debt") +
ylab("Approved") +
theme_minimal(base_size = 12)
p1 + geom_point()
cor(creditcardApprove$debt, creditcardApprove$approved)
## [1] 0.2062937
fit1 <- lm(debt ~ approved, data = creditcardApprove)
summary(fit1)
##
## Call:
## lm(formula = debt ~ approved, data = creditcardApprove)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.905 -3.300 -1.590 2.644 22.495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8399 0.2491 15.42 < 2e-16 ***
## approved 2.0650 0.3734 5.53 4.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.875 on 688 degrees of freedom
## Multiple R-squared: 0.04256, Adjusted R-squared: 0.04117
## F-statistic: 30.58 on 1 and 688 DF, p-value: 4.552e-08
# Change the theme
ggplot(creditcardApprove, aes(x = yearsemployed, y = approved)) +
xlab("Years Employed") +
ylab("Approved") +
theme_minimal(base_size = 12)
# Show Years Employed versus Approved
p1 <- ggplot(creditcardApprove, aes(x = yearsemployed, y = approved)) +
labs(title = "Years Employed versus Approved",
caption = "Source:UCI machine learning repository on credit card approvals") +
xlab("Years Employed") +
ylab("Approved") +
theme_minimal(base_size = 12)
p1 + geom_point()
cor(creditcardApprove$yearsemployed, creditcardApprove$approved)
## [1] 0.3224754
fit1 <- lm(yearsemployed ~ approved, data = creditcardApprove)
summary(fit1)
##
## Call:
## lm(formula = yearsemployed ~ approved, data = creditcardApprove)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4279 -1.2579 -0.9679 0.5621 25.0721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2579 0.1620 7.766 2.95e-14 ***
## approved 2.1700 0.2428 8.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.17 on 688 degrees of freedom
## Multiple R-squared: 0.104, Adjusted R-squared: 0.1027
## F-statistic: 79.85 on 1 and 688 DF, p-value: < 2.2e-16
# Change the theme
ggplot(creditcardApprove, aes(x = creditscore, y = approved)) +
xlab("Credit Score") +
ylab("Approved") +
theme_minimal(base_size = 12)
# Show Credit Score versus Approved
p1 <- ggplot(creditcardApprove, aes(x = creditscore, y = approved)) +
labs(title = "Credit Score versus Approved",
caption = "Source:UCI machine learning repository on credit card approvals") +
xlab("Credit Score") +
ylab("Approved") +
theme_minimal(base_size = 12)
p1 + geom_point()
cor(creditcardApprove$creditscore, creditcardApprove$approved)
## [1] 0.40641
fit1 <- lm(creditscore ~ approved, data = creditcardApprove)
summary(fit1)
##
## Call:
## lm(formula = creditscore ~ approved, data = creditcardApprove)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.606 -0.632 -0.632 0.388 62.394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6319 0.2272 2.781 0.00557 **
## approved 3.9740 0.3406 11.667 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.446 on 688 degrees of freedom
## Multiple R-squared: 0.1652, Adjusted R-squared: 0.164
## F-statistic: 136.1 on 1 and 688 DF, p-value: < 2.2e-16
# The result shows some linear regression between credit score and approved. The higher credit score, the more approved
# How to explain these - the higher income, the lower credit score
# Show Credit Score versus Income
p2 <- ggplot(creditcardApprove, aes(x = income, y = creditscore)) +
labs(title = "Credit Score versus Income",
caption = "Source: The UCI machine learning repository on credit card approvals") +
xlab("Income") +
ylab("Credit Score") +
theme_minimal(base_size = 12)
p2 + geom_point()
# Remove DC and US as outliers - row 46 the Credit Score = 40 and row 123 the Credit Score = 67, they are too high and over the max score
creditcardApprove2 <- creditcardApprove[creditcardApprove$creditscore != "40" & creditcardApprove$creditscore != "67",]
# show geom_point() after removing income != 0 as outlines
p3 <- ggplot(creditcardApprove2, aes(x = income, y = creditscore)) +
labs(title = "Credit Score versus Income",
caption = "Source: The UCI machine learning repository on credit card approvals") +
xlab("Income") +
ylab("Credit Score") +
theme_minimal(base_size = 12)
p3 + geom_point()
# the scatterplot appears to show a correlation assessment
p4 <- p3 + xlim(0,25)+ ylim(0,30)
p4 + geom_point()
## Warning: Removed 292 rows containing missing values (geom_point).
# Add a smoother in red with a confidence interval
p5 <- p4 + geom_point() + geom_smooth(color = "red")
p5
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 292 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.125
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.125
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 4.8235e-030
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.125
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.125
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 4.8235e-030
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 1
## Warning: Removed 292 rows containing missing values (geom_point).
# Show Credit Score versus Income
p6 <- p4 + geom_point(size = 3, alpha = 0.5, aes(color = income)) + geom_smooth(method = 'lm', se =FALSE, color = "red", lty = 2, size = 0.3)
p6
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 292 rows containing non-finite values (stat_smooth).
## Warning: Removed 292 rows containing missing values (geom_point).
# Add a linear regression with confidence interval
p7 <- p4 + geom_point(size = 2, alpha = 0.3, aes(color = income)) + geom_smooth(method='lm',se=FALSE,formula=y~x,color = "black", lty = 5, size = 1)
p7
## Warning: Removed 292 rows containing non-finite values (stat_smooth).
## Warning: Removed 292 rows containing missing values (geom_point).
Notice how the “aes” function colors the points by values in the data, rather than setting them to a single color. ggplot2 recognizes that income_group is a categorical variable, and uses its default qualitative color palette.
Now run this code, to see the different effect of setting the aes color mapping for the entire chart, rather than just one geom layer.
ggplot(creditcardApprove2, aes(x = income, y = creditscore, color=income)) +
labs(title = "Credit Score versus Income",
caption = "Source: The UCI machine learning repository on credit card approvals") +
xlab("Income") +
ylab("Credit Score") +
theme_minimal(base_size = 14, base_family = "Georgia") +
geom_point(size = 3, alpha = 0.5) +
geom_smooth(method=lm, se=FALSE, lty = 1, size = 0.1)
## `geom_smooth()` using formula 'y ~ x'
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
cor(creditcardApprove2$income, creditcardApprove2$creditscore)
## [1] 0.08478451
fit1 <- lm(creditscore ~ income, data = creditcardApprove2)
summary(fit1)
##
## Call:
## lm(formula = creditscore ~ income, data = creditcardApprove2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5967 -2.1861 -2.1861 0.7511 20.7347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.186e+00 1.528e-01 14.306 <2e-16 ***
## income 6.411e-05 2.876e-05 2.229 0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.934 on 686 degrees of freedom
## Multiple R-squared: 0.007188, Adjusted R-squared: 0.005741
## F-statistic: 4.967 on 1 and 686 DF, p-value: 0.02616
# the result does not show the good linear regression
Set your working directory to access your files
# load required packages
library(readr)
library(ggplot2)
library(scales)
## Warning: package 'scales' was built under R version 4.1.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(dplyr)
##Make a range of simple charts using the highcharter package
Highcharter is a package within the htmlwidgets framework that connects R to the Highcharts and Highstock JavaScript visualization libraries. For more information, see https://github.com/jbkunst/highcharter/
Also check out this site: https://cran.r-project.org/web/packages/highcharter/vignettes/charting-data-frames.html
Now install and load highcharter, plus RColorBrewer, which will make it possible to use ColorBrewer color palettes.
Also load dplyr and readr for loading and processing data.
# install highcharter, RColorBrewer
#install.packages("highcharter","RColorBrewer")
# load required packages
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(RColorBrewer)
First, prepare the data using dplyr.
# prepare data
x <- creditcardApprove2 %>%
group_by(gender, ethnicity) %>%
summarize(approved = sum(approved, na.rm = TRUE)) %>%
arrange(gender, ethnicity)
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
# basic area chart, default options
highchart () %>%
hc_add_series(data = x,
type = "area",
hcaes(x = gender,
y = approved,
group = ethnicity))
# prepare data
x <- creditcardApprove2 %>%
group_by(income, ethnicity) %>%
summarize(approved = sum(approved, na.rm = TRUE)) %>%
arrange(income,ethnicity)
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
will try tomorrow morning again
# basic area chart, default options
highchart () %>%
hc_add_series(data = x,
type = "area",
hcaes(x = income,
y = approved,
group = ethnicity))
# prepare data
x <- creditcardApprove2 %>%
group_by(income, citizen) %>%
summarize(approved = sum(approved, na.rm = TRUE)) %>%
arrange(income,citizen)
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
will try tomorrow morning again
# basic area chart, default options
highchart () %>%
hc_add_series(data = x,
type = "area",
hcaes(x = income,
y = approved,
group = citizen))