# Make echo = TRUE the default knitr chunk option for this document.
knitr::opts_chunk$set(echo = TRUE)

# Read the data file loanapp.csv from GitHub (comma-separated, with a header row).
LoanData <- read.table(
  file = "https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-Repos/main/loanapp.csv",
  header = TRUE,
  sep = ","
)

# Question 1. Use the summary function to gain an overview of the data set.
summary(LoanData)
## X occ loanamt action msa
## Min. : 1 Min. :1.000 Min. : 2.0 Min. :1.000 Min. :1120
## 1st Qu.: 498 1st Qu.:1.000 1st Qu.:100.0 1st Qu.:1.000 1st Qu.:1120
## Median : 995 Median :1.000 Median :126.0 Median :1.000 Median :1120
## Mean : 995 Mean :1.032 Mean :143.2 Mean :1.276 Mean :1120
## 3rd Qu.:1492 3rd Qu.:1.000 3rd Qu.:165.0 3rd Qu.:1.000 3rd Qu.:1120
## Max. :1989 Max. :3.000 Max. :980.0 Max. :3.000 Max. :1120
##
## suffolk appinc typur unit
## Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 48.00 1st Qu.:0.000 1st Qu.:1.000
## Median :0.0000 Median : 64.00 Median :0.000 Median :1.000
## Mean :0.1543 Mean : 84.68 Mean :1.531 Mean :1.122
## 3rd Qu.:0.0000 3rd Qu.: 88.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1.0000 Max. :972.00 Max. :9.000 Max. :4.000
## NA's :4
## married dep emp yjob
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.000
## Mean :0.6586 Mean :0.7709 Mean :0.2097 Mean :0.449
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :8.0000 Max. :9.0000 Max. :9.000
## NA's :3 NA's :3
## self atotinc cototinc hexp
## Min. :0.0000 Min. : 0 Min. : 0 Min. : 154
## 1st Qu.:0.0000 1st Qu.: 2876 1st Qu.: 0 1st Qu.: 1054
## Median :0.0000 Median : 3813 Median : 1145 Median : 1317
## Mean :0.1292 Mean : 5196 Mean : 1547 Mean : 1505
## 3rd Qu.:0.0000 3rd Qu.: 5596 3rd Qu.: 2417 3rd Qu.: 1715
## Max. :1.0000 Max. :81000 Max. :41667 Max. :10798
##
## price other liq rep
## Min. : 25.0 Min. : 0.00 Min. : 0 Min. :0.000
## 1st Qu.: 129.0 1st Qu.: 0.00 1st Qu.: 20 1st Qu.:1.000
## Median : 163.0 Median : 0.00 Median : 38 Median :1.000
## Mean : 196.3 Mean : 2.37 Mean : 4618 Mean :1.503
## 3rd Qu.: 225.0 3rd Qu.: 0.00 3rd Qu.: 83 3rd Qu.:2.000
## Max. :1535.0 Max. :1020.00 Max. :1000000 Max. :9.000
## NA's :9
## gdlin lines mortg cons
## Min. : 0.000 Min. : 0.0 Min. :1.000 Min. :1.00
## 1st Qu.: 1.000 1st Qu.: 7.0 1st Qu.:1.000 1st Qu.:1.00
## Median : 1.000 Median : 12.0 Median :2.000 Median :1.00
## Mean : 1.583 Mean : 516.4 Mean :1.708 Mean :2.11
## 3rd Qu.: 1.000 3rd Qu.: 19.0 3rd Qu.:2.000 3rd Qu.:2.00
## Max. :666.000 Max. :999999.4 Max. :4.000 Max. :6.00
##
## pubrec hrat obrat fixadj
## Min. :0.00000 Min. : 1.00 Min. : 0.00 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:21.00 1st Qu.:28.00 1st Qu.:0.0000
## Median :0.00000 Median :25.77 Median :33.00 Median :0.0000
## Mean :0.06888 Mean :24.79 Mean :32.39 Mean :0.3082
## 3rd Qu.:0.00000 3rd Qu.:29.00 3rd Qu.:37.00 3rd Qu.:1.0000
## Max. :1.00000 Max. :72.00 Max. :95.00 Max. :1.0000
##
## term apr prop inss
## Min. : 6 Min. : 25.0 Min. :1.000 Min. :0.0000
## 1st Qu.: 360 1st Qu.: 135.0 1st Qu.:2.000 1st Qu.:0.0000
## Median : 360 Median : 169.0 Median :2.000 Median :0.0000
## Mean : 2352 Mean : 205.1 Mean :1.861 Mean :0.2001
## 3rd Qu.: 360 3rd Qu.: 230.0 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :999999 Max. :4316.0 Max. :3.000 Max. :1.0000
##
## inson gift cosign unver
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.01508 Mean :0.1599 Mean :0.02866 Mean :0.04274
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.00000
##
## review netw unem min30
## Min. : 0.0 Min. :-7919.0 Min. : 1.800 Min. :0.00000
## 1st Qu.: 1.0 1st Qu.: 43.0 1st Qu.: 3.100 1st Qu.:0.00000
## Median : 2.0 Median : 95.0 Median : 3.200 Median :0.00000
## Mean :113.7 Mean : 266.6 Mean : 3.882 Mean :0.05703
## 3rd Qu.: 3.0 3rd Qu.: 229.6 3rd Qu.: 3.900 3rd Qu.:0.00000
## Max. :999.0 Max. :28023.0 Max. :10.600 Max. :1.00000
## NA's :183
## bd mi old vr
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.4208 Mean :0.8728 Mean :0.4676 Mean :0.4098
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## sch black hispan male
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:1.0000
## Median :1.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.7717 Mean :0.09904 Mean :0.05581 Mean :0.8131
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## NA's :15
## reject approve mortno mortperf
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.1227 Mean :0.8773 Mean :0.3318 Mean :0.6385
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## mortlat1 mortlat2 chist multi
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :1.0000 Median :0.00000
## Mean :0.01911 Mean :0.01056 Mean :0.8376 Mean :0.08615
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## NA's :4
## loanprc thick white
## Min. :0.02105 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.70000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.80000 Median :0.0000 Median :1.0000
## Mean :0.77064 Mean :0.1051 Mean :0.8451
## 3rd Qu.:0.89894 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :2.57143 Max. :1.0000 Max. :1.0000
## NA's :9
# Question 1. Display the mean and median for at least two attributes.
# vapply() enforces exactly one numeric value per column, and na.rm = TRUE
# keeps a single missing value from turning a whole statistic into NA, which
# matches how summary() reports its means and medians.
MeanLoan <- vapply(
  LoanData[, c("loanamt", "netw", "loanprc")],
  mean,
  numeric(1),
  na.rm = TRUE
)
MedianLoan <- vapply(
  LoanData[, c("loanamt", "netw", "loanprc")],
  median,
  numeric(1),
  na.rm = TRUE
)
# One row per statistic, one column per attribute; the row labels are taken
# from the names of the objects passed to rbind().
MeanMedianLoanDF <- data.frame(rbind(MeanLoan, MedianLoan))
print(MeanMedianLoanDF)
## loanamt netw loanprc
## MeanLoan 143.2453 266.5691 0.7706397
## MedianLoan 126.0000 95.0000 0.8000000
# Question 2. Create a new data frame with a subset of the columns and rows. Make sure to rename it.
# Keep the rows where loanamt > 300 and term > 240, restricted to six columns.
LoanData_sub <- subset(
  LoanData,
  loanamt > 300 & term > 240,
  select = c(married, loanamt, price, liq, netw, loanprc)
)
# head() returns at most the first 10 rows. Indexing with 1:10 instead would
# pad the result with all-NA rows whenever fewer than 10 rows pass the filter.
LoanData_sub <- head(LoanData_sub, 10)
head(LoanData_sub)
## married loanamt price liq netw loanprc
## 19 1 349 387 57.0 598.0 0.9018088
## 45 1 315 355 42.9 66.7 0.8873239
## 47 1 632 790 183.0 1427.0 0.8000000
## 61 1 310 457 154.0 193.0 0.6783370
## 89 1 732 975 1213.0 1550.0 0.7507693
## 99 0 320 630 421.2 1051.2 0.5079365
# Question 3. Create new column names for the new data frame.
# The labels are assigned by position, so their order has to match the order
# of the columns already in LoanData_sub.
names(LoanData_sub) <- c(
  "Marital Status", "Loan Amount", "Loan Price",
  "Liquid", "Networth", "Loan PRC"
)
head(LoanData_sub)
## Marital Status Loan Amount Loan Price Liquid Networth Loan PRC
## 19 1 349 387 57.0 598.0 0.9018088
## 45 1 315 355 42.9 66.7 0.8873239
## 47 1 632 790 183.0 1427.0 0.8000000
## 61 1 310 457 154.0 193.0 0.6783370
## 89 1 732 975 1213.0 1550.0 0.7507693
## 99 0 320 630 421.2 1051.2 0.5079365
# Question 4. Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.
# Per-column overview of the renamed subset; the mean/median table and the
# comparison against the full data are produced by the statements that follow.
summary(LoanData_sub)
## Marital Status Loan Amount Loan Price Liquid
## Min. :0.0 Min. :304.0 Min. :355.0 Min. : 42.9
## 1st Qu.:1.0 1st Qu.:312.8 1st Qu.:387.8 1st Qu.: 75.0
## Median :1.0 Median :334.5 Median :538.5 Median : 168.5
## Mean :0.9 Mean :417.4 Mean :563.4 Mean : 324.9
## 3rd Qu.:1.0 3rd Qu.:450.0 3rd Qu.:645.0 3rd Qu.: 407.1
## Max. :1.0 Max. :732.0 Max. :975.0 Max. :1213.0
## Networth Loan PRC
## Min. : 66.7 Min. :0.5079
## 1st Qu.: 366.2 1st Qu.:0.7007
## Median : 746.7 Median :0.7754
## Mean : 754.1 Mean :0.7544
## 3rd Qu.:1006.7 3rd Qu.:0.8000
## Max. :1550.0 Max. :0.9018
# Mean and median of the same three attributes, now computed on the subset.
# vapply() enforces exactly one numeric value per column, and na.rm = TRUE
# keeps a single missing value from turning a whole statistic into NA.
MeanLoan <- vapply(
  LoanData_sub[, c("Loan Amount", "Networth", "Loan PRC")],
  mean,
  numeric(1),
  na.rm = TRUE
)
MedianLoan <- vapply(
  LoanData_sub[, c("Loan Amount", "Networth", "Loan PRC")],
  median,
  numeric(1),
  na.rm = TRUE
)
# data.frame() converts the column labels to syntactic names, so
# "Loan Amount" is printed as "Loan.Amount".
MeanMedianLoanSubDF <- data.frame(rbind(MeanLoan, MedianLoan))
print(MeanMedianLoanSubDF)
## Loan.Amount Networth Loan.PRC
## MeanLoan 417.4 754.131 0.7544290
## MedianLoan 334.5 746.655 0.7753846
# Difference between the two summary tables (full data minus subset). The
# frames are subtracted cell by cell by position, and the result carries the
# column names of the left-hand frame.
compare_mn_mdn <- MeanMedianLoanDF - MeanMedianLoanSubDF
# Tag every column with a "_Diff" suffix so the table reads as a comparison.
compare_mn_mdn <- setNames(
  compare_mn_mdn,
  paste0(names(compare_mn_mdn), "_Diff")
)
print(compare_mn_mdn)
## loanamt_Diff netw_Diff loanprc_Diff
## MeanLoan -274.1547 -487.5619 0.01621070
## MedianLoan -208.5000 -651.6550 0.02461538
# Question 5. For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.
# Recode the marital-status codes as labels:
#   0 -> "Single", 1 -> "Married", missing -> "Divorced".
# Missing values are detected with is.na(): comparing against the string "NA"
# never matches a real NA, so that recode would silently do nothing.
marital_status <- as.character(LoanData_sub$`Marital Status`)
# Handle the missing values first so the equality tests below never see an NA.
marital_status[is.na(marital_status)] <- "Divorced"
marital_status[marital_status == "0"] <- "Single"
marital_status[marital_status == "1"] <- "Married"
LoanData_sub$`Marital Status` <- marital_status
head(LoanData_sub)
## Marital Status Loan Amount Loan Price Liquid Networth Loan PRC
## 19 Married 349 387 57.0 598.0 0.9018088
## 45 Married 315 355 42.9 66.7 0.8873239
## 47 Married 632 790 183.0 1427.0 0.8000000
## 61 Married 310 457 154.0 193.0 0.6783370
## 89 Married 732 975 1213.0 1550.0 0.7507693
## 99 Single 320 630 421.2 1051.2 0.5079365
# Question 6. Display enough rows to see examples of all of steps 1-5 above.
# Show up to the first ten rows of the subset.
head(LoanData_sub, n = 10L)
## Marital Status Loan Amount Loan Price Liquid Networth Loan PRC
## 19 Married 349 387 57.0 598.00 0.9018088
## 45 Married 315 355 42.9 66.70 0.8873239
## 47 Married 632 790 183.0 1427.00 0.8000000
## 61 Married 310 457 154.0 193.00 0.6783370
## 89 Married 732 975 1213.0 1550.00 0.7507693
## 99 Single 320 630 421.2 1051.20 0.5079365
## 117 Married 450 650 365.0 873.10 0.6923077
## 203 Married 450 620 60.0 641.00 0.7258065
## 217 Married 304 380 632.6 852.31 0.8000000
## 232 Married 312 390 120.0 289.00 0.8000000
# Question 7. BONUS – place the original .csv in a github file and have R read from the link.
# This will be a very useful skill as you progress in your data science education and career.
LoanDatGit <- "https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-Repos/main/loanapp.csv"
# read.csv() already treats the first line as the header, so no extra
# arguments are needed.
ReadGithub <- read.csv(file = LoanDatGit)
# Preview the first six rows (the head() default, written out explicitly).
head(ReadGithub, n = 6L)
## X occ loanamt action msa suffolk appinc typur unit married dep emp yjob self
## 1 1 1 89 1 1120 0 72 0 1 0 0 0 0 0
## 2 2 1 128 3 1120 0 74 0 1 1 1 0 0 0
## 3 3 1 128 1 1120 0 84 3 1 0 0 1 1 0
## 4 4 1 66 1 1120 0 36 0 1 1 0 0 0 1
## 5 5 1 120 1 1120 0 59 8 1 1 0 0 0 0
## 6 6 1 111 1 1120 0 63 9 1 0 0 0 0 0
## atotinc cototinc hexp price other liq rep gdlin lines mortg cons pubrec
## 1 5849 0 1031 118 0 34.5 1 1 15 2 1 0
## 2 4583 1508 1391 160 0 52.0 3 1 19 2 2 0
## 3 2666 4416 1371 143 0 37.0 6 1 18 2 2 0
## 4 3000 0 839 110 0 19.0 1 1 25 2 6 1
## 5 2583 2358 1341 134 0 31.0 1 1 15 2 1 0
## 6 2208 2959 1122 138 0 169.0 2 1 10 2 6 0
## hrat obrat fixadj term apr prop inss inson gift cosign unver review netw
## 1 17.63 34.5 0 360 118 1 0 0 0 0 0 1 99.6
## 2 22.54 34.1 1 360 175 2 0 0 0 0 0 999 847.0
## 3 19.00 26.0 0 180 145 2 0 0 0 0 0 3 40.0
## 4 24.00 37.0 0 360 110 2 0 0 1 0 0 2 158.0
## 5 25.10 32.1 0 360 135 1 1 0 0 0 0 2 69.0
## 6 21.00 33.0 0 360 144 2 0 0 0 0 0 1 262.0
## unem min30 bd mi old vr sch black hispan male reject approve mortno mortperf
## 1 3.2 0 0 1 0 1 1 0 0 NA 0 1 0 1
## 2 3.2 0 0 1 0 1 1 0 0 1 1 0 0 1
## 3 3.9 0 1 1 0 0 1 0 0 1 0 1 0 1
## 4 3.1 0 0 1 1 1 1 0 0 1 0 1 0 1
## 5 4.3 0 1 1 0 0 0 0 0 1 0 1 0 1
## 6 3.2 0 1 1 0 0 0 0 0 1 0 1 0 1
## mortlat1 mortlat2 chist multi loanprc thick white
## 1 0 0 1 0 0.7542373 0 1
## 2 0 0 1 0 0.8000000 1 1
## 3 0 0 1 0 0.8951049 1 1
## 4 0 0 0 0 0.6000000 0 1
## 5 0 0 1 0 0.8955224 0 1
## 6 0 0 0 0 0.8043478 0 1