Let's introduce our files:
# Work from the project folder so the relative CSV paths below resolve.
setwd("C:/Users/Avner/MarketMaker")

# Training data.
train <- read.csv("cs-training.csv")
#test<-read.csv("cs-test.csv") # Import Test data

# Sample submission file.
sample_submission <- read.csv("sampleEntry.csv")

# Remove the X column from the training data, then list the remaining columns.
train[["X"]] <- NULL
names(train)
## [1] "SeriousDlqin2yrs"
## [2] "RevolvingUtilizationOfUnsecuredLines"
## [3] "age"
## [4] "NumberOfTime30.59DaysPastDueNotWorse"
## [5] "DebtRatio"
## [6] "MonthlyIncome"
## [7] "NumberOfOpenCreditLinesAndLoans"
## [8] "NumberOfTimes90DaysLate"
## [9] "NumberRealEstateLoansOrLines"
## [10] "NumberOfTime60.89DaysPastDueNotWorse"
## [11] "NumberOfDependents"
# Replace the long original column names with shorter ones (same order as the
# columns listed above), then print them to confirm the rename.
colnames(train) <- c(
  "CredOrNo",
  "DefaultLine",
  "Age",
  "DueNotWorse3059",
  "DebtRatio",
  "MonthlyIncome",
  "NumberOfOpen",
  "Plus90DaysLate",
  "RealEstateLoans",
  "DueNotWorse6089",
  "NumberOfDependts"
)
colnames(train)
## [1] "CredOrNo" "DefaultLine" "Age"
## [4] "DueNotWorse3059" "DebtRatio" "MonthlyIncome"
## [7] "NumberOfOpen" "Plus90DaysLate" "RealEstateLoans"
## [10] "DueNotWorse6089" "NumberOfDependts"
# Number of missing (NA) entries in each column of train.
vapply(train, function(column) sum(is.na(column)), integer(1))
## CredOrNo DefaultLine Age DueNotWorse3059
## 0 0 0 0
## DebtRatio MonthlyIncome NumberOfOpen Plus90DaysLate
## 0 29731 0 0
## RealEstateLoans DueNotWorse6089 NumberOfDependts
## 0 0 3924
#sapply(test,function(x) sum(is.na(x)))
# Number of distinct values in each column of train.
vapply(train, function(column) length(unique(column)), integer(1))
## CredOrNo DefaultLine Age DueNotWorse3059
## 2 125728 86 16
## DebtRatio MonthlyIncome NumberOfOpen Plus90DaysLate
## 114194 13595 58 19
## RealEstateLoans DueNotWorse6089 NumberOfDependts
## 28 13 14
#sapply(test, function(x) length(unique(x)))
# Plot which cells of train are missing versus observed.
missmap(
  train,
  main = "Missing values vs observed from Train"
)
#missmap(test, main = "Missing values vs observed from Test")
# We see that there are missing values in 'NumberOfDependts' and 'MonthlyIncome'.
# Fill the two columns that contain missing values (MonthlyIncome and
# NumberOfDependts) by replacing every NA with 0.
#
# NOTE(review): the original followed these two statements with mean-based
# fills of the same columns. Those could never change anything: once the
# zero-fill has run no NA is left, so the is.na() masks are all FALSE. They
# are removed here as dead code (they also used `T` instead of `TRUE`).
# If mean imputation is what is actually wanted, it has to replace the
# zero-fill rather than follow it, and the models fitted below must be re-run.
train$MonthlyIncome[is.na(train$MonthlyIncome)] <- 0
train$NumberOfDependts[is.na(train$NumberOfDependts)] <- 0
# Linear model of MonthlyIncome on all remaining columns of train.
model <- lm(formula = train$MonthlyIncome ~ ., data = train)

# Linear model of NumberOfDependts on all remaining columns of train.
model2 <- lm(formula = train$NumberOfDependts ~ ., data = train)

# Analysis-of-variance table for the MonthlyIncome model.
anova(object = model)
## Analysis of Variance Table
##
## Response: train$MonthlyIncome
## Df Sum Sq Mean Sq F value Pr(>F)
## CredOrNo 1 4.6047e+09 4.6047e+09 27.5106 1.565e-07 ***
## DefaultLine 1 9.8568e+08 9.8568e+08 5.8889 0.0152379 *
## Age 1 2.5700e+08 2.5700e+08 1.5354 0.2152999
## DueNotWorse3059 1 2.3612e+09 2.3612e+09 14.1066 0.0001728 ***
## DebtRatio 1 1.2948e+11 1.2948e+11 773.5879 < 2.2e-16 ***
## NumberOfOpen 1 3.0392e+11 3.0392e+11 1815.7299 < 2.2e-16 ***
## Plus90DaysLate 1 6.0108e+08 6.0108e+08 3.5911 0.0580909 .
## RealEstateLoans 1 2.3990e+11 2.3990e+11 1433.2377 < 2.2e-16 ***
## DueNotWorse6089 1 2.8706e+06 2.8706e+06 0.0172 0.8958075
## NumberOfDependts 1 1.5902e+11 1.5902e+11 950.0660 < 2.2e-16 ***
## Residuals 149989 2.5105e+13 1.6738e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis-of-variance table for the NumberOfDependts model.
anova(object = model2)
## Analysis of Variance Table
##
## Response: train$NumberOfDependts
## Df Sum Sq Mean Sq F value Pr(>F)
## CredOrNo 1 404 403.8 357.5439 < 2.2e-16 ***
## DefaultLine 1 0 0.3 0.2656 0.6063
## Age 1 8238 8238.0 7294.2640 < 2.2e-16 ***
## DueNotWorse3059 1 81 81.1 71.8108 < 2.2e-16 ***
## DebtRatio 1 283 283.3 250.8507 < 2.2e-16 ***
## MonthlyIncome 1 1751 1750.6 1550.0827 < 2.2e-16 ***
## NumberOfOpen 1 1781 1781.4 1577.3223 < 2.2e-16 ***
## Plus90DaysLate 1 105 105.1 93.1036 < 2.2e-16 ***
## RealEstateLoans 1 1727 1726.5 1528.7411 < 2.2e-16 ***
## DueNotWorse6089 1 57 57.4 50.8094 1.022e-12 ***
## Residuals 149989 169396 1.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the lm diagnostic plots for `model` on a single 2 x 2 page.
# par() returns the previous settings; restore them afterwards so that the
# missingness map below is not squeezed into one cell of the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

# Drop every row that still contains an NA.
# NOTE(review): per the NA counts printed earlier, only MonthlyIncome and
# NumberOfDependts had missing values, and both were zero-filled above, so at
# this point na.omit() removes no rows.
train <- na.omit(train)

# Re-plot missing versus observed cells of train after the clean-up.
missmap(train, main = "Missing values vs observed from Train")
It seems to be the best solution!