# Census income analysis: load the training sheet and prepare it for modelling.

# Request a 4 GB heap for the JVM that XLConnect runs on. The option is set
# ahead of the library() calls, as XLConnect's documentation recommends.
options(java.parameters = "-Xmx4g")

library(XLConnect)
library(xtable)
library(caret)

# The workbook handle is reused later to read the "Test Data" sheet.
census <- loadWorkbook("Logistic Regression Test.xlsx")
census_data <- readWorksheet(census, sheet = "Train Data", header = TRUE)

# First look at the raw training data: structure, number of complete rows,
# and the first few records.
str(census_data)
table(complete.cases(census_data))
head(census_data)

# Columns converted to factors (includes the response, Income).
cols <- c(
  "Workclass", "Education", "Marital.Status", "Occupation", "Relationship",
  "Race", "Sex", "Native.Country", "Income"
)
census_data[cols] <- lapply(census_data[cols], as.factor)
summary(census_data)
# Read the "Test Data" sheet from the already-open workbook and inspect it.
census_test_data <- readWorksheet(census, sheet = "Test Data", header = TRUE)
str(census_test_data)
summary(census_test_data)

# Apply the same factor conversion used for the training data.
# NOTE(review): factor levels are derived independently per data set; if the
# test sheet contains a level absent from training, predict() will fail --
# confirm the two sheets share the same category values.
census_test_data[cols] <- lapply(census_test_data[cols], as.factor)
census_test_data$Age <- as.numeric(census_test_data$Age)
str(census_test_data)
summary(census_test_data)
summary(census_test_data$Workclass)

# Distribution of the response in the training data.
table(census_data$Income)

# Remove Fnlwgt from both data sets so that a formula of the form
# `Income ~ .` does not pick it up as a predictor.
census_data$Fnlwgt <- NULL
census_test_data$Fnlwgt <- NULL
# Exploratory analysis of the numeric predictors against Income.
library(ggplot2)

# Age ----
summary(census_data$Age)

boxplot(
  Age ~ Income,
  data = census_data,
  main = "Age distribution at different income levels",
  xlab = "Income Levels",
  ylab = "Age",
  col = "orange"
)

ggplot(census_data) +
  aes(x = as.numeric(Age), group = Income, fill = Income) +
  geom_histogram(binwidth = 1, color = "black")

# Years of education ----
# NOTE(review): the column name is reconstructed as `Education...Num` (the
# source showed a typographic ellipsis); confirm against names(census_data).
summary(census_data$Education...Num)

boxplot(
  Education...Num ~ Income,
  data = census_data,
  main = "Years of Education for different income levels",
  xlab = "Income Levels",
  ylab = "Years of Education",
  col = "blue"
)

# Capital gain / loss ----
summary(census_data$Capital.Gain)

ggplot(census_data) +
  aes(x = as.numeric(Capital.Loss), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Loss")

# The title previously read "Capital Loss" although Capital.Gain is plotted.
ggplot(census_data) +
  aes(x = as.numeric(Capital.Gain), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Gain")

# Share of records with a zero capital gain and a zero capital loss.
mean(census_data$Capital.Gain == 0)
mean(census_data$Capital.Loss == 0)

# Hours worked per week ----
summary(census_data$Hours.per.Week)

boxplot(
  Hours.per.Week ~ Income,
  data = census_data,
  main = "Hours Per week Vs Income Level",
  xlab = "Income Levels",
  ylab = "Hours Per Week",
  col = "blue"
)

# Pairwise correlations between the numeric predictors.
corMat <- cor(
  census_data[, c(
    "Age", "Education...Num", "Capital.Gain", "Capital.Loss", "Hours.per.Week"
  )]
)
corMat
# Exploratory analysis of the categorical predictors against Income.

# Counts by sex, then the sex-by-income cross-tabulation.
table(census_data$Sex)
table(census_data[, c("Sex", "Income")])

# Income counts broken out per category, one facet per level. Written with
# ggplot() + geom_bar() because qplot() is deprecated in current ggplot2;
# for a factor x with no y, qplot() drew the same bar chart.
ggplot(census_data, aes(x = Income, fill = Workclass)) +
  geom_bar() +
  facet_grid(. ~ Workclass)

ggplot(census_data, aes(x = Income, fill = Occupation)) +
  geom_bar() +
  facet_grid(. ~ Occupation)

ggplot(census_data, aes(x = Income, fill = Marital.Status)) +
  geom_bar() +
  facet_grid(. ~ Marital.Status)

ggplot(census_data, aes(x = Income, fill = Relationship)) +
  geom_bar() +
  facet_grid(. ~ Relationship)

ggplot(census_data, aes(x = Income, fill = Education)) +
  geom_bar() +
  facet_grid(. ~ Education)

summary(census_data)
# Fit a logistic regression of Income on every remaining column of the
# training data, then evaluate it on the test data.
model1 <- glm(Income ~ ., data = census_data, family = binomial)
summary(model1)

table(census_test_data$Income)

# Predicted probabilities (type = "response") for the test records.
PredictModel <- predict(model1, newdata = census_test_data, type = "response")
# pred_m <- rep('<=50K', length(PredictModel))
# pred_m[PredictModel >= .3] <- '>50K'

# Probability cut-off used to turn predicted probabilities into classes.
classification_threshold <- 0.3

# Rows are the actual Income levels, columns the thresholded prediction.
# Fixing the column levels to FALSE/TRUE keeps the table 2x2 even when every
# prediction falls on one side of the cut-off, so the [i, j] indexing below
# cannot go out of bounds.
confusionmatrix_LR <- table(
  census_test_data$Income,
  factor(PredictModel >= classification_threshold, levels = c(FALSE, TRUE)),
  dnn = c("Actual", "Predicted")
)
confusionmatrix_LR

# Row 2 / column 2 are treated as the positive class.
# NOTE(review): this assumes the second level of Income is the high-income
# class -- confirm with levels(census_test_data$Income).
Accuracy <- sum(diag(confusionmatrix_LR)) / sum(confusionmatrix_LR)
Accuracy

tpr <- confusionmatrix_LR[2, 2] / sum(confusionmatrix_LR[2, ])
tpr

fpr <- confusionmatrix_LR[1, 2] / sum(confusionmatrix_LR[1, ])
fpr

# ROC curve and area under it, computed from the raw probabilities.
library(ROCR)
pred1 <- prediction(PredictModel, census_test_data$Income)
perf <- performance(pred1, "tpr", "fpr")
plot(perf)

as.numeric(performance(pred1, "auc")@y.values)