# Load the XLConnect library to read the Excel (.xlsx) data

# Raise the JVM heap limit. This has to be set before XLConnect is attached,
# because the JVM options are fixed once the JVM has been started.
options(java.parameters = "-Xmx4g")

library(XLConnect)
library(xtable)
library(caret)

# Open the workbook and read the training sheet (first row holds the headers).
census <- loadWorkbook("Logistic Regression Test.xlsx")
census_data <- readWorksheet(census, sheet = "Train Data", header = TRUE)

# Explore the structure of the training data, then summarize it

# Column types, count of complete vs. incomplete rows, and a preview of the
# first rows of the training data.
str(census_data)
table(complete.cases(census_data))
head(census_data)

# Convert character columns to factors

# Names of the categorical columns (including the response, Income) that are
# converted to factors.
cols <- c(
  "Workclass", "Education", "Marital.Status", "Occupation", "Relationship",
  "Race", "Sex", "Native.Country", "Income"
)
census_data[cols] <- lapply(census_data[cols], as.factor)
summary(census_data)

# Read the test data and convert character columns to factors

# Read the test sheet from the already-opened workbook and inspect it as read.
census_test_data <- readWorksheet(census, sheet = "Test Data", header = TRUE)
str(census_test_data)
summary(census_test_data)

# Apply the same factor conversion as for the training data.
# NOTE(review): factor levels are derived from the test sheet alone; predict()
# on a glm fails if a factor in the new data has a level the model never saw.
census_test_data[cols] <- lapply(census_test_data[cols], as.factor)
census_test_data$Age <- as.numeric(census_test_data$Age)

# Re-inspect after the type conversions.
str(census_test_data)
summary(census_test_data)
summary(census_test_data$Workclass)

# Check the distribution of income levels

# Count the training rows in each Income class.
table(census_data[["Income"]])

# The Fnlwgt variable is removed from the training and test data sets due to its diminished impact on income level.

# Drop the Fnlwgt column from both data sets so it is not used as a predictor.
census_data$Fnlwgt <- NULL
census_test_data$Fnlwgt <- NULL

library(ggplot2)

summary(census_data$Age)

# Explore the numerical variables and their impact on the dependent variable

# Boxplot of age by income group

# Base-graphics boxplot: one box of Age per Income level.
boxplot(
  Age ~ Income,
  data = census_data,
  main = "Age distribution at different income levels",
  xlab = "Income Levels",
  ylab = "Age",
  col = "orange"
)

# Histogram of age by income group

# Histogram of Age in one-year bins, with bars filled by Income level.
ggplot(census_data) +
  aes(x = as.numeric(Age), group = Income, fill = Income) +
  geom_histogram(binwidth = 1, color = "black")

# NOTE(review): the column is written here as `Education...Num` (three ASCII
# dots). The previous text contained a single ellipsis character, which is not
# valid in an R name; confirm the exact name with names(census_data).
summary(census_data$Education...Num)

# One box of years of education per Income level.
boxplot(
  Education...Num ~ Income,
  data = census_data,
  main = "Years of Education for different income levels",
  xlab = "Income Levels",
  ylab = "Years of Education",
  col = "blue"
)

summary(census_data$Capital.Gain)

# Histogram of Capital.Loss in 10 bins, with bars filled by Income level.
ggplot(census_data) +
  aes(x = as.numeric(Capital.Loss), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Loss")

# Histogram of Capital.Gain in 10 bins. The title previously read
# "Capital Loss" although the plotted variable is Capital.Gain.
ggplot(census_data) +
  aes(x = as.numeric(Capital.Gain), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Gain")

# Proportion of observations with no capital gain

# Share of training rows whose Capital.Gain is exactly zero.
with(census_data, sum(Capital.Gain == 0) / length(Capital.Gain))

# Proportion of observations with no capital loss

# Share of training rows whose Capital.Loss is exactly zero.
with(census_data, sum(Capital.Loss == 0) / length(Capital.Loss))

summary(census_data$Hours.per.Week)

# One box of weekly working hours per Income level.
boxplot(
  Hours.per.Week ~ Income,
  data = census_data,
  main = "Hours Per week Vs Income Level",
  xlab = "Income Levels",
  ylab = "Hours Per Week",
  col = "blue"
)

# Pairwise correlations between the numeric columns.
# NOTE(review): `Education...Num` is written with three ASCII dots; the
# previous text had a single ellipsis character. Confirm the column name.
numeric_cols <- c(
  "Age", "Education...Num", "Capital.Gain", "Capital.Loss", "Hours.per.Week"
)
corMat <- cor(census_data[, numeric_cols])
corMat

# Now check the categorical variables and their impact on the dependent variable, income

# Counts per Sex, then the Sex x Income cross-tabulation.
table(census_data$Sex)
table(census_data[, c("Sex", "Income")])

# Bar charts of Income counts, one facet per level of each categorical
# variable. Written with ggplot() + geom_bar() because qplot() is deprecated
# in current ggplot2; for a discrete x with no y, qplot() drew the same bars.
ggplot(census_data, aes(x = Income, fill = Workclass)) +
  geom_bar() +
  facet_grid(. ~ Workclass)

ggplot(census_data, aes(x = Income, fill = Occupation)) +
  geom_bar() +
  facet_grid(. ~ Occupation)

ggplot(census_data, aes(x = Income, fill = Marital.Status)) +
  geom_bar() +
  facet_grid(. ~ Marital.Status)

ggplot(census_data, aes(x = Income, fill = Relationship)) +
  geom_bar() +
  facet_grid(. ~ Relationship)

ggplot(census_data, aes(x = Income, fill = Education)) +
  geom_bar() +
  facet_grid(. ~ Education)

summary(census_data)

# Create the model using logistic regression

# Logistic regression of Income on every other column of the training data.
model1 <- glm(Income ~ ., data = census_data, family = binomial)
summary(model1)

# Predict on the test data using the model

# Class counts in the test data, for reference against the predictions.
table(census_test_data$Income)

# type = "response" returns predicted probabilities rather than log-odds.
PredictModel <- predict(model1, newdata = census_test_data, type = "response")

# Disabled alternative that would turn the probabilities into class labels:
# pred_m <- rep('<=50K', length(PredictModel))
# pred_m[PredictModel >= .3] <- '>50K'

# Confusion matrix

# Confusion matrix at a 0.3 probability cut-off: rows are the actual Income
# classes, columns are whether the predicted probability reached the cut-off.
# Fixing the factor levels keeps both columns (FALSE, TRUE) present even when
# every prediction falls on one side, so the [i, j] indexing below stays valid.
predicted_positive <- factor(PredictModel >= 0.3, levels = c(FALSE, TRUE))
confusionmatrix_LR <- table(
  census_test_data$Income,
  predicted_positive,
  dnn = c("Actual", "Predicted")
)
confusionmatrix_LR

# Share of all tabulated cases on the diagonal.
Accuracy <- (confusionmatrix_LR[1, 1] + confusionmatrix_LR[2, 2]) /
  sum(confusionmatrix_LR)
Accuracy

# Row 2 is treated as the positive class.
# NOTE(review): assumes the second Income level is the high-income class;
# confirm with levels(census_test_data$Income).
tpr <- confusionmatrix_LR[2, 2] /
  (confusionmatrix_LR[2, 1] + confusionmatrix_LR[2, 2])
tpr

fpr <- confusionmatrix_LR[1, 2] /
  (confusionmatrix_LR[1, 1] + confusionmatrix_LR[1, 2])
fpr

# ROC curve and area under it, computed from the raw probabilities.
library(ROCR)
pred1 <- prediction(PredictModel, census_test_data$Income)
perf <- performance(pred1, "tpr", "fpr")
plot(perf)

as.numeric(performance(pred1, "auc")@y.values)