Predictive Analytics ALY6020 - Assignment 1

This assignment focuses on applied predictive modeling techniques: linear regression, logistic regression, gradient descent, and generalized linear modeling via maximum likelihood estimation.

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  1. R programming: user-defined functions
  1. Create a user-defined function in R that takes two arguments and raises arg1 to the arg2 power.
exponentiate_function <- function(a, b) {
  result <- a^b  #raise a to the power b
  return(result)
}

exp_result <- exponentiate_function(2,2)
exp_result
## [1] 4
  2. Applied Predictive Modeling: Linear regression review
  1. Load the data http://data.princeton.edu/wws509/datasets/salary.dat into R
  2. Split data into train/test based on a 80/20 split (use seed: 123).
  3. Fit model with train data to predict salary based on sex, rank, year, degree, and years since degree was awarded.
  4. Interpret model results
  5. Does model meet linear regression assumptions?
  6. Use trained linear regression model to make predictions with test data
  7. Assess prediction error/accuracy with appropriate metrics comparing predicted vs. observed values. Interpret accordingly.
#Reading the data
salary_data <- read.table("http://data.princeton.edu/wws509/datasets/salary.dat", header=TRUE)
salary_data
##        sx        rk yr        dg yd    sl
## 1    male      full 25 doctorate 35 36350
## 2    male      full 13 doctorate 22 35350
## 3    male      full 10 doctorate 23 28200
## 4  female      full  7 doctorate 27 26775
## 5    male      full 19   masters 30 33696
## 6    male      full 16 doctorate 21 28516
## 7  female      full  0   masters 32 24900
## 8    male      full 16 doctorate 18 31909
## 9    male      full 13   masters 30 31850
## 10   male      full 13   masters 31 32850
## 11   male      full 12 doctorate 22 27025
## 12   male associate 15 doctorate 19 24750
## 13   male      full  9 doctorate 17 28200
## 14   male associate  9   masters 27 23712
## 15   male      full  9 doctorate 24 25748
## 16   male      full  7 doctorate 15 29342
## 17   male      full 13 doctorate 20 31114
## 18   male associate 11   masters 14 24742
## 19   male associate 10   masters 15 22906
## 20   male      full  6   masters 21 24450
## 21   male assistant 16   masters 23 19175
## 22   male associate  8   masters 31 20525
## 23   male      full  7 doctorate 13 27959
## 24 female      full  8 doctorate 24 38045
## 25   male associate  9 doctorate 12 24832
## 26   male      full  5 doctorate 18 25400
## 27   male associate 11 doctorate 14 24800
## 28 female      full  5 doctorate 16 25500
## 29   male associate  3   masters  7 26182
## 30   male associate  3   masters 17 23725
## 31 female assistant 10   masters 15 21600
## 32   male associate 11   masters 31 23300
## 33   male assistant  9   masters 14 23713
## 34 female associate  4   masters 33 20690
## 35 female associate  6   masters 29 22450
## 36   male associate  1 doctorate  9 20850
## 37 female assistant  8 doctorate 14 18304
## 38   male assistant  4 doctorate  4 17095
## 39   male assistant  4 doctorate  5 16700
## 40   male assistant  4 doctorate  4 17600
## 41   male assistant  3 doctorate  4 18075
## 42   male assistant  3   masters 11 18000
## 43   male associate  0 doctorate  7 20999
## 44 female assistant  3 doctorate  3 17250
## 45   male assistant  2 doctorate  3 16500
## 46   male assistant  2 doctorate  1 16094
## 47 female assistant  2 doctorate  6 16150
## 48 female assistant  2 doctorate  2 15350
## 49   male assistant  1 doctorate  1 16244
## 50 female assistant  1 doctorate  1 16686
## 51 female assistant  1 doctorate  1 15000
## 52 female assistant  0 doctorate  2 20300
class(salary_data)
## [1] "data.frame"
sum(is.na(salary_data))
## [1] 0
#Coding the Sex, Rank, and Highest Degree columns as factors
salary_data$sx <- as.factor(salary_data$sx)
#levels(salary_data$sx) <- c("1", "0")
salary_data$rk <- as.factor(salary_data$rk)
#levels(salary_data$rk) <- c("1", "2","3")
salary_data$dg <- as.factor(salary_data$dg)
#levels(salary_data$dg) <- c("1", "0")
head(salary_data)
##       sx   rk yr        dg yd    sl
## 1   male full 25 doctorate 35 36350
## 2   male full 13 doctorate 22 35350
## 3   male full 10 doctorate 23 28200
## 4 female full  7 doctorate 27 26775
## 5   male full 19   masters 30 33696
## 6   male full 16 doctorate 21 28516
#Splitting the data into training and testing data
set.seed(123) 
row.number <- sample(x=1:nrow(salary_data), size=0.80*nrow(salary_data))
train = salary_data[row.number,]
test = salary_data[-row.number,]

#Fitting the linear model
lm_fit <- lm(sl~., data=train)
lm_fit
## 
## Call:
## lm(formula = sl ~ ., data = train)
## 
## Coefficients:
## (Intercept)       sxmale  rkassociate       rkfull           yr  
##     16601.7       -532.9       5445.7      10908.2        528.1  
##   dgmasters           yd  
##      1646.3       -165.5
#Predicting values with test data
predicted_sl <- predict(lm_fit, newdata = test)
observed_sl <- test$sl
TestData <- cbind(test, predicted_sl)

#Measuring prediction accuracy: 100 minus the absolute percentage error for each observation
TestData <- mutate(TestData, Accuracy = (1-(abs(sl-predicted_sl)/sl))*100)
TestData
##        sx        rk yr        dg yd    sl predicted_sl Accuracy
## 1    male      full 16 doctorate 21 28516     31952.80 87.94781
## 2    male      full 16 doctorate 18 31909     32449.17 98.30717
## 3    male      full 13   masters 31 32850     30360.10 92.42039
## 4    male      full 12 doctorate 22 27025     29674.78 90.19510
## 5    male      full 13 doctorate 20 31114     30533.83 98.13533
## 6    male associate 11   masters 14 24742     26654.02 92.27216
## 7  female      full  8 doctorate 24 38045     27764.17 72.97718
## 8    male associate 11 doctorate 14 24800     25007.76 99.16228
## 9  female assistant  8 doctorate 14 18304     18510.50 98.87182
## 10 female assistant  2 doctorate  6 16150     16665.28 96.80943
## 11 female assistant  1 doctorate  1 16686     16964.41 98.33150
avg_accuracy <- mean(TestData$Accuracy)
avg_accuracy
## [1] 93.22092
#Prediction accuracy measure: correlation between observed and predicted salaries
correlation_accuracy <- cor(TestData$sl,TestData$predicted_sl)
correlation_accuracy
## [1] 0.8568947
#Model Summary
summary(lm_fit)
## 
## Call:
## lm(formula = sl ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3867.6  -865.7     1.6   642.6  5147.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16601.72     799.35  20.769  < 2e-16 ***
## sxmale       -532.87     861.04  -0.619   0.5401    
## rkassociate  5445.70    1067.90   5.099 1.28e-05 ***
## rkfull      10908.21    1250.47   8.723 3.41e-10 ***
## yr            528.14      90.51   5.835 1.41e-06 ***
## dgmasters    1646.27     940.50   1.750   0.0891 .  
## yd           -165.45      72.64  -2.278   0.0292 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2002 on 34 degrees of freedom
## Multiple R-squared:  0.8853, Adjusted R-squared:  0.8651 
## F-statistic: 43.76 on 6 and 34 DF,  p-value: 1.391e-14
#Calculating Error Metrics

library("DMwR")
## Loading required package: lattice
## Loading required package: grid
DMwR::regr.eval(TestData$sl,TestData$predicted_sl)
##          mae          mse         rmse         mape 
## 2.099783e+03 1.231283e+07 3.508964e+03 6.779077e-02
SSE <- sum((observed_sl - predicted_sl) ^ 2)
SST <- sum((observed_sl - mean(observed_sl)) ^ 2)
r2 <- 1 - SSE/SST
r2
## [1] 0.7317572
rmse <- sqrt(sum((predicted_sl - observed_sl)^2)/length(observed_sl))/1000  #RMSE in thousands of dollars
rmse
## [1] 3.508964
mae <- mean(abs(observed_sl - predicted_sl))/1000  #MAE in thousands of dollars
mae
## [1] 2.099783
rss <- sum((predicted_sl - observed_sl)^2)
rss
## [1] 135441115
#Checking basic assumptions of Linear Regression
res <- lm_fit$residuals
shapiro.test(res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.9661, p-value = 0.2555
library("ggfortify")
## Warning: package 'ggfortify' was built under R version 3.5.2
## Loading required package: ggplot2
## 
## Attaching package: 'ggfortify'
## The following object is masked from 'package:DMwR':
## 
##     unscale
autoplot(lm_fit, label.size = 5)
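The Shapiro-Wilk p-value of 0.2555 above gives no evidence against normality of the residuals, and the autoplot() panels screen linearity, constant variance, and influential points visually. A minimal sketch of complementary numeric checks, assuming the lmtest and car packages are installed (neither is used elsewhere in this document):

library(lmtest)
library(car)
bptest(lm_fit)            #Breusch-Pagan test for non-constant error variance
durbinWatsonTest(lm_fit)  #Durbin-Watson test for autocorrelated residuals
vif(lm_fit)               #variance inflation factors to screen for multicollinearity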

  3. Applied Predictive Analytics: Logistic Regression
  1. Load binary dataset using the following code:
     sales <- read.csv("http://ucanalytics.com/blogs/wp-content/uploads/2017/09/Data-L-Reg-Gradient-Descent.csv")
     sales$X1plusX2 <- NULL
     var_names <- c("expenditure", "income", "purchase")
     names(sales) <- var_names
     #Standardize predictors (X1, X2)
     sales$expenditure <- scale(sales$expenditure, scale=TRUE, center=TRUE)
     sales$income <- scale(sales$income, scale=TRUE, center=TRUE)
     Context: these data come from a marketing campaign. Marketers wanted to know the relationship between previous product expenditures (X1), consumer income (X2), and the probability of purchasing their product (Y).
  2. Separate sales data into train/test sets (75/25) using seed(5689).
  3. Train logistic regression GLM model to predict Y from X1-X2.
  4. Test trained logistic regression model using test data (predict Ys using trained model, and convert probabilities to 0s/1s).
  5. Evaluate by comparing mismatch in observed/predicted 0s/1s. How did your model perform?
#Load binary dataset using the following code:
sales <- read.csv("http://ucanalytics.com/blogs/wp-content/uploads/2017/09/Data-L-Reg-Gradient-Descent.csv")
sales$X1plusX2 <- NULL
var_names <- c("expenditure", "income", "purchase")
names(sales) <- var_names
#Standardize predictors (X1, X2) 
sales$expenditure <- scale(sales$expenditure, scale=TRUE, center = TRUE)
sales$income <- scale(sales$income, scale=TRUE, center = TRUE)

#Separate sales data into train/test sets (75/25) using seed(5689).
set.seed(5689) 
row.number <- sample(x=1:nrow(sales), size=0.75*nrow(sales))
train_data = sales[row.number,]
test_data = sales[-row.number,]

#Train logistic regression GLM model to predict Y (probability of purchase) from X1 (expenditure) and X2 (consumer income)
#Building logit model on the training dataset
model2 <- glm(train_data$purchase~(train_data$expenditure)+(train_data$income), family=binomial(link='logit') , data=train_data)
summary(model2)
## 
## Call:
## glm(formula = train_data$purchase ~ (train_data$expenditure) + 
##     (train_data$income), family = binomial(link = "logit"), data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1374  -0.7306   0.1735   0.7299   2.7885  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.04201    0.14897   0.282    0.778    
## train_data$expenditure  1.11422    0.18296   6.090 1.13e-09 ***
## train_data$income       1.63191    0.20183   8.086 6.18e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 415.67  on 299  degrees of freedom
## Residual deviance: 277.39  on 297  degrees of freedom
## AIC: 283.39
## 
## Number of Fisher Scoring iterations: 5
#Test trained logistic regression model using test data (predict Ys using trained model, and convert probabilities to 0s/1s),
#then evaluate the mismatch between observed and predicted 0s/1s with confusion matrices.

#First, class predictions on the training dataset (threshold the fitted probabilities at 0.5)
train_predicted_Y <- (predict(model2,train_data,type = "response"))
(y_pred_num_train <- ifelse(train_predicted_Y > 0.5, 1,0)) 
##  13 394  42 384 114 218 136 121 286  72 248  46 341 370 138 231 139 337 
##   1   0   1   1   1   1   1   0   1   1   0   0   0   1   0   0   1   1 
## 348 353 124  47 197 269  39  91 367  15 217 330 126 203 285 132 328  38 
##   1   0   0   1   0   1   0   0   1   0   1   0   0   0   1   1   0   0 
## 227 355 307 344 268 346 214  28 112  62 321  73 216 108 221 274 127 162 
##   1   1   0   1   1   0   0   0   0   1   1   1   1   0   0   1   1   1 
## 200 282 375 153  85  67  61 167 371  66 352 142 210 166 141  76 380   8 
##   0   1   0   0   1   1   1   1   1   1   0   1   1   1   0   0   0   1 
## 399  97 277 271 115 315  79 156 196 279 303 184   7 389 157 165 290  49 
##   0   1   1   0   1   0   1   1   0   0   0   1   1   1   1   0   0   1 
## 143 176  59 173 336 298  92 397 281 174 122 171  71 361 338  43 368 244 
##   0   1   0   1   0   1   0   1   0   1   0   0   1   0   1   0   0   0 
## 182 253 388  88 123 146 396  86 187  93  69  96 250 230 377 278 273 324 
##   1   1   0   0   1   0   1   0   0   1   1   1   1   1   1   0   0   0 
##  68 208 275 219 193 229 213  83 205 356 313 158 181 109 160 185 291 107 
##   1   1   0   0   1   1   0   0   1   0   1   0   0   1   0   0   0   1 
## 304  18 280 242 236 145  80 372 287   2 398 103 111  22 161 350 342  94 
##   1   1   1   0   1   0   0   0   1   1   1   0   0   1   1   1   1   0 
##  29 179  32 186 358 345 322  35  27 238 149 365 366 188 222 305  52 195 
##   0   0   1   1   0   1   1   0   0   0   1   1   0   0   1   1   1   1 
## 259 189  64  60  48 354  26 390 297 378  41 335 270  25  75  65 316 318 
##   1   1   1   0   1   0   0   1   1   1   1   1   0   1   1   0   0   1 
## 351 204 249   1 154  17 347 308 120 400  14  78 226 170  44 241 254 323 
##   0   0   0   1   0   0   0   0   1   0   0   1   0   1   1   1   0   0 
## 237 131 339 284 393 113 265 183 325 198 258 223  12 363 320  74 135  50 
##   0   0   1   1   1   1   0   1   1   0   0   0   1   1   0   1   0   1 
## 327 202 199 266  19  30 302 110  98 175 125 382 256  36  24 243 215 225 
##   0   1   1   0   0   1   1   1   0   1   0   0   0   0   0   1   1   0 
##  89 310  40 360 329 155  82 260 309 102  20 190  16 169 306 119 357  37 
##   1   1   1   0   1   0   0   0   0   1   0   1   1   1   1   0   0   0 
## 276 152 379 312 177 252 272 289 245 104 314 294 101 159 283  58 234 293 
##   1   1   0   0   1   1   0   0   1   0   1   0   1   1   1   0   0   0 
##  57 129 172 319 147   9 233   5 235 168  84 374 
##   0   0   0   0   0   0   1   1   1   1   1   0
#Confusion matrix for the training dataset
training_table <- table(predicted=y_pred_num_train, actual = train_data$purchase)
training_table
##          actual
## predicted   0   1
##         0 114  31
##         1  32 123
test_predicted_Y <- (predict(model2,test_data,type = "response"))
## Warning: 'newdata' had 100 rows but variables found have 300 rows
(y_pred_num_test <- ifelse(test_predicted_Y > 0.5, 1,0)) 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   0   1   1   1   1   1   0   1   1   0   0   0   1   0   0   1   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   1   0   0   1   0   1   0   0   1   0   1   0   0   0   1   1   0   0 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   1   0   1   1   0   0   0   0   1   1   1   1   0   0   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   0   1   0   0   1   1   1   1   1   1   0   1   1   1   0   0   0   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   0   1   1   0   1   0   1   1   0   0   0   1   1   1   1   0   0   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   0   1   0   1   0   1   0   1   0   1   0   0   1   0   1   0   0   0 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   1   1   0   0   1   0   1   0   0   1   1   1   1   1   1   0   0   0 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   0   0   1   1   0   0   1   0   1   0   0   1   0   0   0   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   0   1   0   0   0   1   1   1   0   0   1   1   1   1   0 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   0   0   1   1   0   1   1   0   0   0   1   1   0   0   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   1   1   1   0   1   0   0   1   1   1   1   1   0   1   1   0   0   1 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   0   0   0   1   0   0   0   0   1   0   0   1   0   1   1   1   0   0 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   0   0   1   1   1   1   0   1   1   0   0   0   1   1   0   1   0   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   0   1   1   0   0   1   1   1   0   1   0   0   0   0   0   1   1   0 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   1   1   1   0   1   0   0   0   0   1   0   1   1   1   1   0   0   0 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   1   1   0   0   1   1   0   0   1   0   1   0   1   1   1   0   0   0 
## 289 290 291 292 293 294 295 296 297 298 299 300 
##   0   0   0   0   0   0   1   1   1   1   1   0
#Because of the 'newdata' warning above, predict() returned 300 training-set fitted values; keep the first 100 so the length matches the test labels
y_pred_num_test <- data.frame(y_pred_num_test)
y_pred_num_test <- y_pred_num_test[1:100,]
#Confusion matrix for the testing dataset
testing_table <- table(predicted=y_pred_num_test, actual = test_data$purchase)
testing_table
##          actual
## predicted  0  1
##         0 21 24
##         1 33 22
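As the 'newdata' warning above indicates, writing the formula with train_data$ column references makes predict() fall back to the 300 training rows, so the test confusion matrix compares the first 100 training-set predictions against the 100 test labels. A sketch of the standard formula interface, which lets predict() score the held-out rows directly (model2b is a hypothetical name; its confusion matrix is not shown here):

#Sketch: fit with bare column names so predict() honors newdata
model2b <- glm(purchase ~ expenditure + income, family=binomial(link='logit'), data=train_data)
test_prob <- predict(model2b, newdata=test_data, type="response")
test_class <- ifelse(test_prob > 0.5, 1, 0)
table(predicted=test_class, actual=test_data$purchase)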
#Calculating classification error rates for the training and testing predictions

calc_class_err <- function(actual,predicted){
  mean(actual!= predicted)
}

#Train error
calc_class_err(actual = train_data$purchase, predicted = y_pred_num_train)
## [1] 0.21
#Test error
calc_class_err(actual = test_data$purchase, predicted = y_pred_num_test)
## [1] 0.57
library("ROCR")
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("Metrics")
pr <- prediction(y_pred_num_test,test_data$purchase)
perf <- performance(pr,measure = "tpr", x.measure = "fpr")
plot(perf,colorize=TRUE, text.adj = c(-0.2,1.7)) 

auc(test_data$purchase,y_pred_num_test)
## [1] 0.4335749
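The ROC curve and AUC above are built from the hard 0/1 labels rather than predicted probabilities, and (per the warning earlier) from predictions that do not correspond to the test rows, which helps explain the AUC below 0.5. A sketch of the usual construction from predicted probabilities, reusing the hypothetical model2b and test_prob from the sketch above:

pr2 <- prediction(test_prob, test_data$purchase)
perf2 <- performance(pr2, measure="tpr", x.measure="fpr")
plot(perf2, colorize=TRUE)
auc(test_data$purchase, test_prob)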
ggplot(sales, aes(x=expenditure, y=purchase)) + geom_point() + stat_smooth(method="glm", method.args=list(family=binomial), se=FALSE, fullrange=TRUE) + xlab("Expenditure") + ylab("Purchase")

ggplot(sales, aes(x=income, y=purchase)) + geom_point() + stat_smooth(method="glm", method.args=list(family=binomial), se=FALSE) + xlab("Income") + ylab("Purchase")

  4. Predictive Analytics Theory & Applied: Gradient Descent and Generalized Linear Modeling via Maximum Likelihood Estimation
  1. Load data. Use the entire sales dataset from Question 3 above.
  2. Logistic regression via a user-defined maximum likelihood function. Create a user-defined maximum likelihood function to estimate the parameters that maximize the log likelihood of the logistic density function. Use your logistic log likelihood function and the R optim() function to estimate the parameters (B0, B1, and B2) that maximize the log likelihood given the observed sales data. You may use the “Logistic Regression Maximum Likelihood Code Outline.R” as a starting place. Just fill in the empty pieces denoted by brackets []. For a review of maximum likelihood estimation specific to logistic regression see: https://www.statlect.com/fundamentals-of-statistics/logistic-model-maximum-likelihood. The log likelihood equation should look something like: loglik <- sum(-y*log(1 + exp(-(x %*% beta))) - (1-y)*log(1 + exp(x %*% beta))).
  3. Logistic regression via a user-defined gradient descent function. Create a user-defined gradient descent function to estimate the beta parameters (B0, B1, B2) using the sales data. You may use the “Logistic Regression Gradient Descent Code Outline” as a starting place. For a review of gradient descent specific to logistic regression see: http://ucanalytics.com/blogs/gradient-descent-logistic-regression-simplified-step-step-visual-guide/. The gradient descent cost function should look something like: (1/m)*sum((-Y*log(g)) - ((1-Y)*log(1-g))). The gradient descent delta formula should look something like: (t(X) %*% error) / length(Y).
  4. Fit a generalized linear model logistic regression model using R’s built-in glm() function on the full sales dataset.
  5. Summarize the parameters/weights (B0, B1, B2) from each of these three optimization approaches (parts b, c, and d). How do the results from these three approaches compare?
#4a
sales <- read.csv("http://ucanalytics.com/blogs/wp-content/uploads/2017/09/Data-L-Reg-Gradient-Descent.csv")
sales$X1plusX2 <- NULL
var_names <- c("expenditure", "income", "purchase")
names(sales) <- var_names
#Standardize predictors (X1, X2) 
sales$expenditure <- scale(sales$expenditure, scale=TRUE, center = TRUE)
sales$income <- scale(sales$income, scale=TRUE, center = TRUE)

#4b
#Predictor variables matrix
X <- as.matrix(sales[,c(1,2)])

#Add ones to Predictor variables matrix (for intercept, B0)
X <- cbind(rep(1,nrow(X)),X)

#Response variable matrix
Y <- as.matrix(sales$purchase)


#end of data import/prep section
logl <- function(theta,x,y){
  x <- as.matrix(x)
  beta <- theta[1:ncol(x)]
  #log likelihood of the logistic model; optim() minimizes, so return its negative
  loglik <-  sum(-y*log(1 + exp(-(x%*%beta))) - (1-y)*log(1 + exp(x%*%beta)))
  return(-loglik)
}
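A quick sanity check of the likelihood function (a sketch): with all three betas at zero the model assigns probability 0.5 to every row, so the returned negative log likelihood should equal nrow(X)*log(2), about 277.3, i.e. half of the null deviance that glm() reports in part 4d.

#Negative log likelihood at beta = c(0, 0, 0); expected value is nrow(X)*log(2) ~ 277.3
logl(rep(0, 3), x = X, y = Y)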
theta.start = rep(0,3)
names(theta.start) = colnames(X)
mle = optim(theta.start,logl,x=X,y=Y ,hessian=T)
out = list(beta=mle$par,se=diag(sqrt(solve(mle$hessian))),ll=2*mle$value)
## Warning in sqrt(solve(mle$hessian)): NaNs produced
#Get parameter values from out object and beta attribute
#[call "out" object and "beta" attribute, separated by $, as in object$attribute]

betaVal <- out$beta
print(out)
## $beta
##             expenditure      income 
## -0.03175081  1.12831751  1.72290357 
## 
## $se
##             expenditure      income 
##   0.1315999   0.1573365   0.1842335 
## 
## $ll
## [1] 356.7375
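The NaNs warning above comes from taking an element-wise square root of the full inverse Hessian: its negative off-diagonal entries produce NaNs, which diag() then discards, so the reported standard errors are still the usual ones. A sketch of the more conventional ordering, which avoids the warning and should return the same three values:

#Standard errors as square roots of the diagonal of the inverse Hessian
se_mle <- sqrt(diag(solve(mle$hessian)))
se_mle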
#4c - resetting values

sales <- read.csv("http://ucanalytics.com/blogs/wp-content/uploads/2017/09/Data-L-Reg-Gradient-Descent.csv")
sales$X1plusX2 <- NULL
var_names <- c("expenditure", "income", "purchase")
names(sales) <- var_names

#Standardize predictors (X1, X2) to facilitate gradient descent optimization
sales$expenditure <- scale(sales$expenditure, scale=TRUE, center = TRUE)
sales$income <- scale(sales$income, scale=TRUE, center = TRUE)

#Predictor variables matrix
X <- as.matrix(sales[,c(1,2)])

#Add ones to Predictor variables matrix (for intercept, B0)
X <- cbind(rep(1,nrow(X)),X)

#Response variable matrix
Y <- as.matrix(sales$purchase)

sigmoid <- function(z){
  g <- 1/(1+exp(-z))
  return(g)
}
#Cost Function
#Create user-defined Loss Function specific to logistic regression gradient descent
#[finish the equation for J - the logistic regression cost/loss function]

logistic_cost_function <- function(beta){
  m <- nrow(X)
  g <- sigmoid(X%*%beta)
  J <-  (1/m)*sum((-Y*log(g)) - ((1-Y)*log(1-g)))
  return(J)
}
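A quick check of the cost function (a sketch): at the all-zero starting parameters the sigmoid returns 0.5 for every observation, so the cost should equal log(2), about 0.693, regardless of the data.

#Cost at beta = c(0, 0, 0); expected value is log(2) ~ 0.693
logistic_cost_function(matrix(0, nrow=3))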

#Gradient Descent Function
#Create user-defined Gradient Descent Function specific to Logistic Regression
#[finish the equation for Beta - the updated weight (beta-alpha*delta)]

logistic_gradient_descent <- function(alpha, iterations, beta, x, y){
  for (i in 1:iterations) {
    error <- (X %*% beta - y)
    delta <- (t(X) %*% error) / length(Y)
    beta <- beta - alpha * delta
  }
  return(list(parameters=beta))
}


#Set initial parameters for gradient descent

# Define learning rate and iteration limit
initial_alpha <- 0.01 #learning rate
num_iterations <- 10000 #number of times we'll run the loop in the function
empty_beta <- matrix(c(0,0,0), nrow=3) # initialized parameters (matrix of 0s)


#Run logistic regression gradient descent to find beta parameters
#[fill in all of the arguments for the user-defined logistic gradient descent function]
output <- logistic_gradient_descent(alpha = initial_alpha, iterations = num_iterations, beta = empty_beta, x = X, y = Y)

#Get final estimated parameters from our output object:
#[call object "output" and stored attribute "parameters", separated by a $, as in object$attribute]
print(output)
## $parameters
##                  [,1]
##             0.5000000
## expenditure 0.1698615
## income      0.2614278
#End of Logistic Regression via Gradient Descent Code Outline
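Note that the update step inside logistic_gradient_descent() uses the linear-regression residual (X %*% beta - y) rather than the logistic residual (sigmoid(X %*% beta) - y), which is why the parameters above (0.50, 0.17, 0.26) sit far from the optim() estimates in part 4b (roughly -0.03, 1.13, 1.72) and from the glm() fit in part 4d. A sketch of the logistic-specific update, reusing sigmoid() and the settings defined above; run long enough, its estimates should move toward the MLE/glm() values (output not shown):

logistic_gradient_descent_logit <- function(alpha, iterations, beta, x, y){
  for (i in 1:iterations) {
    error <- sigmoid(x %*% beta) - y       #logistic (not linear) residual
    delta <- (t(x) %*% error) / length(y)  #gradient of the average log-loss
    beta <- beta - alpha * delta
  }
  return(list(parameters=beta))
}
logistic_gradient_descent_logit(initial_alpha, num_iterations, empty_beta, X, Y)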

#4d
sales <- read.csv("http://ucanalytics.com/blogs/wp-content/uploads/2017/09/Data-L-Reg-Gradient-Descent.csv")
sales$X1plusX2 <- NULL
var_names <- c("expenditure", "income", "purchase")
names(sales) <- var_names
#Standardize predictors (X1, X2) 
sales$expenditure <- scale(sales$expenditure, scale=TRUE, center = TRUE)
sales$income <- scale(sales$income, scale=TRUE, center = TRUE)

fit <- glm(sales$purchase~sales$income+sales$expenditure,data=sales,family=binomial())
summary(fit) # display results
## 
## Call:
## glm(formula = sales$purchase ~ sales$income + sales$expenditure, 
##     family = binomial(), data = sales)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.14739  -0.68601  -0.00463   0.70809   2.87783  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.03141    0.13160  -0.239    0.811    
## sales$income       1.72304    0.18424   9.352  < 2e-16 ***
## sales$expenditure  1.12834    0.15734   7.171 7.43e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 554.52  on 399  degrees of freedom
## Residual deviance: 356.74  on 397  degrees of freedom
## AIC: 362.74
## 
## Number of Fisher Scoring iterations: 5
confint(fit) # 95% CI for the coefficients
## Waiting for profiling to be done...
##                        2.5 %    97.5 %
## (Intercept)       -0.2905710 0.2265394
## sales$income       1.3799249 2.1040551
## sales$expenditure  0.8320624 1.4503882
exp(coef(fit)) # exponentiated coefficients
##       (Intercept)      sales$income sales$expenditure 
##         0.9690797         5.6015517         3.0905142
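Because the predictors were standardized, these exponentiated coefficients are odds ratios per one standard deviation of each predictor; for example, the income value can be recovered directly from its coefficient:

exp(1.72304)  #about 5.60: each one-SD increase in income multiplies the odds of purchase by roughly 5.6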
exp(confint(fit)) # 95% CI for exponentiated coefficients
## Waiting for profiling to be done...
##                       2.5 %   97.5 %
## (Intercept)       0.7478364 1.254252
## sales$income      3.9746033 8.199352
## sales$expenditure 2.2980533 4.264770
predict(fit, type="response") # predicted values
##           1           2           3           4           5           6 
## 0.907520487 0.773257456 0.775561354 0.926764819 0.529941464 0.089108385 
##           7           8           9          10          11          12 
## 0.532092403 0.874972649 0.380536276 0.040296879 0.240207910 0.489448671 
##          13          14          15          16          17          18 
## 0.891879440 0.011498930 0.057559466 0.642095529 0.023727158 0.831910064 
##          19          20          21          22          23          24 
## 0.301589984 0.285953785 0.014872423 0.669298420 0.231272282 0.078925088 
##          25          26          27          28          29          30 
## 0.716500230 0.246410686 0.104259337 0.021703341 0.215099385 0.769113833 
##          31          32          33          34          35          36 
## 0.409215004 0.483860195 0.045477216 0.691096374 0.208300117 0.454668893 
##          37          38          39          40          41          42 
## 0.278360646 0.253968018 0.336068768 0.900304267 0.487338955 0.549777494 
##          43          44          45          46          47          48 
## 0.217236010 0.911295038 0.392145346 0.366032460 0.907700228 0.703215806 
##          49          50          51          52          53          54 
## 0.858829606 0.976553452 0.552104202 0.986374443 0.955946508 0.056073044 
##          55          56          57          58          59          60 
## 0.031332971 0.204492711 0.036505790 0.082632775 0.202289569 0.454073952 
##          61          62          63          64          65          66 
## 0.735631258 0.707258082 0.806263754 0.911136802 0.328141439 0.496593662 
##          67          68          69          70          71          72 
## 0.940529747 0.919427376 0.744444889 0.016972973 0.603940243 0.668577448 
##          73          74          75          76          77          78 
## 0.746535537 0.800348602 0.952908889 0.347724666 0.502204609 0.974891429 
##          79          80          81          82          83          84 
## 0.885648248 0.037067667 0.606901542 0.268284366 0.132437714 0.659493104 
##          85          86          87          88          89          90 
## 0.677858510 0.160609900 0.885344036 0.204413937 0.886545856 0.444468734 
##          91          92          93          94          95          96 
## 0.199338759 0.050850058 0.885771037 0.387771009 0.923384754 0.705441068 
##          97          98          99         100         101         102 
## 0.640019737 0.121051860 0.961785087 0.957680355 0.672135001 0.641197501 
##         103         104         105         106         107         108 
## 0.046673023 0.057524756 0.861142692 0.963708439 0.602614975 0.147019033 
##         109         110         111         112         113         114 
## 0.734793801 0.809326133 0.220522240 0.223901141 0.673924394 0.784463285 
##         115         116         117         118         119         120 
## 0.893576989 0.072936535 0.564712685 0.038585242 0.336637957 0.818616729 
##         121         122         123         124         125         126 
## 0.059743264 0.037830912 0.671415900 0.443258587 0.248752077 0.015907708 
##         127         128         129         130         131         132 
## 0.641963762 0.769587462 0.157054434 0.003898853 0.253747372 0.982770135 
##         133         134         135         136         137         138 
## 0.342627507 0.495350532 0.161072568 0.528383968 0.085557499 0.106566453 
##         139         140         141         142         143         144 
## 0.891896572 0.144449378 0.358201108 0.901752825 0.299954241 0.366037146 
##         145         146         147         148         149         150 
## 0.121256661 0.103228729 0.377475717 0.230781816 0.975014461 0.075993897 
##         151         152         153         154         155         156 
## 0.705881754 0.869047151 0.248100669 0.333556160 0.367132392 0.947726046 
##         157         158         159         160         161         162 
## 0.867535943 0.380128729 0.955531648 0.321176921 0.651166843 0.582566304 
##         163         164         165         166         167         168 
## 0.243608328 0.376310848 0.075039070 0.595258528 0.955328376 0.905560391 
##         169         170         171         172         173         174 
## 0.978709786 0.617140694 0.353206059 0.014615860 0.628506198 0.793253677 
##         175         176         177         178         179         180 
## 0.970216716 0.914356765 0.557480718 0.981463587 0.243928119 0.147399615 
##         181         182         183         184         185         186 
## 0.110784693 0.936542904 0.723700112 0.993020070 0.081091076 0.647353776 
##         187         188         189         190         191         192 
## 0.269759596 0.351912379 0.973861184 0.840582080 0.058975093 0.625588772 
##         193         194         195         196         197         198 
## 0.629476149 0.469077971 0.810743612 0.311653728 0.083982739 0.122627531 
##         199         200         201         202         203         204 
## 0.814317012 0.139041491 0.711641703 0.996849291 0.094089056 0.204212565 
##         205         206         207         208         209         210 
## 0.903920536 0.813161646 0.779155166 0.955745626 0.613254450 0.648765053 
##         211         212         213         214         215         216 
## 0.385218427 0.870307968 0.164693306 0.082771199 0.718957489 0.588340727 
##         217         218         219         220         221         222 
## 0.754010239 0.881442986 0.215568182 0.226051634 0.285100785 0.928785764 
##         223         224         225         226         227         228 
## 0.075947014 0.898808386 0.174382477 0.062863273 0.493741096 0.409875717 
##         229         230         231         232         233         234 
## 0.800436882 0.769447373 0.074688934 0.141573310 0.753302736 0.222729670 
##         235         236         237         238         239         240 
## 0.896731036 0.693155548 0.399832225 0.106434523 0.942331476 0.694208796 
##         241         242         243         244         245         246 
## 0.533798597 0.017593548 0.648159579 0.141776588 0.898610219 0.009336942 
##         247         248         249         250         251         252 
## 0.801803450 0.267646802 0.109579568 0.812047461 0.818145845 0.941922351 
##         253         254         255         256         257         258 
## 0.785093047 0.059722371 0.844183570 0.297767263 0.039825411 0.158981036 
##         259         260         261         262         263         264 
## 0.830853100 0.259316880 0.504211957 0.965367958 0.123121129 0.852663973 
##         265         266         267         268         269         270 
## 0.271618947 0.076778287 0.027063711 0.874902616 0.848051499 0.292705484 
##         271         272         273         274         275         276 
## 0.424760824 0.204998805 0.073305089 0.839358554 0.038906330 0.807184287 
##         277         278         279         280         281         282 
## 0.993964433 0.155865177 0.346769039 0.572289878 0.455796067 0.803392794 
##         283         284         285         286         287         288 
## 0.521439530 0.532209231 0.840804401 0.943972673 0.909599772 0.394962411 
##         289         290         291         292         293         294 
## 0.049028541 0.023254730 0.431083515 0.857908958 0.029134978 0.325575822 
##         295         296         297         298         299         300 
## 0.334801701 0.196259812 0.881980058 0.573979203 0.745892616 0.124140697 
##         301         302         303         304         305         306 
## 0.920490921 0.987144008 0.195431141 0.774639342 0.973703834 0.633168087 
##         307         308         309         310         311         312 
## 0.455649533 0.031515237 0.141656315 0.718891438 0.570565167 0.171730881 
##         313         314         315         316         317         318 
## 0.872031245 0.726526339 0.297609169 0.213795970 0.780159079 0.848077053 
##         319         320         321         322         323         324 
## 0.476625825 0.020425283 0.819558478 0.953761334 0.399522271 0.084424309 
##         325         326         327         328         329         330 
## 0.756232292 0.084071424 0.151924271 0.349119915 0.670943071 0.003974446 
##         331         332         333         334         335         336 
## 0.965684125 0.144748744 0.555604783 0.807790778 0.808261872 0.116520995 
##         337         338         339         340         341         342 
## 0.618366444 0.815079421 0.827297695 0.082158247 0.248369340 0.929341515 
##         343         344         345         346         347         348 
## 0.592233877 0.956355905 0.696289861 0.190223444 0.099884425 0.873452354 
##         349         350         351         352         353         354 
## 0.996874395 0.694770997 0.403567611 0.325454491 0.300190000 0.367571202 
##         355         356         357         358         359         360 
## 0.972124708 0.141591762 0.298169503 0.026302398 0.381598630 0.103847073 
##         361         362         363         364         365         366 
## 0.460334207 0.873369896 0.605379617 0.887631027 0.899375270 0.092358690 
##         367         368         369         370         371         372 
## 0.835918117 0.306024945 0.586711337 0.733182140 0.544522668 0.301375759 
##         373         374         375         376         377         378 
## 0.185862432 0.015140470 0.005473059 0.582902023 0.921603542 0.765681387 
##         379         380         381         382         383         384 
## 0.004861230 0.150472564 0.759796288 0.220679442 0.005313169 0.872697987 
##         385         386         387         388         389         390 
## 0.905053822 0.632181665 0.042460077 0.030238911 0.875481609 0.722528978 
##         391         392         393         394         395         396 
## 0.166811893 0.922941796 0.918264418 0.473334077 0.169818877 0.655242445 
##         397         398         399         400 
## 0.936925253 0.788938017 0.219070994 0.380225547
residuals(fit, type="deviance") # residuals
##           1           2           3           4           5           6 
##  0.44054316  0.71713768  0.71297712  0.39001397  1.12693276 -0.43204482 
##           7           8           9          10          11          12 
## -1.23246456  0.51684166 -0.97867373 -0.28681455 -0.74122932  1.19538754 
##          13          14          15          16          17          18 
##  0.47838125 -0.15208915 -0.34433256  0.94129505 -0.21914918 -1.88852117 
##          19          20          21          22          23          24 
##  1.54834547 -0.82074063 -0.17311341 -1.48764168 -0.72528401 -0.40549700 
##          25          26          27          28          29          30 
## -1.58779342 -0.75220707 -0.46926399 -0.20948663 -0.69598588 -1.71220938 
##          31          32          33          34          35          36 
##  1.33679810  1.20495582 -0.30510249  0.85962317 -0.68348064 -1.10123761 
##          37          38          39          40          41          42 
## -0.80774968 -0.76548913  1.47677993 -2.14738558  1.19899574  1.09383878 
##          43          44          45          46          47          48 
##  1.74743865  0.43101873 -0.99781710 -0.95473297  0.44009340  0.83915607 
##          49          50          51          52          53          54 
##  0.55169691  0.21783384  1.08997108  0.16564562  0.30017769 -0.33972487 
##          55          56          57          58          59          60 
## -0.25232657 -0.67642482 -0.27272257 -0.41532499 -0.67232375 -1.10024702 
##          61          62          63          64          65          66 
##  0.78361508  0.83229759  0.65626877  0.43142143  1.49285669 -1.17162930 
##          67          68          69          70          71          72 
##  0.35017710  0.40988833  0.76826617 -0.18503332  1.00427090 -1.48617706 
##          73          74          75          76          77          78 
##  0.76460717  0.66739478  0.31059937 -0.92443336 -1.18115719  0.22551793 
##          79          80          81          82          83          84 
##  0.49281927 -0.27485318  0.99938852  1.62216386 -0.53304403  0.91245138 
##          85          86          87          88          89          90 
##  0.88183525 -0.59174272 -2.08125887  1.78191371  0.49075947  1.27347993 
##          91          92          93          94          95          96 
## -0.66680933 -0.32307427  0.49253789 -0.99060476  0.39927254  0.83538260 
##          97          98          99         100         101         102 
##  0.94472881 -0.50799485  0.27915679  0.29407895 -1.49342113 -1.43177036 
##         103         104         105         106         107         108 
## -0.30918387  2.38978655  0.54679989  0.27190616  1.00645596 -0.56394689 
##         109         110         111         112         113         114 
##  0.78506734  0.65046647  1.73882531 -0.71200474  0.88841133 -1.75192695 
##         115         116         117         118         119         120 
##  0.47438968 -0.38918698  1.06905397 -0.28053297 -0.90601807 -1.84777865 
##         121         122         123         124         125         126 
## -0.35100518  2.55915171  0.89259903  1.27561904 -0.75633267  2.87782957 
##         127         128         129         130         131         132 
##  0.94151306  0.72374121 -0.58455606 -0.08839087 -0.76510273  0.18644048 
##         133         134         135         136         137         138 
## -0.91597430 -1.16952231 -0.59267372 -1.22604251 -0.42294370  2.11612217 
##         139         140         141         142         143         144 
##  0.47834110 -0.55858753  1.43294152  0.45478528 -0.84452303 -0.95474071 
##         145         146         147         148         149         150 
## -0.50845337 -0.46680712  1.39588613 -0.72440407  0.22495767 -0.39758421 
##         151         152         153         154         155         156 
##  0.83463470  0.52982619 -0.75518585  1.48185292 -0.95655008  0.32768827 
##         157         158         159         160         161         162 
##  0.53310100  1.39085968  0.30162027 -0.88022128 -1.45131770 -1.32183929 
##         163         164         165         166         167         168 
##  1.68059130  1.39809853 -0.39497792  1.01858673  0.30232482  0.44542409 
##         169         170         171         172         173         174 
##  0.20746142 -1.38570394  1.44270833 -0.17160259  0.96375245  0.68060593 
##         175         176         177         178         179         180 
##  0.24590980  0.42316533  1.08104335  0.19344439 -0.74782194 -0.56473768 
##         181         182         183         184         185         186 
## -0.48459443  0.36210481  0.80421164  0.11835881 -0.41126212  0.93259031 
##         187         188         189         190         191         192 
## -0.79294575 -0.93137465  0.23015867  0.58933976 -0.34867082 -1.40171363 
##         193         194         195         196         197         198 
##  0.96215104  1.23043592 -1.82463839 -0.86424911 -0.41885575 -0.51151475 
##         199         200         201         202         203         204 
##  0.64094546  1.98644552  0.82484025  0.07944411 -0.44455432 -0.67590409 
##         205         206         207         208         209         210 
##  0.44947486 -1.83167216  0.70646312  0.30087700 -1.37839638  0.93025227 
##         211         212         213         214         215         216 
## -0.98639570 -2.02118412 -0.59992720 -0.41568817 -1.59326667  1.02999906 
##         217         218         219         220         221         222 
##  0.75146435 -2.06512048 -0.69684377 -0.71589122 -0.81928469  0.38438828 
##         223         224         225         226         227         228 
## -0.39745658 -2.14043892 -0.61906972 -0.36035007 -1.16679653  1.33559073 
##         229         230         231         232         233         234 
##  0.66722949  0.72399271 -0.39401855 -0.55254682  0.75271255 -0.70988319 
##         235         236         237         238         239         240 
##  0.46690322 -1.53714953 -1.01049101 -0.47441683  0.34466848 -1.53938478 
##         241         242         243         244         245         246 
## -1.23542506 -0.18841526  0.93125544 -0.55297527  0.46239790 -0.13697302 
##         247         248         249         250         251         252 
##  0.66466800 -0.78929382 -0.48179152  0.64528519  0.63358451  0.34592611 
##         253         254         255         256         257         258 
##  0.69563358 -0.35094187  0.58204005  1.55656230 -0.28509699 -0.58845742 
##         259         260         261         262         263         264 
## -1.88519904 -0.77483209  1.17026369  0.26550309 -0.51261372 -1.95705871 
##         265         266         267         268         269         270 
## -0.79615444 -0.39971456 -0.23425063  0.51699651  0.57413224 -0.83223570 
##         271         272         273         274         275         276 
##  1.30860921 -0.67736498 -0.39020733 -1.91237049 -0.28172115  0.65452773 
##         277         278         279         280         281         282 
##  0.11003503 -0.58213925  1.45540119 -1.30331086 -1.10311489  0.66168199 
##         283         284         285         286         287         288 
##  1.14119408  1.12313719  0.58889086  0.33958228  0.43531733 -1.00246166 
##         289         290         291         292         293         294 
## -0.31708431 -0.21693035 -1.06209381  0.55363760 -0.24317824  1.49810541 
##         295         296         297         298         299         300 
## -0.90296189 -0.66102831  0.50117030 -1.30634384  0.76573316 -0.51487827 
##         301         302         303         304         305         306 
##  0.40705808  0.16086855 -0.65946755  0.71464358  0.23085967 -1.41622847 
##         307         308         309         310         311         312 
##  1.25381923 -0.25307125  1.97704399  0.81245914 -1.30021942  1.87713967 
##         313         314         315         316         317         318 
##  0.52331639  0.79935041 -0.84055374 -0.69359778  0.70463811  0.57407976 
##         319         320         321         322         323         324 
## -1.13794431 -0.20315887  0.63085581  0.30770705  1.35461121 -0.42000534 
##         325         326         327         328         329         330 
##  0.74753821 -0.41908684 -0.57408248 -0.92674684 -1.49098927 -0.08924533 
##         331         332         333         334         335         336 
##  0.26426688 -0.55921371  1.08415687  0.65337920  0.65248628 -0.49777053 
##         337         338         339         340         341         342 
##  0.98048360 -1.83729630  0.61577704 -0.41407798 -0.75565894  0.38282892 
##         343         344         345         346         347         348 
##  1.02357575  0.29874789  0.85086925 -0.64961054 -0.45876379  0.52019553 
##         349         350         351         352         353         354 
##  0.07912648  0.85343188 -1.01665077 -0.88737380 -0.84492178 -0.95727492 
##         355         356         357         358         359         360 
##  0.23778639 -0.55258572  1.55569480 -0.23088739  1.38808208 -0.46828240 
##         361         362         363         364         365         366 
##  1.24563439  0.52037699 -1.36369427  0.48826045  0.46055380 -0.44024087 
##         367         368         369         370         371         372 
## -1.90125732 -0.85477396 -1.32936751  0.78785928  1.10258397  1.54880433 
##         373         374         375         376         377         378 
## -0.64128921 -0.17467831 -0.10476728 -1.32244783  0.40407956  0.73073817 
##         379         380         381         382         383         384 
## -0.09872268 -0.57109551  0.74122186  1.73841544 -0.10322145  0.52185387 
##         385         386         387         388         389         390 
##  0.44667855  0.95768312 -0.29457720 -0.24781258 -2.04122603  0.80622299 
##         391         392         393         394         395         396 
## -0.60414542  0.40047249  0.41296463 -1.13242117 -0.61010062  0.91951070 
##         397         398         399         400 
##  0.36097582  0.68857464 -0.70323685  1.39067657