---
title: "advanced_regression_models"
output: html_notebook
---

```{r}
#################################################################################################################
# advanced regression models                                                                                    #                                 
# ref : http://r-statistics.co/adv-regression-models.html                                                       #
# ref : https://onlinecourses.science.psu.edu/stat504/node/216                                                  #
# ref : https://blog.cloudera.com/blog/2015/12/common-probability-distributions-the-data-scientists-crib-sheet/ #
# ref : http://rstudio-pubs-static.s3.amazonaws.com/190997_40fa09db8e344b19b14a687ea5de914b.html                #
# ref : http://kostat.go.kr/attach/journal/12-2-3.pdf                                                           #
#################################################################################################################

######################################################################################################
# package & method                                                                                   #
# smbinning : https://cran.r-project.org/web/packages/smbinning/smbinning.pdf                        #
#           : smbinning.factor()                                                                     #
# InformationValue : https://cran.r-project.org/web/packages/InformationValue/InformationValue.pdf   #
#                  : optimalCutoff() , plotRoc()                                                     #
# glmnet : https://cran.r-project.org/web/packages/glmnet/glmnet.pdf                                 #
######################################################################################################
```

```{r}
# One of the assumptions of linear regression is that the dependent variable is normally distributed,
# and with real service data this assumption is rarely satisfied.
# Here we review GLMs, which can be used when the dependent variable follows various other distributions.
```
![](/Users/CA/Downloads/familyg.png)

## GLM

```{r}
# General Linear Model 
#        refers to conventional linear regression models for a continuous response variable 
#        given continuous and/or categorical predictors
#
# Generalized Linear Model
#        the response variable yi is assumed to follow an exponential family distribution with mean μi, 
#        which is assumed to be some (often nonlinear) function of T(xi)β.
```
![](/Users/CA/Downloads/family.png)


```{r}
# three components
## Random Component : the probability distribution of the response variable (Y)
## Systematic Component : the explanatory variables (X1, X2, ... Xk) as a combination of linear predictors; e.g. β0 + β1x1 + β2x2
## Link Function (η or g(μ)) : specifies the link between random and systematic components , e.g. η = logit(π) for logistic regression
##                           : In a GLM, Y ~ X is not a linear relationship; the link function is what makes it linear.
##                           : For a Y that takes only binary ( 0, 1 ) values, it converts the response into a probability, i.e. a continuous variable.
##                           : In the figure below, it transforms the y values plotted as 'x' points into a sigmoid curve.
##                           : http://www.columbia.edu/~so33/SusDev/Lecture_9.pdf
```
![](/Users/CA/Downloads/sigmoid.png)
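
To make the link-function idea concrete, here is a small sketch (added for illustration, not part of the original notes) of the inverse-logit transform, using base R's `plogis()`:

```{r}
# Sketch: the inverse logit maps any linear predictor eta into (0, 1).
# plogis() is base R's logistic CDF, i.e. 1 / (1 + exp(-eta)).
eta <- seq(-6, 6, by = 0.1)
plot(eta, plogis(eta), type = "l",
     xlab = "linear predictor (eta)", ylab = "P(Y = 1)",
     main = "Inverse logit (sigmoid) link")
abline(h = c(0, 1), lty = 2, col = "grey")
```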

## Binary Logistic Regression 

```{r}
# ref : http://r-statistics.co/Logistic-Regression-With-R.html#Build%20Logit%20Models%20and%20Predict
#
#1. Random component : Binomial Distribution ( Y )
#2. Systematic component : Explanatory var. ( can be continuous / discrete | Xs )
#3. Link function : Logit
#
# Assumptions
##4.1 The observations of the dependent variable must be independent of one another.
##4.2 A sufficiently large sample is required.
#
# MLE ( Maximum Likelihood Estimation ) : do we need to go this deep?
## A method of estimating parameters based only on the given observations, i.e. the data.
## Drawback: the estimates are very sensitive to the observations ( hence the need for a sufficiently large sample ).
#
# No Outliers ( 0 or 1 )
```
![](/Users/CA/Downloads/logitfunc.png)
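
As a tiny illustration of MLE (a sketch with made-up numbers, not part of the original data): for x successes in n Bernoulli trials, the p that maximizes the binomial log-likelihood is the sample proportion x/n.

```{r}
# Sketch: MLE of the success probability p for x successes out of n trials.
n <- 20; x <- 13                                       # hypothetical counts
loglik <- function(p) dbinom(x, size = n, prob = p, log = TRUE)
optimize(loglik, interval = c(0.001, 0.999), maximum = TRUE)$maximum  # ~ x/n = 0.65
```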

```{r}
# ref : http://rfriend.tistory.com/99
# binomial distribution
# Discret Prob. Distribution > Binomial Distribution , Poisson Distribution 
# 
# The case where each repeated trial results only in Success / Fail ( a Bernoulli trial ).
# If X is the number of successes in n independent Bernoulli trials with success probability p,
# then the random variable X follows a Binomial distribution with parameters n and p.
```
![](/Users/CA/Downloads/logistic.png)

```{r}
y <- dbinom(0:20, size=20, prob=0.5)
plot(0:20, y, type='h', lwd=5, col="grey", ylab="Probability", xlab="Random variable X", main = c("X ~ B(20, 0.5)"))
plot(pbinom(0:20, size=20, prob=0.5), type='h')

# sum(dbinom(0:12, size=20, prob=0.5))
# rbinom(12, size=20, prob=0.5)
```

```{r}
library(data.table)

adult <- fread("/Users/CA/Downloads/adult.csv", stringsAsFactors = F)
head(adult, n = 1)
```

```{r}
# Create Training Data
input_ones <- adult[which(adult$ABOVE50K == 1), ]  # all 1's
input_zeros <- adult[which(adult$ABOVE50K == 0), ]  # all 0's

set.seed(100)  
input_ones_training_rows <- sample(1:nrow(input_ones), 0.7*nrow(input_ones))  # 1's for training
input_zeros_training_rows <- sample(1:nrow(input_zeros), 0.7*nrow(input_ones))  # 0's for training. Pick as many 0's as 1's
training_ones <- input_ones[input_ones_training_rows, ]  
training_zeros <- input_zeros[input_zeros_training_rows, ]
trainingData <- rbind(training_ones, training_zeros)  # row bind the 1's and 0's 

# Create Test Data
test_ones <- input_ones[-input_ones_training_rows, ]
test_zeros <- input_zeros[-input_zeros_training_rows, ]
testData <- rbind(test_ones, test_zeros)  # row bind the 1's and 0's 
```

```{r}
library(smbinning)
# segregate continuous and factor variables
factor_vars <- c ("WORKCLASS", "EDUCATION", "MARITALSTATUS", "OCCUPATION", "RELATIONSHIP", "RACE", "SEX", "NATIVECOUNTRY")
continuous_vars <- c("AGE", "FNLWGT","EDUCATIONNUM", "HOURSPERWEEK", "CAPITALGAIN", "CAPITALLOSS")

iv_df <- data.frame(VARS=c(factor_vars, continuous_vars), IV=numeric(14))  # init for IV results

# compute IV for categoricals
for(factor_var in factor_vars){
  smb <- smbinning.factor(trainingData, y="ABOVE50K", x=factor_var)  # WOE table
  if(class(smb) != "character"){ 
    iv_df[iv_df$VARS == factor_var, "IV"] <- smb$iv
  }
}

# compute IV for continuous vars
for(continuous_var in continuous_vars){
  smb <- smbinning(trainingData, y="ABOVE50K", x=continuous_var)  # WOE table
  if(class(smb) != "character"){  # any error while calculating scores.
    iv_df[iv_df$VARS == continuous_var, "IV"] <- smb$iv
  }
}

iv_df <- iv_df[order(-iv_df$IV), ]  # sort
iv_df
```
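
A common rule of thumb for reading IV (a credit-scoring convention, not from the original text): below 0.02 is not predictive, 0.02-0.1 weak, 0.1-0.3 medium, and above 0.3 strong. A sketch of filtering on it:

```{r}
# Sketch: keep variables whose Information Value clears the conventional
# "medium predictive power" cutoff of 0.1 ( a rule of thumb, not a hard law ).
iv_df[iv_df$IV > 0.1, ]
```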

```{r}
# Check Random component : Binomial Distribution ( Y ) , Success(1) or Fail(0) => Binomial + Logit function 
table(trainingData$ABOVE50K)

# A distribution taking only the values 0 and 1 cannot be made to look like a normal distribution by any transformation.
plot(density(adult$ABOVE50K))
```

```{r}
# As link functions for the binomial distribution, glm() supports logit, probit, and cauchit.
```
![](/Users/CA/Downloads/it.png)

```{r}
model <- glm(ABOVE50K ~  AGE + CAPITALGAIN + EDUCATIONNUM + HOURSPERWEEK + CAPITALLOSS, data=trainingData, family=binomial(link="logit"))

testData$predicted <- predict(model, testData, type="response")  # predicted scores

summary(model)

# Check multicollinearity: if the independent variables are highly linearly related ( i.e. multicollinearity exists ), a core regression assumption is violated.
library(car)
sqrt(vif(model)) > 2 
```
![](/Users/CA/Downloads/coli.png)
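
The logistic coefficients are on the log-odds scale; exponentiating them gives odds ratios, the same `exp(coef())` device used for the multinomial and Poisson models below. A short sketch:

```{r}
# Each exp(coefficient) is the multiplicative change in the odds of ABOVE50K == 1
# for a one-unit increase in that predictor, holding the others fixed.
exp(coef(model))
exp(confint.default(model))  # Wald confidence intervals on the odds-ratio scale
```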
```{r}
# Relative importance of the features ( the average increase in R2 when each feature is added ).
relweights <- function(fit,...){
         R <- cor(fit$model)
         nvar <- ncol(R)
         rxx <- R[2:nvar, 2:nvar]
         rxy <- R[2:nvar, 1]
         svd <- eigen(rxx)
         evec <- svd$vectors
         ev <- svd$values
         delta <- diag(sqrt(ev))
         lambda <- evec %*% delta %*% t(evec)
         lambdasq <- lambda ^ 2
         beta <- solve(lambda) %*% rxy
         rsquare <- colSums(beta ^ 2)
         rawwgt <- lambdasq %*% beta ^ 2
         import <- (rawwgt / rsquare) * 100
         import <- as.data.frame(import)
         row.names(import) <- names(fit$model[2:nvar])
         names(import) <- "Weights"
         import <- import[order(import),1, drop=FALSE]
         dotchart(import$Weights, labels=row.names(import),
            xlab="% of R-Square", pch=19,
            main="Relative Importance of Predictor Variables",
            sub=paste("Total R-Square=", round(rsquare, digits=3)),
            ...)
         return(import)
}

relweights(model)
```

```{r}
library(InformationValue)
threshold <- optimalCutoff(testData$ABOVE50K, testData$predicted)
misClassError(testData$ABOVE50K, testData$predicted, threshold = threshold)
plotROC(testData$ABOVE50K, testData$predicted)
```
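
Beyond the misclassification error, the InformationValue docs linked at the top also list a confusion matrix and sensitivity / specificity helpers; a hedged sketch at the same cutoff:

```{r}
# Sketch: confusion matrix plus sensitivity / specificity at the chosen threshold.
confusionMatrix(testData$ABOVE50K, testData$predicted, threshold = threshold)
sensitivity(testData$ABOVE50K, testData$predicted, threshold = threshold)
specificity(testData$ABOVE50K, testData$predicted, threshold = threshold)
```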

```{r}
# The plots below, used to check the assumptions of linear regression, unsurprisingly all show violations here.

par(mfrow = c(2,2))
plot(model)
```


```{r}
# comparing models
model.1 <- glm(ABOVE50K ~ 1, data=trainingData, family=binomial(link="logit"))
model.2 <- glm(ABOVE50K ~ AGE, data=trainingData, family=binomial(link="logit"))
model.3 <- glm(ABOVE50K ~ AGE + CAPITALGAIN, data=trainingData, family=binomial(link="logit"))

library(rcompanion)

compareGLM(model.1, model.2, model.3)
anova(model.1, model.2, model.3, test = "Chisq")
```

```{r}
# Besides the model evaluation above, another way to choose the best model is to compare a few statistics.
# The output of the compareGLM function above includes statistics such as AIC / BIC; let's look at what they mean.
#
# AIC ( Akaike's An Information Criterion )
## Accounts for both the statistical goodness of fit of the model and the number of parameters needed to achieve that fit.
## A model with a smaller AIC, i.e. one that fits reasonably well with fewer parameters, is preferred. ( Smaller AIC is better; AIC can even be negative. )
## The documentation of the AIC / BIC functions puts it this way:
## When comparing models fitted by maximum likelihood to the same data, the smaller the AIC or BIC, the better the fit.

AIC(model.1, model.2, model.3)
```
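
BIC can be compared the same way; it charges log(n) per parameter instead of AIC's 2, so it leans toward smaller models on large samples:

```{r}
# Same comparison with BIC: smaller is better, and the penalty per parameter
# is log(n) rather than AIC's 2.
BIC(model.1, model.2, model.3)
```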

```{r}
# In the example above, model.3 has the smallest AIC, so by the AIC criterion model.3 is judged the most appropriate model.
# You can also compare many models conveniently while adding independent variables automatically, using the step function.
# step searches for the minimum AIC either backward ( dropping features one by one ) or forward ( adding the specified features one by one ).
super.model = glm(ABOVE50K ~ 1 , data = trainingData, family = binomial(link="logit"))
sub.model = step(super.model, 
                 direction = "forward", 
                 scope = (ABOVE50K ~ AGE + CAPITALGAIN + EDUCATIONNUM + HOURSPERWEEK + CAPITALLOSS),
                 trace = 0)

summary(sub.model)
```

```{r}
# In addition, you can see that dummy coding has been applied to the factor variables.
# The result below differs somewhat from the feature selection done with smbinning; the selection criteria are simply different, so it is not a matter of one being right and the other wrong.
# ( Stepwise regression is controversial: it can find a good model, but there is said to be no guarantee it is the best one. )
super.model = glm(ABOVE50K ~ . , data = trainingData, family = binomial(link="logit"))
sub.model = step(super.model, 
                 direction = "backward", 
                 trace = 0)

summary(sub.model)
```

```{r}
# Stepwise regression does not explore every possible combination,
# whereas all subsets regression explores all possible combinations and reports the best n ( nbest ) models per number of predictors.
# The drawback is that the run time grows very quickly with the size of the feature set. ( see the really.big argument )
# all_subset_regression <- regsubsets(ABOVE50K ~ ., data = trainingData, nbest = 2, really.big = T)
library(leaps)

all_subset_regression <- regsubsets(
  ABOVE50K ~ AGE + WORKCLASS + EDUCATION + OCCUPATION + RELATIONSHIP + SEX + CAPITALGAIN + CAPITALLOSS, 
  data = trainingData, 
  nbest = 2
)
plot(all_subset_regression, scale = "adjr2") # scale = [rsq|rss|adjr2|cp|bic|outmat]
```

```{r}
# In the plot above, the number of shaded cells in a row is the number of features in that model.
# Since nbest = 2 was given, each model size contributes two rows: the bottom two rows have two cells shaded,
# the rows above them have three shaded, and so on. Here best was judged by adjr2, so models with values closer to 1 are optimal.
```


## Multinomial Regression

```{r}
# ref : https://onlinecourses.science.psu.edu/stat504/node/40
# ref : http://r-statistics.co/Multinomial-Regression-With-R.html
# 
#1. Random component : Multinomial Distribution ( Y , multi levels )
#2. Systematic component : Explanatory var. ( can be continuous / discrete | Xs )
#3. Link function : Generalized Logit
```

```{r}
cmcData <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data", stringsAsFactors=FALSE, header=F)
colnames(cmcData) <- c("wife_age", "wife_edu", "hus_edu", "num_child", "wife_rel", "wife_work", "hus_occu", "sil", "media_exp", "cmc")
head(cmcData, n = 1)

levels(as.factor(cmcData$cmc))
cmcData <- transform(cmcData, cmc = as.factor(cmc))
```

```{r}
# Other than the type of distribution for the random component and the corresponding link function,
# not much changes here, so without further explanation we go straight to prediction with the multinom function from the nnet package.
# Looking at the summary, you can see that coefficients are estimated separately for each level.
library(nnet)

model <- multinom(cmc ~ . , data = cmcData)
summary(model)
cmcData$pred <- predict(model, cmcData)
cmcData <- cbind(cmcData, as.data.frame(predict(model, cmcData, type = "prob")))

table(cmcData$pred, cmcData$cmc)
```
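
From the confusion table, overall accuracy is simply the share of matching predictions (a one-line sketch):

```{r}
# Overall accuracy: proportion of rows where the predicted class equals the actual one.
mean(cmcData$pred == cmcData$cmc)
```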


```{r}
# When type = "prob" is passed to predict, it returns a matrix of the probability of each level.
head(cmcData[c("cmc", "pred", "1", "2", "3")])
```

```{r}
# Using vglm from the VGAM package, the model summary also reports significance values.
cmcData <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data", stringsAsFactors=FALSE, header=F)
colnames(cmcData) <- c("wife_age", "wife_edu", "hus_edu", "num_child", "wife_rel", "wife_work", "hus_occu", "sil", "media_exp", "cmc")

library(VGAM)
model <- vglm(cmc ~ . , family = multinomial(refLevel = 1), data = cmcData)

exp(coef(model))
summary(model)
```

```{r}
# The mlogit package is another option worth trying.
# https://cran.r-project.org/web/packages/mlogit/vignettes/mlogit.pdf
```


## Poisson Regression 
```{r}
# ref : https://onlinecourses.science.psu.edu/stat504/node/165
# ref : http://wiki.stat.ucla.edu/socr/index.php/AP_Statistics_Curriculum_2007_Distrib_Poisson
# ref : http://data.princeton.edu/wws509/R/c4s1.html
# ref : https://stats.idre.ucla.edu/r/dae/poisson-regression/
# ref : http://r-bong.blogspot.kr/2016/11/blog-post.html
# ref : http://www.statground.org/?module=file&act=procFileDownload&file_srl=1793&sid=3c7f2d1d1d2c1543a97cf0f6144a5e90&module_srl=137
#
#1. Random component : Poisson, QuasiPoisson Distribution ( Y )
#2. Systematic component : Explanatory var. ( can be continuous / discrete | Xs )
#3. Link function : Log , Identity
#
# Cases where a web service may run into a Poisson Distribution:
# O2O delivery orders, call-dispatch requests,
# and all sorts of other count data ( the closer to the physical world, the more so ).
```
![](/Users/CA/Downloads/regressions.png)

```{r}
# Poisson Distribution
# Discret Prob. Distribution > Binomial Distribution , Poisson Distribution 
```

```{r}
# Real service data would be ideal, but since this is to be published on rpubs,
# we use http://data.princeton.edu/wws509/datasets/#ceb instead.
#
library(foreign)
data <- read.dta("http://data.princeton.edu/wws509/datasets/ceb.dta")

library(dplyr)
glimpse(data)
```

```{r}
par(mfrow = c(3,2))
plot(density(data$n))
plot(density(data$mean))
car::qqPlot(data$n)

# Clearly the variance increases with the mean.
upper_20 <- subset(data, n >= 20)
plot(log(upper_20$mean), log(upper_20$var))
abline(0,1)

# dependent variable : y = mean number of children ever born * number of women in the cell 
data$y <- round(data$mean * data$n, 0)

plot(density(data$y))
plot(data$n, data$y)
```

```{r}


# If the number of CEB to one woman in a given cell is a Poisson random variable with mean (and variance) μ, 
# then the number born to all n women in that cell is a Poisson r.v. with mean (and variance) nμ. 
# The log of the expected sum is log(n)+log(μ), and consists of a known offset and the quantity we are interested in modeling. 
#
# 1. Since y will tend to grow in proportion to the number of women in the cell,
#    we use a model with Y ~ poisson(μ), a Poisson distribution with mean μ. ( Y is a discrete variable )
#
# 2. ln(μ) = ln(n) + intercept + x1 + x2 ... // the first term, ln(n), is what we call the offset, and log is the Poisson link function ( a log-linear model ).
#    Fitting the intercept-only model with this offset gives the coefficient below; exponentiated, it equals the total number of children born divided by the total number of women.

data$os <- log(data$n) 

model.0 <- glm( y ~ 1, offset = os, data = data, family = poisson)
summary(model.0)
paste("exp(coef(model.0)) : ", exp(coef(model.0)),  " == sum(data$y)/sum(data$n) : ", sum(data$y)/sum(data$n))
```

```{r}
model.res <- glm( y ~ res, offset = os, data = data, family = poisson)
summary(model.res)
exp(coef(model.res))
c(deviance(model.0)-deviance(model.res), deviance(model.res))
```
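
The drop in deviance between the nested models can be tested against a chi-square distribution with df equal to the difference in parameters, the same `pchisq` idea used for the model table further below (a sketch):

```{r}
# Likelihood-ratio style test: the deviance difference is ~ chi-square under H0.
pchisq(deviance(model.0) - deviance(model.res),
       df = df.residual(model.0) - df.residual(model.res),
       lower.tail = FALSE)
```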

```{r}
all_subset_regression <- regsubsets(
  y ~ offset(os) + dur + res + educ, 
  data = data, 
  nbest = 2
)
plot(all_subset_regression, scale = "bic") # scale = [rsq|rss|adjr2|cp|bic|outmat]
```

```{r}
model.all <- glm( y ~ dur + res + educ, offset = os, data = data, family = poisson)

summary(model.all)
exp(coef(model.all))
c(deviance(model.0)-deviance(model.all), deviance(model.all))
```
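
The random component above also lists QuasiPoisson; as a quick overdispersion check (a sketch), the same model can be refit with family = quasipoisson and the estimated dispersion inspected. A value near 1 supports the plain Poisson assumption:

```{r}
# With quasipoisson, summary() estimates the dispersion parameter instead of fixing it at 1.
model.quasi <- glm( y ~ dur + res + educ, offset = os, data = data, family = quasipoisson)
summary(model.quasi)$dispersion
```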

```{r}
dur_count <- t(data %>% dplyr::group_by(dur) %>% summarise(mean = sum(mean) / n()))
res_count <- t(data %>% dplyr::group_by(res) %>% summarise(mean = sum(mean) / n()))

count <- cbind(dur_count, res_count)
colnames(count) <- as.character(unlist(count[1, ]))
count[rownames(count) == "mean", ]
```

```{r}
models <- c("dur+res*educ","dur*res+educ","dur*educ+res",
             "(dur+res)*educ","(dur+educ)*res","dur*(res+educ)",
             "dur*res*educ - dur:res:educ")

dd <- matrix(0, length(models), 2)

i <- 1
for(model in models) {
   formula <- paste("y ~ offset(os) + ",model)
   m <- glm(formula, family=poisson, data=data)
   dd[i,] <- c(deviance(m), m$df.residual)
   i <- i+1
 }

data.frame( model=models,
            deviance=round(dd[,1],2), 
            pval=round(pchisq(dd[,1], dd[,2], lower.tail = FALSE), 4))
```

```{r}
# Above, we tried evaluating every subset of the features that go into the formula,
# and saw that it gets very slow when the number of combinations is large.
# In such cases, Ridge and Lasso Regression are among the techniques that reduce the computational burden.
```

## Model Selection ( Statistical Techniques )
```{r}
# Before getting into Ridge and Lasso below, let's dig into Model Selection.
# 
# ref : https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/model_selection.pdf
# ref : http://sosal.kr/868
# ref : http://www.statground.org/?module=file&act=procFileDownload&file_srl=9145&sid=a0a4e7fd21503e45194d485ccb0c5476&module_srl=137
#
# 1. Subset Selection : like the regsubsets function ( estimate model based on AIC , BIC , ADJR2 , CP ..  ) 
#                     : Stepwise Selection ( Backward / Forward ) or regsubsets ( what we did above )
# 2. Shrinkage ( Regularization ) : Lasso , Ridge , ElasticNet
#                                 : shrinks the coefficient estimates towards zero
# 3. Dimension Reduction : PCA 
# 
# Occam's Razor in Model Selection ( explain with the fewest features / explanatory variables )
## All things being equal, usually the simplest explanation of a phenomenon is a good hypothesis.
## Simplicity = representational succinctness 
## The principle of parsimony in statistical modeling:
#  - A simple, plain explanation is better than a complicated one.
#  - A model should have as few parameters as possible.
#  - A linear model is preferable to a nonlinear one.
#  - An experiment with few assumptions is better than one with many.
#  - When fitting a regression model to data, shrink it down until a minimal adequate fit is reached.
## Simplification checklist:
#  - Remove non-significant interaction terms.
#  - Remove non-significant quadratic or other nonlinear terms.
#  - Remove non-significant explanatory variables.
#  - Merge explanatory variables that have similar parameter values.
#
# For the coefficient vector of a linear regression model, the slightly shrunken ridge solution
# is known to give better predictive performance than the least squares solution.
# Using the lasso solution, which collapses some elements of the coefficient vector to zero, has the added benefit of a more parsimonious model.
```

## Ridge Regression ( Linear Regression with L2 regularization,  α=0  )
```{r}
# ref : https://onlinecourses.science.psu.edu/stat857/node/155
# ref : https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Ridge_Regression.pdf
# ref : https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/model_selection.pdf
# ref : https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
# ref : https://drsimonj.svbtle.com/ridge-regression-with-glmnet
# ref : https://gerardnico.com/wiki/lang/r/ridge_lasso#formula_parameters
# ref : http://r-bong.blogspot.kr/p/blog-page_4.html
# 
# Motivation: too many predictors
# Motivation: ill-conditioned X
# Motivation: resolving multicollinearity
```
![](/Users/CA/Downloads/ridge.png)

```{r}
library(glmnet)
data(BinomialExample)

glimpse(as.data.frame(x))

model.ridge <- glmnet(x, y, family = "binomial", alpha = 0 )

par(mfrow=c(2,2))
plot(density(y))
plot(model.ridge, xvar="lambda", label=TRUE)
plot(model.ridge, xvar="norm", label=TRUE)
plot(model.ridge, xvar="dev", label=TRUE)
```

```{r}
cv.ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial", nfolds = 100)
plot(cv.ridge)

coef(cv.ridge, s = cv.ridge$lambda.min)
```

```{r}
library(doMC)
registerDoMC(cores = 4)
cv.ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial", nfolds = 100, type.measure = "class")
plot(cv.ridge)
```


```{r}
foldid <- sample(1:10,size=length(y),replace=TRUE)
cv.1  <- cv.glmnet(x,y,foldid=foldid,alpha=1)
cv.05 <- cv.glmnet(x,y,foldid=foldid,alpha=.5)
cv.0  <- cv.glmnet(x,y,foldid=foldid,alpha=0)

par(mfrow=c(2,2))
plot(cv.1);plot(cv.05);plot(cv.0)
plot(log(cv.1$lambda),cv.1$cvm,pch=19,col="red",xlab="log(Lambda)",ylab=cv.1$name)
points(log(cv.05$lambda),cv.05$cvm,pch=19,col="grey")
points(log(cv.0$lambda),cv.0$cvm,pch=19,col="blue")
legend("topleft",legend=c("alpha= 1","alpha= .5","alpha 0"),pch=19,col=c("red","grey","blue"))
```

## Lasso Regression 
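
A minimal sketch, reusing x and y from data(BinomialExample) above: lasso is the alpha = 1 end of glmnet, and its L1 penalty drives some coefficients exactly to zero, which doubles as variable selection.

```{r}
# Sketch: alpha = 1 is lasso ( L1 penalty ).
model.lasso <- glmnet(x, y, family = "binomial", alpha = 1)
plot(model.lasso, xvar = "lambda", label = TRUE)  # coefficients hit exactly 0 as lambda grows

cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
coef(cv.lasso, s = cv.lasso$lambda.min)  # a sparse coefficient vector
```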

## ElasticNet Regression
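
Likewise a sketch for the elastic net, which mixes the L2 and L1 penalties through alpha ( 0 < α < 1 ); alpha = 0.5 below matches the cv.05 fit in the ridge section.

```{r}
# Sketch: elastic net with an equal mix of ridge ( L2 ) and lasso ( L1 ) penalties.
model.enet <- glmnet(x, y, family = "binomial", alpha = 0.5)
plot(model.enet, xvar = "lambda", label = TRUE)
```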






















