library(ggplot2)
library(dplyr)
library(caret)
library(mlbench)
library(caTools)
# Load the project data from the .rda file in the working directory.
# NOTE(review): the file is presumably what defines the `train` object used
# below -- confirm against the contents of data_proyecto.rda.
load("data_proyecto.rda")
# List the column names of `train`.
names(train)
 [1] "state"         "src"           "zip"           "hour"         
 [5] "placement"     "provider_name" "cpc"           "ctr"          
 [9] "ecpm"          "brand_name"    "position"      "weekday"      
[13] "click"        

Shuffle (mezcla aleatoria) de los datos

# Shuffle the rows of `train`; the shuffled copy (`dataset`) is what gets split
# into the training and test subsets.
# seq_len()/sample.int() are used instead of 1:nrow(train) so that an empty
# data frame yields an empty index rather than c(1, 0).
# NOTE(review): no RNG seed is set, so every run produces a different
# permutation; call set.seed() before this step if the split must be
# reproducible.
index <- seq_len(nrow(train))           # row positions 1..n
shuff_index <- sample.int(nrow(train))  # random permutation of those positions
dataset <- train[shuff_index, ]         # shuffled data frame

Generamos datos para entrenamiento y pruebas

# Split the shuffled data 70/30 into training and test subsets.
# The cut point is floored to a whole number of rows. With a fractional cut
# such as 0.7 * nrow(dataset), `(cut + 1):nrow(dataset)` generates fractional
# indices; these are truncated when subsetting and the sequence can stop short
# of the final row, silently dropping it from the test subset.
train_fraction <- 0.7
n_rows <- nrow(dataset)
n_train <- floor(train_fraction * n_rows)
train_rows <- seq_len(n_train)
test_rows <- n_train + seq_len(n_rows - n_train)  # every remaining row
train_dataset <- dataset[train_rows, ]  # training subset
test_dataset <- dataset[test_rows, ]    # test subset
head(test_dataset)

Arreglos a dataset de entrenamiento:

# Type clean-up for the training subset: `click` and `src` are coerced with
# as.numeric(); `cpc` and `ecpm` are passed through as.vector().
# NOTE(review): if `click` or `src` were stored as factors, as.numeric() would
# return the internal level codes rather than the original values -- confirm
# the column types in `train`.
train_dataset[c("click", "src")] <-
  lapply(train_dataset[c("click", "src")], as.numeric)
train_dataset[c("cpc", "ecpm")] <-
  lapply(train_dataset[c("cpc", "ecpm")], as.vector)

Creamos variables categóricas (indicadoras) para el set de datos de entrenamiento

# Recode `placement` as a binary indicator: 0 for "A", 1 for any other value.
train_dataset$placement <- ifelse(train_dataset$placement == "A", 0, 1)

# Build 0/1 indicator (dummy) columns for the categorical predictors.
# Each indicator is 1 when the row's value equals the level in the column name
# and 0 otherwise (NA when the source value is NA).

# weekday
train_dataset <- train_dataset %>%
  mutate(
    weekday.TUE = ifelse(weekday == "TUE", 1, 0),
    weekday.THU = ifelse(weekday == "THU", 1, 0),
    weekday.SUN = ifelse(weekday == "SUN", 1, 0),
    weekday.FRI = ifelse(weekday == "FRI", 1, 0),
    weekday.SAT = ifelse(weekday == "SAT", 1, 0),
    weekday.WED = ifelse(weekday == "WED", 1, 0),
    weekday.MON = ifelse(weekday == "MON", 1, 0)
  )

# state
train_dataset <- train_dataset %>%
  mutate(
    state.77d6b684 = ifelse(state == "77d6b684", 1, 0),
    state.97e4e7b7 = ifelse(state == "97e4e7b7", 1, 0),
    state.4bdb89dd = ifelse(state == "4bdb89dd", 1, 0),
    state.f38822b3 = ifelse(state == "f38822b3", 1, 0),
    state.f04eaa4b = ifelse(state == "f04eaa4b", 1, 0),
    state.92676321 = ifelse(state == "92676321", 1, 0),
    state.26e1d51b = ifelse(state == "26e1d51b", 1, 0),
    state.92600ee5 = ifelse(state == "92600ee5", 1, 0),
    state.dd26f5e6 = ifelse(state == "dd26f5e6", 1, 0),
    state.a8283dd2 = ifelse(state == "a8283dd2", 1, 0),
    state.8ab3223a = ifelse(state == "8ab3223a", 1, 0),
    state.63d0870f = ifelse(state == "63d0870f", 1, 0),
    state.c43dc4a7 = ifelse(state == "c43dc4a7", 1, 0),
    state.bf0f3c8c = ifelse(state == "bf0f3c8c", 1, 0),
    state.d1f48576 = ifelse(state == "d1f48576", 1, 0)
  )

# provider_name
# Fix: these indicators previously tested `state` (copy-paste from the block
# above), so they never reflected the provider. They now test `provider_name`.
# NOTE(review): "d886ded" has 7 characters while the other codes have 8 --
# verify it against the actual levels of `provider_name`.
train_dataset <- train_dataset %>%
  mutate(
    provider_name.b985a6c6 = ifelse(provider_name == "b985a6c6", 1, 0),
    provider_name.af8e115f = ifelse(provider_name == "af8e115f", 1, 0),
    provider_name.d886ded = ifelse(provider_name == "d886ded", 1, 0)
  )

Arreglos a dataset de prueba:

# Type clean-up for the test subset: `click` and `src` are coerced with
# as.numeric(); `cpc` and `ecpm` are passed through as.vector(); `placement`
# is recoded to 0 for "A" and 1 for any other value.
test_dataset[c("click", "src")] <-
  lapply(test_dataset[c("click", "src")], as.numeric)
test_dataset[c("cpc", "ecpm")] <-
  lapply(test_dataset[c("cpc", "ecpm")], as.vector)
test_dataset[["placement"]] <-
  ifelse(test_dataset[["placement"]] == "A", 0, 1)

Creamos variables categóricas (indicadoras) para el set de datos de prueba

# Recode `placement` as a binary indicator (0 for "A", 1 otherwise), but only
# if it has not been recoded yet.
# Fix: the test-set clean-up step already performs this recode. Running it a
# second time compares the numbers 0/1 against "A", which is never true, so
# every test row ended up with placement = 1.
if (!is.numeric(test_dataset$placement)) {
  test_dataset$placement <- ifelse(test_dataset$placement == "A", 0, 1)
}

# Build 0/1 indicator (dummy) columns for the categorical predictors.
# Each indicator is 1 when the row's value equals the level in the column name
# and 0 otherwise (NA when the source value is NA).

# weekday
test_dataset <- test_dataset %>%
  mutate(
    weekday.TUE = ifelse(weekday == "TUE", 1, 0),
    weekday.THU = ifelse(weekday == "THU", 1, 0),
    weekday.SUN = ifelse(weekday == "SUN", 1, 0),
    weekday.FRI = ifelse(weekday == "FRI", 1, 0),
    weekday.SAT = ifelse(weekday == "SAT", 1, 0),
    weekday.WED = ifelse(weekday == "WED", 1, 0),
    weekday.MON = ifelse(weekday == "MON", 1, 0)
  )

# state
test_dataset <- test_dataset %>%
  mutate(
    state.77d6b684 = ifelse(state == "77d6b684", 1, 0),
    state.97e4e7b7 = ifelse(state == "97e4e7b7", 1, 0),
    state.4bdb89dd = ifelse(state == "4bdb89dd", 1, 0),
    state.f38822b3 = ifelse(state == "f38822b3", 1, 0),
    state.f04eaa4b = ifelse(state == "f04eaa4b", 1, 0),
    state.92676321 = ifelse(state == "92676321", 1, 0),
    state.26e1d51b = ifelse(state == "26e1d51b", 1, 0),
    state.92600ee5 = ifelse(state == "92600ee5", 1, 0),
    state.dd26f5e6 = ifelse(state == "dd26f5e6", 1, 0),
    state.a8283dd2 = ifelse(state == "a8283dd2", 1, 0),
    state.8ab3223a = ifelse(state == "8ab3223a", 1, 0),
    state.63d0870f = ifelse(state == "63d0870f", 1, 0),
    state.c43dc4a7 = ifelse(state == "c43dc4a7", 1, 0),
    state.bf0f3c8c = ifelse(state == "bf0f3c8c", 1, 0),
    state.d1f48576 = ifelse(state == "d1f48576", 1, 0)
  )

# provider_name
# Fix: these indicators previously tested `state` (copy-paste from the block
# above), so they never reflected the provider. They now test `provider_name`.
# NOTE(review): "d886ded" has 7 characters while the other codes have 8 --
# verify it against the actual levels of `provider_name`.
test_dataset <- test_dataset %>%
  mutate(
    provider_name.b985a6c6 = ifelse(provider_name == "b985a6c6", 1, 0),
    provider_name.af8e115f = ifelse(provider_name == "af8e115f", 1, 0),
    provider_name.d886ded = ifelse(provider_name == "d886ded", 1, 0)
  )

Modelo de regresión logística con todas las variables, excepto ‘zip’ y ‘brand_name’, ya que tienen demasiadas categorías distintas (las variables indicadoras de ‘provider_name’ tampoco se incluyen en la fórmula):

# Full logistic regression of `click` on the numeric predictors plus the
# weekday and state indicators. `zip` and `brand_name` are left out because
# they have too many distinct values.
# weekday.WED and state.97e4e7b7 are not in the formula, so those levels are
# absorbed into the intercept.
Logit_model1 <- glm(
  click ~ src + hour + placement + cpc + ctr + ecpm + position +
    weekday.MON + weekday.FRI + weekday.SUN +
    weekday.TUE + weekday.THU + weekday.SAT +
    state.f38822b3 + state.4bdb89dd + state.92600ee5 + state.bf0f3c8c +
    state.f04eaa4b + state.a8283dd2 + state.dd26f5e6 + state.c43dc4a7 +
    state.63d0870f + state.77d6b684 + state.8ab3223a + state.26e1d51b +
    state.92676321 + state.d1f48576,
  family = binomial(link = "logit"),
  data = train_dataset
)
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table and deviance statistics of the full model.
summary(Logit_model1)

Call:
glm(formula = click ~ src + hour + placement + cpc + ctr + ecpm + 
    position + weekday.MON + weekday.FRI + weekday.SUN + weekday.TUE + 
    weekday.THU + weekday.SAT + state.f38822b3 + state.4bdb89dd + 
    state.92600ee5 + state.bf0f3c8c + state.f04eaa4b + state.a8283dd2 + 
    state.dd26f5e6 + state.c43dc4a7 + state.63d0870f + state.77d6b684 + 
    state.8ab3223a + state.26e1d51b + state.92676321 + state.d1f48576, 
    family = binomial(link = "logit"), data = train_dataset)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-8.4904  -0.2747  -0.1665  -0.0987   3.6796  

Coefficients:
                 Estimate Std. Error  z value Pr(>|z|)    
(Intercept)    -3.6212231  0.0313934 -115.350  < 2e-16 ***
src            -0.0007771  0.0001651   -4.706 2.53e-06 ***
hour           -0.0245923  0.0007841  -31.362  < 2e-16 ***
placement       1.3919191  0.0127543  109.133  < 2e-16 ***
cpc             0.4066732  0.0071148   57.159  < 2e-16 ***
ctr            17.1704597  0.2394683   71.702  < 2e-16 ***
ecpm           -0.4431471  0.0096228  -46.052  < 2e-16 ***
position       -0.5974681  0.0056986 -104.844  < 2e-16 ***
weekday.MON     0.0428716  0.0182530    2.349 0.018837 *  
weekday.FRI     0.0679309  0.0200902    3.381 0.000721 ***
weekday.SUN     0.6340413  0.0209855   30.213  < 2e-16 ***
weekday.TUE     0.0090771  0.0183994    0.493 0.621777    
weekday.THU    -0.0262560  0.0192433   -1.364 0.172436    
weekday.SAT     0.5322960  0.0210327   25.308  < 2e-16 ***
state.f38822b3 -0.4944600  0.0278503  -17.754  < 2e-16 ***
state.4bdb89dd -0.4259786  0.0329094  -12.944  < 2e-16 ***
state.92600ee5 -0.6905151  0.0270756  -25.503  < 2e-16 ***
state.bf0f3c8c -0.3701961  0.0344010  -10.761  < 2e-16 ***
state.f04eaa4b -0.6774949  0.0268554  -25.228  < 2e-16 ***
state.a8283dd2 -0.0056743  0.0293590   -0.193 0.846746    
state.dd26f5e6  0.2249447  0.0323901    6.945 3.79e-12 ***
state.c43dc4a7 -0.4785729  0.0309811  -15.447  < 2e-16 ***
state.63d0870f -0.1184034  0.0382737   -3.094 0.001977 ** 
state.77d6b684 -0.4972242  0.0306970  -16.198  < 2e-16 ***
state.8ab3223a  0.2078498  0.0367659    5.653 1.57e-08 ***
state.26e1d51b -0.6091662  0.0316434  -19.251  < 2e-16 ***
state.92676321 -0.0128167  0.0312440   -0.410 0.681649    
state.d1f48576  0.5986194  0.0341215   17.544  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 383615  on 1225773  degrees of freedom
Residual deviance: 312629  on 1225746  degrees of freedom
AIC: 312685

Number of Fisher Scoring iterations: 8

Ahora, con la información proporcionada por el modelo general, eliminaremos las variables que son menos significativas:

Eliminamos las variables indicadoras ‘MON’, ‘TUE’ y ‘THU’ de la variable weekday (‘WED’ ya estaba fuera del modelo general) y, además, las variables indicadoras ‘a8283dd2’ y ‘92676321’ de la variable state.

# Reduced logistic regression: same specification as the full model but
# without weekday.MON, weekday.TUE, weekday.THU, state.a8283dd2 and
# state.92676321.
Logit_model2 <- glm(
  click ~ src + hour + placement + cpc + ctr + ecpm + position +
    weekday.FRI + weekday.SUN + weekday.SAT +
    state.f38822b3 + state.4bdb89dd + state.92600ee5 + state.bf0f3c8c +
    state.f04eaa4b + state.dd26f5e6 + state.c43dc4a7 + state.63d0870f +
    state.77d6b684 + state.8ab3223a + state.26e1d51b + state.d1f48576,
  family = binomial(link = "logit"),
  data = train_dataset
)
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table and deviance statistics of the reduced model.
summary(Logit_model2)

Call:
glm(formula = click ~ src + hour + placement + cpc + ctr + ecpm + 
    position + weekday.FRI + weekday.SUN + weekday.SAT + state.f38822b3 + 
    state.4bdb89dd + state.92600ee5 + state.bf0f3c8c + state.f04eaa4b + 
    state.dd26f5e6 + state.c43dc4a7 + state.63d0870f + state.77d6b684 + 
    state.8ab3223a + state.26e1d51b + state.d1f48576, family = binomial(link = "logit"), 
    data = train_dataset)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-8.4904  -0.2747  -0.1666  -0.0988   3.6889  

Coefficients:
                 Estimate Std. Error  z value Pr(>|z|)    
(Intercept)    -3.6196518  0.0214054 -169.100  < 2e-16 ***
src            -0.0007795  0.0001651   -4.722 2.33e-06 ***
hour           -0.0245630  0.0007840  -31.329  < 2e-16 ***
placement       1.3917827  0.0127512  109.149  < 2e-16 ***
cpc             0.4064843  0.0071135   57.142  < 2e-16 ***
ctr            17.1695170  0.2393922   71.721  < 2e-16 ***
ecpm           -0.4428990  0.0096174  -46.052  < 2e-16 ***
position       -0.5975813  0.0056983 -104.871  < 2e-16 ***
weekday.FRI     0.0598121  0.0163981    3.647 0.000265 ***
weekday.SUN     0.6257532  0.0174566   35.846  < 2e-16 ***
weekday.SAT     0.5239950  0.0175186   29.911  < 2e-16 ***
state.f38822b3 -0.4874646  0.0198263  -24.587  < 2e-16 ***
state.4bdb89dd -0.4197283  0.0264956  -15.841  < 2e-16 ***
state.92600ee5 -0.6834597  0.0188096  -36.336  < 2e-16 ***
state.bf0f3c8c -0.3639665  0.0283250  -12.850  < 2e-16 ***
state.f04eaa4b -0.6709813  0.0184795  -36.309  < 2e-16 ***
state.dd26f5e6  0.2315773  0.0257916    8.979  < 2e-16 ***
state.c43dc4a7 -0.4723663  0.0240602  -19.633  < 2e-16 ***
state.63d0870f -0.1118275  0.0329002   -3.399 0.000676 ***
state.77d6b684 -0.4906320  0.0236883  -20.712  < 2e-16 ***
state.8ab3223a  0.2140548  0.0311225    6.878 6.08e-12 ***
state.26e1d51b -0.6029681  0.0249212  -24.195  < 2e-16 ***
state.d1f48576  0.6048629  0.0279122   21.670  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 383615  on 1225773  degrees of freedom
Residual deviance: 312643  on 1225751  degrees of freedom
AIC: 312689

Number of Fisher Scoring iterations: 8

Al parecer todas estas variables son significativas, por lo que haremos una predicción.

# Predicted click probabilities (type = "response") from the reduced model for
# every row of the test subset, stored as a one-column data frame whose column
# is also named Logit_Predict_LM2.
Logit_Predict_LM2 <- data.frame(
  Logit_Predict_LM2 = predict(
    Logit_model2,
    newdata = test_dataset,
    type = "response"
  )
)
# Number of rows in the test subset, for comparison with the prediction count.
nrow(test_dataset)
[1] 525331
# Number of predictions produced; should equal nrow(test_dataset).
nrow(Logit_Predict_LM2)
[1] 525331
# Display the predicted probabilities.
Logit_Predict_LM2

# Side-by-side comparison of the observed outcome, the predicted probability
# and the predicted class. A row is classified as a click (1) when its
# probability is at least 0.1, otherwise as a non-click (0).
Compare_LM2_dataset <- data.frame(
  observacion = test_dataset$click,
  Probabilidad = Logit_Predict_LM2[["Logit_Predict_LM2"]]
)
Compare_LM2_dataset$Prediccion <-
  as.numeric(Compare_LM2_dataset$Probabilidad >= 0.1)
Compare_LM2_dataset

Análisis de Rendimiento

# Performance analysis: tally correct and incorrect predictions separately for
# observed clicks (observacion == 1) and observed non-clicks (observacion == 0).
# The counts are computed with vectorised comparisons instead of a row-by-row
# loop, which also handles an empty comparison table (1:nrow() would not).
observed <- Compare_LM2_dataset$observacion
predicted <- Compare_LM2_dataset$Prediccion

# Missing values would make the sums below NA; fail loudly instead.
stopifnot(!anyNA(observed), !anyNA(predicted))

countYes <- sum(observed == 1 & predicted == 1)      # clicks predicted as 1
countYesFail <- sum(observed == 1 & predicted != 1)  # clicks predicted as 0
countNo <- sum(observed == 0 & predicted == 0)       # non-clicks predicted as 0
countNoFail <- sum(observed == 0 & predicted != 0)   # non-clicks predicted as 1

TotalNo <- countNo + countNoFail
TotalYes <- countYes + countYesFail

# Share of observed clicks that were predicted as clicks (sensitivity).
countYes/TotalYes
[1] 0.5806418
# Share of observed non-clicks that were predicted as non-clicks (specificity).
countNo/TotalNo
[1] 0.849926
