#———————————————– #———————————————– # 0. Cargando los datos #———————————————– #———————————————–

Seguro_Antirobo<-read.csv(file="0.DS_Seguro_Antirobos.csv",header=TRUE,sep=";")

XLS: 0. DS_Seguro_Antirobo.xls

XLS: 1. Analisis y modelamiento.xlsx

#Seguro_Antirobo<-X0_DS_Seguro_Antirobo

nrow(Seguro_Antirobo)
## [1] 28313
ncol(Seguro_Antirobo)
## [1] 12
head(Seguro_Antirobo)
##   Mto_TC MARCA  Nombre_territorio FLAG_LIMA_PROVINCIA         REGION
## 1   5000  Visa           T.CENTRO                   0 SIERRA CENTRAL
## 2    750  Visa  T.SURCO LA MOLINA                   1   LIMA MODERNA
## 3  12000  Visa           T.CENTRO                   0         CENTRO
## 4   2200  Visa T.LIMA RESIDENCIAL                   0          NORTE
## 5   5000  Visa           T.CENTRO                   1      LIMA ESTE
## 6   2400  Visa           T.CENTRO                   0         CENTRO
##   SUELDO_ESTIMADO EDAD SEXO ANTIGUEDAD_MES   SEGMENTO FLAG_SS FLAG_UNICEF
## 1        1000.000   24    F             28    CLASICO      NA          NA
## 2        2309.875   30    F            146    CLASICO      NA          NA
## 3        6541.000   39    M             24    CLASICO      NA          NA
## 4        2196.330   28    M             10 BAJO VALOR      NA          NA
## 5         578.000   41    M             NA                 NA          NA
## 6         899.000   28    M             76    CLASICO      NA          NA
str(Seguro_Antirobo)
## 'data.frame':    28313 obs. of  12 variables:
##  $ Mto_TC             : int  5000 750 12000 2200 5000 2400 6500 14000 32500 1600 ...
##  $ MARCA              : chr  "Visa" "Visa" "Visa" "Visa" ...
##  $ Nombre_territorio  : chr  "T.CENTRO" "T.SURCO LA MOLINA" "T.CENTRO" "T.LIMA RESIDENCIAL" ...
##  $ FLAG_LIMA_PROVINCIA: int  0 1 0 0 1 0 0 0 0 0 ...
##  $ REGION             : chr  "SIERRA CENTRAL" "LIMA MODERNA" "CENTRO" "NORTE" ...
##  $ SUELDO_ESTIMADO    : num  1000 2310 6541 2196 578 ...
##  $ EDAD               : int  24 30 39 28 41 28 30 31 41 48 ...
##  $ SEXO               : chr  "F" "F" "M" "M" ...
##  $ ANTIGUEDAD_MES     : int  28 146 24 10 NA 76 36 NA 100 133 ...
##  $ SEGMENTO           : chr  "CLASICO" "CLASICO" "CLASICO" "BAJO VALOR" ...
##  $ FLAG_SS            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ FLAG_UNICEF        : int  NA NA NA NA NA NA NA NA NA NA ...
summary(Seguro_Antirobo)
##      Mto_TC          MARCA           Nombre_territorio  FLAG_LIMA_PROVINCIA
##  Min.   :     0   Length:28313       Length:28313       Min.   :0.0000     
##  1st Qu.:  1700   Class :character   Class :character   1st Qu.:0.0000     
##  Median :  5100   Mode  :character   Mode  :character   Median :1.0000     
##  Mean   :  7865                                         Mean   :0.5626     
##  3rd Qu.:  9500                                         3rd Qu.:1.0000     
##  Max.   :169455                                         Max.   :1.0000     
##                                                         NA's   :340        
##     REGION          SUELDO_ESTIMADO       EDAD           SEXO          
##  Length:28313       Min.   :     1   Min.   :18.00   Length:28313      
##  Class :character   1st Qu.:  1178   1st Qu.:29.00   Class :character  
##  Mode  :character   Median :  1748   Median :36.00   Mode  :character  
##                     Mean   :  2580   Mean   :38.25                     
##                     3rd Qu.:  2762   3rd Qu.:45.00                     
##                     Max.   :520650   Max.   :84.00                     
##                     NA's   :720      NA's   :424                       
##  ANTIGUEDAD_MES     SEGMENTO            FLAG_SS       FLAG_UNICEF   
##  Min.   :  1.00   Length:28313       Min.   :1       Min.   :1      
##  1st Qu.: 16.00   Class :character   1st Qu.:1       1st Qu.:1      
##  Median : 58.00   Mode  :character   Median :1       Median :1      
##  Mean   : 81.42                      Mean   :1       Mean   :1      
##  3rd Qu.:120.00                      3rd Qu.:1       3rd Qu.:1      
##  Max.   :455.00                      Max.   :1       Max.   :1      
##  NA's   :1263                        NA's   :27565   NA's   :28190

#Asignando el tipo de variables Categoricas

#——————————————– #——————————————– # 1. Exploracion de los datos cargado y sus valores nulos #——————————————– #——————————————–

Libreria de Exploracion

#————————–

library(DescTools)
#library(RDCOMClient)
#install.packages("RDCOMClient",repos = "http://www.omegahat.net")
#wrd<-GetNewWrd()
Desc(Seguro_Antirobo)
## ------------------------------------------------------------------------------ 
## Describe Seguro_Antirobo (data.frame):
## 
## data frame:  28313 obs. of  12 variables
##      5 complete cases (0.0%)
## 
##   Nr  ColName              Class      NAs            Levels
##   1   Mto_TC               integer        .                
##   2   MARCA                character      .                
##   3   Nombre_territorio    character      .                
##   4   FLAG_LIMA_PROVINCIA  integer      340 (1.2%)         
##   5   REGION               character      .                
##   6   SUELDO_ESTIMADO      numeric      720 (2.5%)         
##   7   EDAD                 integer      424 (1.5%)         
##   8   SEXO                 character      .                
##   9   ANTIGUEDAD_MES       integer     1263 (4.5%)         
##   10  SEGMENTO             character      .                
##   11  FLAG_SS              integer    27565 (97.4%)        
##   12  FLAG_UNICEF          integer    28190 (99.6%)        
## 
## 
## ------------------------------------------------------------------------------ 
## 1 - Mto_TC (integer)
## 
##       length          n       NAs    unique        0s       mean     meanCI'
##       28'313     28'313         0       320         5   7'865.34   7'738.45
##                  100.0%      0.0%                0.0%              7'992.24
##                                                                            
##          .05        .10       .25    median       .75        .90        .95
##       700.00     800.00  1'700.00  5'100.00  9'500.00  16'000.00  26'000.00
##                                                                            
##        range         sd     vcoef       mad       IQR       skew       kurt
##   169'455.00  10'893.49      1.38  5'337.36  7'800.00       4.05      22.69
##                                                                            
## lowest : 0 (5), 1 (2), 2 (297), 20 (4), 50 (3)
## highest: 98'000, 98'500 (3), 99'500, 100'000 (39), 169'455
## 
## heap(?): remarkable frequency (7.9%) for the mode(s) (= 700)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 2 - MARCA (character - dichotomous)
## 
##   length      n    NAs unique
##   28'313 28'313      0      2
##          100.0%   0.0%       
## 
##               freq   perc  lci.95  uci.95'
## Visa        28'149  99.4%   99.3%   99.5%
## MasterCard     164   0.6%    0.5%    0.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 3 - Nombre_territorio (character)
## 
##   length      n    NAs unique levels  dupes
##   28'313 28'313      0     18     18      y
##          100.0%   0.0%                     
## 
##                    level   freq   perc  cumfreq  cumperc
## 1      T.CENTROS MASIVOS  6'584  23.3%    6'584    23.3%
## 2               T.CENTRO  2'508   8.9%    9'092    32.1%
## 3      T.SURCO LA MOLINA  1'995   7.0%   11'087    39.2%
## 4         DIR.RED MASIVA  1'965   6.9%   13'052    46.1%
## 5                T.LINCE  1'948   6.9%   15'000    53.0%
## 6     T.LIMA RESIDENCIAL  1'841   6.5%   16'841    59.5%
## 7                  T.SUR  1'794   6.3%   18'635    65.8%
## 8          T.NOR ORIENTE  1'757   6.2%   20'392    72.0%
## 9   T.MIRAFLORES S.BORJA  1'540   5.4%   21'932    77.5%
## 10   T.CALLAO SAN MIGUEL  1'288   4.5%   23'220    82.0%
## 11         T.LIMA CENTRO  1'276   4.5%   24'496    86.5%
## 12     T.NORTE SUR CHICO  1'236   4.4%   25'732    90.9%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 4 - FLAG_LIMA_PROVINCIA (integer - dichotomous)
## 
##   length      n    NAs unique
##   28'313 27'973    340      2
##           98.8%   1.2%       
## 
##      freq   perc  lci.95  uci.95'
## 0  12'234  43.7%   43.2%   44.3%
## 1  15'739  56.3%   55.7%   56.8%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 5 - REGION (character)
## 
##   length      n    NAs unique levels  dupes
##   28'313 28'313      0     13     13      y
##          100.0%   0.0%                     
## 
##              level   freq   perc  cumfreq  cumperc
## 1     LIMA MODERNA  4'545  16.1%    4'545    16.1%
## 2            NORTE  3'806  13.4%    8'351    29.5%
## 3              SUR  3'226  11.4%   11'577    40.9%
## 4       LIMA NORTE  2'771   9.8%   14'348    50.7%
## 5           CENTRO  2'632   9.3%   16'980    60.0%
## 6        LIMA ESTE  2'609   9.2%   19'589    69.2%
## 7      LIMA CENTRO  2'305   8.1%   21'894    77.3%
## 8         LIMA SUR  1'813   6.4%   23'707    83.7%
## 9          ORIENTE  1'624   5.7%   25'331    89.5%
## 10          CALLAO  1'069   3.8%   26'400    93.2%
## 11  SIERRA CENTRAL    946   3.3%   27'346    96.6%
## 12  LIMA PROVINCIA    627   2.2%   27'973    98.8%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 6 - SUELDO_ESTIMADO (numeric)
## 
##        length          n        NAs     unique         0s       mean     meanCI'
##        28'313     27'593        720      9'811          0  2'579.749  2'508.135
##                    97.5%       2.5%                  0.0%             2'651.363
##                                                                                
##           .05        .10        .25     median        .75        .90        .95
##       550.000    761.875  1'178.000  1'748.000  2'761.500  4'900.000  6'833.202
##                                                                                
##         range         sd      vcoef        mad        IQR       skew       kurt
##   520'649.000  6'069.157      2.353  1'012.127  1'583.500     58.691  4'652.992
##                                                                                
## lowest : 1.0, 17.0, 18.0 (3), 26.0, 31.0
## highest: 85'195.0 (3), 91'358.0, 139'711.0 (2), 415'307.0, 520'650.0 (2)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 7 - EDAD (integer)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   28'313  27'889    424      64      0  38.25   38.11
##            98.5%   1.5%           0.0%          38.38
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##    24.00   26.00  29.00   36.00  45.00  56.00   60.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##    66.00   11.27   0.29   11.86  16.00   0.74   -0.21
##                                                      
## lowest : 18, 19, 20 (19), 21 (197), 22 (271)
## highest: 77 (3), 78 (5), 79 (3), 80 (4), 84
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 8 - SEXO (character)
## 
##   length      n    NAs unique levels  dupes
##   28'313 28'313      0      3      3      y
##          100.0%   0.0%                     
## 
##    level    freq   perc  cumfreq  cumperc
## 1      M  16'397  57.9%   16'397    57.9%
## 2      F  11'598  41.0%   27'995    98.9%
## 3            318   1.1%   28'313   100.0%

## ------------------------------------------------------------------------------ 
## 9 - ANTIGUEDAD_MES (integer)
## 
##   length       n    NAs  unique      0s    mean  meanCI'
##   28'313  27'050  1'263     351       0   81.42   80.50
##            95.5%   4.5%            0.0%           82.34
##                                                        
##      .05     .10    .25  median     .75     .90     .95
##     2.00    3.00  16.00   58.00  120.00  212.00  237.00
##                                                        
##    range      sd  vcoef     mad     IQR    skew    kurt
##   454.00   77.24   0.95   72.65  104.00    0.98    0.04
##                                                        
## lowest : 1 (858), 2 (1'302), 3 (1'617), 4 (307), 5 (275)
## highest: 432, 433, 434, 435 (3), 455
## 
## heap(?): remarkable frequency (6.0%) for the mode(s) (= 3)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 10 - SEGMENTO (character)
## 
##   length      n    NAs unique levels  dupes
##   28'313 28'313      0      7      7      y
##          100.0%   0.0%                     
## 
##            level    freq   perc  cumfreq  cumperc
## 1        CLASICO  18'719  66.1%   18'719    66.1%
## 2     BAJO VALOR   5'572  19.7%   24'291    85.8%
## 3            VIP   1'513   5.3%   25'804    91.1%
## 4                  1'301   4.6%   27'105    95.7%
## 5  Micro empresa   1'130   4.0%   28'235    99.7%
## 6       DIAMANTE      62   0.2%   28'297    99.9%
## 7        Empresa      16   0.1%   28'313   100.0%

## ------------------------------------------------------------------------------ 
## 11 - FLAG_SS (integer - dichotomous)
## 
##   length      n    NAs unique
##   28'313    748 27'565      1
##            2.6%  97.4%       
## 
##    freq    perc  lci.95  uci.95'
## 1   748  100.0%   99.5%  100.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 12 - FLAG_UNICEF (integer - dichotomous)
## 
##   length      n    NAs unique
##   28'313    123 28'190      1
##            0.4%  99.6%       
## 
##    freq    perc  lci.95  uci.95'
## 1   123  100.0%   97.0%  100.0%
## 
## ' 95%-CI (Wilson)

# Identificacion exhaustiva de valores nulos
#--------------------------------------------
install.packages("VIM")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(VIM) 
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
a=aggr(Seguro_Antirobo,numbers=T) 
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

a
## 
##  Missings in variables:
##             Variable Count
##  FLAG_LIMA_PROVINCIA   340
##      SUELDO_ESTIMADO   720
##                 EDAD   424
##       ANTIGUEDAD_MES  1263
##              FLAG_SS 27565
##          FLAG_UNICEF 28190
summary(a)
## 
##  Missings per variable: 
##             Variable Count
##               Mto_TC     0
##                MARCA     0
##    Nombre_territorio     0
##  FLAG_LIMA_PROVINCIA   340
##               REGION     0
##      SUELDO_ESTIMADO   720
##                 EDAD   424
##                 SEXO     0
##       ANTIGUEDAD_MES  1263
##             SEGMENTO     0
##              FLAG_SS 27565
##          FLAG_UNICEF 28190
## 
##  Missings in combinations of variables: 
##             Combinations Count      Percent
##  0:0:0:0:0:0:0:0:0:0:0:0     5  0.017659732
##  0:0:0:0:0:0:0:0:0:0:0:1   702  2.479426412
##  0:0:0:0:0:0:0:0:0:0:1:0   111  0.392046057
##  0:0:0:0:0:0:0:0:0:0:1:1 25410 89.746759439
##  0:0:0:0:0:0:0:0:1:0:0:0     1  0.003531946
##  0:0:0:0:0:0:0:0:1:0:0:1    16  0.056511143
##  0:0:0:0:0:0:0:0:1:0:1:0     4  0.014127786
##  0:0:0:0:0:0:0:0:1:0:1:1  1198  4.231271854
##  0:0:0:0:0:0:1:0:0:0:0:1     6  0.021191679
##  0:0:0:0:0:0:1:0:0:0:1:1    83  0.293151556
##  0:0:0:0:0:1:0:0:0:0:0:1    11  0.038851411
##  0:0:0:0:0:1:0:0:0:0:1:1   374  1.320947974
##  0:0:0:0:0:1:0:0:1:0:1:1    20  0.070638929
##  0:0:0:0:0:1:1:0:0:0:1:1    30  0.105958394
##  0:0:0:0:0:1:1:0:1:0:1:1     2  0.007063893
##  0:0:0:1:0:0:0:0:0:0:0:1     2  0.007063893
##  0:0:0:1:0:0:0:0:0:0:1:0     2  0.007063893
##  0:0:0:1:0:0:0:0:0:0:1:1    27  0.095362554
##  0:0:0:1:0:0:0:0:1:0:1:1     6  0.021191679
##  0:0:0:1:0:0:1:0:0:0:1:1    20  0.070638929
##  0:0:0:1:0:1:1:0:0:0:0:1     5  0.017659732
##  0:0:0:1:0:1:1:0:0:0:1:1   262  0.925369971
##  0:0:0:1:0:1:1:0:1:0:1:1    16  0.056511143
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Loading required package: RSQLite
sqldf("select count(*) from Seguro_Antirobo where FLAG_UNICEF is NULL")
##   count(*)
## 1    28190
### WRD 1.1. Exploracion Describe Seguros.docx

# Completando los valores nulos
#------------------------


Seguro_Antirobo$FLAG_LIMA_PROVINCIA[is.na(Seguro_Antirobo$FLAG_LIMA_PROVINCIA)]<-0
Seguro_Antirobo$SUELDO_ESTIMADO[is.na(Seguro_Antirobo$SUELDO_ESTIMADO)]<-0
Seguro_Antirobo$ANTIGUEDAD_MES[is.na(Seguro_Antirobo$ANTIGUEDAD_MES)]<-0
Seguro_Antirobo$FLAG_SS[is.na(Seguro_Antirobo$FLAG_SS)]<-0


Seguro_Antirobo$MARCA<-as.factor(Seguro_Antirobo$MARCA)
Seguro_Antirobo$FLAG_SS<-as.factor(Seguro_Antirobo$FLAG_SS)


write.csv(Seguro_Antirobo,"Datos_PowerBI.csv", row.names = FALSE)

### WRD 1.2. Exploracion Describe Seguros.docx

#https://stackoverflow.com/questions/13234005/filling-missing-values  

#-------------------------------------------------------
#-----------------------------------------------------
# 2. Seleccion de variables importantes  
#-----------------------------------------------------
#-------------------------------------------------------

install.packages("scorecard")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
# Libreria de ExploracionScorecard
#------------------------------------
library(colorspace)
library(scorecard)

# Variables mƔs importantes 
#--------------------------------
iv(Seguro_Antirobo, y="FLAG_SS")
##                variable  info_value
##  1:     SUELDO_ESTIMADO 1.096072487
##  2:              Mto_TC 0.736090635
##  3:      ANTIGUEDAD_MES 0.484898237
##  4:            SEGMENTO 0.272278268
##  5:                EDAD 0.232495668
##  6:   Nombre_territorio 0.216308648
##  7:              REGION 0.176230615
##  8:                SEXO 0.088545560
##  9: FLAG_LIMA_PROVINCIA 0.057108723
## 10:               MARCA 0.002533002
## 11:         FLAG_UNICEF 0.002418277
# Filtrado preliminar de las variables mƔs importantes
#----------------------------------------------------

dt_s = var_filter(Seguro_Antirobo, y="FLAG_SS",iv=0.1)
## [INFO] filtering variables ...
dt_s
##        Mto_TC  Nombre_territorio         REGION SUELDO_ESTIMADO EDAD
##     1:   5000           T.CENTRO SIERRA CENTRAL        1000.000   24
##     2:    750  T.SURCO LA MOLINA   LIMA MODERNA        2309.875   30
##     3:  12000           T.CENTRO         CENTRO        6541.000   39
##     4:   2200 T.LIMA RESIDENCIAL          NORTE        2196.330   28
##     5:   5000           T.CENTRO      LIMA ESTE         578.000   41
##    ---                                                              
## 28309:   1200  T.CENTROS MASIVOS      LIMA ESTE        1242.170   28
## 28310:   4300  T.CENTROS MASIVOS    LIMA CENTRO        1880.000   63
## 28311:   1700  T.CENTROS MASIVOS    LIMA CENTRO        1764.880   25
## 28312:   5100  T.CENTROS MASIVOS       LIMA SUR        1228.125   31
## 28313:   5200  T.SURCO LA MOLINA      LIMA ESTE        2764.000   58
##        ANTIGUEDAD_MES   SEGMENTO FLAG_SS
##     1:             28    CLASICO       0
##     2:            146    CLASICO       0
##     3:             24    CLASICO       0
##     4:             10 BAJO VALOR       0
##     5:              0                  0
##    ---                                  
## 28309:              6    CLASICO       0
## 28310:              3    CLASICO       0
## 28311:             37 BAJO VALOR       0
## 28312:              3    CLASICO       0
## 28313:            243 BAJO VALOR       0
# Representación grafica de variables mÔs importantes
#----------------------------------------------------
bins = woebin(dt_s, y="FLAG_SS")
## [INFO] creating woe binning ...
## Warning in rep_blank_na(dt): The blank values are replaced with NAs in the following columns:
## REGION, SEGMENTO
woebin_plot(bins)
## $Mto_TC

## 
## $Nombre_territorio

## 
## $REGION

## 
## $SUELDO_ESTIMADO

## 
## $EDAD

## 
## $ANTIGUEDAD_MES

## 
## $SEGMENTO

# Dimensionamiento de publico objetivo de gestión
#---------------------------------------

library("sqldf") 



sqldf("select count(*)Poblacion,sum(FLAG_SS)Ventas from Seguro_Antirobo where SUELDO_ESTIMADO>=800 and SUELDO_ESTIMADO<1800 and Mto_TC<6000 and ANTIGUEDAD_MES<50")
##   Poblacion Ventas
## 1      4964    237
#P.O.: 4694 clientes
237/4964*100
## [1] 4.774376
# % 2.64 grupo
# %  grupo