INTRODUCTION

Project Plan

Creating an Environment

# Load packages ----
library(lubridate) # date-time parsing and manipulation
library(leaflet)   # interactive maps
library(dplyr)     # data manipulation verbs and the %>% pipe
library(lpSolve)   # linear programming in R

# NOTE(review): machine-specific absolute path. It is kept because the relative
# file names below depend on it; consider a project-relative path instead.
setwd("/Users/Mughundhan/Data Science Intern/Bugs Viz")

# Read string columns as factors explicitly: the str()/summary() output shown
# in this report lists them as factors, and read.csv() stopped converting
# strings to factors by default in R 4.0.0.
inspection_data <- read.csv("inspection_data.csv", stringsAsFactors = TRUE)
unicodes <- read.csv("unicodes.csv", stringsAsFactors = TRUE)

Merge the data

# Give the join key the same name ("UNICODE") in both tables.
is_key_inspection <- names(inspection_data) == "UNI_CODE"
names(inspection_data)[is_key_inspection] <- "UNICODE"
is_key_unicodes <- names(unicodes) == "unicode"
names(unicodes)[is_key_unicodes] <- "UNICODE"

# Inner join on the shared key, then keep only rows with TEST_DATA equal to 0
# (rows where the comparison is NA are dropped as well).
dt <- merge(x = inspection_data, y = unicodes, by = "UNICODE")
dt <- dt[which(dt$TEST_DATA == 0), , drop = FALSE]

# Parse the "month/day/2-digit-year hour:minute" text into a POSIXct column.
dt$DATETIME1 <- as.POSIXct(dt$DATETIME, format = "%m/%d/%y %H:%M")

# Inspections ordered by user and, within a user, by time.
row_order <- order(dt$USER_NAME, dt$DATETIME1)
sorted_ins <- dt[row_order, ]

Clean the data

# Take a working copy of the merged table for the cleaning steps below.
dt1 <- dt
# Print the column types of the merged table.
str(dt)
## 'data.frame':    667 obs. of  40 variables:
##  $ UNICODE              : Factor w/ 755 levels "1.-1.1.2","1.1.1.1",..: 71 72 73 74 76 77 78 78 79 80 ...
##  $ ID                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ USER_NAME            : Factor w/ 19 levels "CC","CCP_1V",..: 11 6 6 6 6 6 6 6 6 6 ...
##  $ GROUP_NAME           : Factor w/ 6 levels "MINISTERIO_DE_SALUD",..: 1 3 3 3 3 3 3 3 3 3 ...
##  $ DATA_ACTION          : Factor w/ 2 levels "INSPECTION_NEW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ OBS_UNICODE          : int  NA 2 NA NA NA NA NA NA NA NA ...
##  $ OBS_TEXT             : logi  NA NA NA NA NA NA ...
##  $ CARACT_PREDIO        : Factor w/ 7 levels "CR","DES","LP",..: 2 6 6 6 6 6 6 6 6 6 ...
##  $ TIPO_LP              : Factor w/ 8 levels "","Banco BCP",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ STATUS_INSPECCION    : Factor w/ 10 levels "","C","ENT","I",..: 2 9 2 2 2 9 10 7 9 10 ...
##  $ MOTIVO_VOLVER        : Factor w/ 1 level "Esta de salida": NA NA NA NA NA NA NA 1 NA NA ...
##  $ INTRA_INSPECCION     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ INTRA_CHIRIS         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ INTRA_RASTROS        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PERI_INSPECCION      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PERI_CHIRIS          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PERI_RASTROS         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LUGAR_INSPECCION     : Factor w/ 3 levels "intra","intra-peri",..: NA NA NA NA NA NA 3 NA NA 2 ...
##  $ ENTREVISTA           : Factor w/ 4 levels "","cree_no_tiene",..: NA 2 NA NA NA 2 NA NA 2 NA ...
##  $ TOT_INTRA            : Factor w/ 10 levels "0","1","10","14",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ TOT_PERI             : Factor w/ 13 levels "0","1","10","100",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ RASTROS              : Factor w/ 9 levels "","0","NULL",..: NA NA NA NA NA NA 2 NA NA 2 ...
##  $ PERSONAS_PREDIO      : Factor w/ 19 levels "0","1","10","11",..: NA NA NA NA NA NA 11 NA NA 9 ...
##  $ CANT_PERROS          : Factor w/ 14 levels "0","1","10","12",..: NA NA NA NA NA NA 7 NA NA 2 ...
##  $ CANT_GATOS           : Factor w/ 8 levels "0","1","10","12",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ CANT_AVES_CORRAL     : Factor w/ 21 levels "0","1","10","100",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ CANT_CUYES           : Factor w/ 17 levels "0","1","10","11",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ CANT_CONEJOS         : Factor w/ 8 levels "0","1","15","2",..: NA NA NA NA NA NA 1 NA NA 1 ...
##  $ TEXT_OTROS           : Factor w/ 9 levels "0","Canarios",..: NA NA NA NA NA NA 2 NA NA 1 ...
##  $ CANT_OTROS           : Factor w/ 8 levels "0","1","12","2",..: NA NA NA NA NA NA 5 NA NA 1 ...
##  $ TEST_DATA            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DATETIME             : Factor w/ 661 levels "1/11/17 19:57",..: 610 391 390 389 387 386 415 385 384 383 ...
##  $ INSPECTION_FLAG      : logi  NA NA NA NA NA NA ...
##  $ PREDICTED_PROBAB     : num  9.23e-05 5.84e-11 1.39e-10 1.39e-10 2.41e-11 ...
##  $ PREDICTED_PROBAB_MEAN: num  0.0142 0.0144 0.0144 0.0144 0.0144 ...
##  $ PREDICTED_COLOR      : Factor w/ 8 levels "#808080","#BD0026",..: 2 6 5 5 6 4 3 3 4 5 ...
##  $ ANIMALES             : Factor w/ 24 levels "aves_corral",..: NA NA NA NA NA NA 24 NA NA 7 ...
##  $ LATITUDE             : num  -16.4 -16.4 -16.4 NA -16.4 ...
##  $ LONGITUDE            : num  -71.5 -71.5 -71.5 NA -71.5 ...
##  $ DATETIME1            : POSIXct, format: "2017-06-02 16:51:00" "2017-05-19 16:32:00" ...
# Make sure the working copy is a plain data frame.
dt1 <- as.data.frame(dt1)

# Columns retained for the analysis: identifiers, inspection status,
# model predictions, coordinates and both timestamp columns.
keeps <- c(
  "UNICODE", "USER_NAME", "GROUP_NAME", "DATA_ACTION",
  "CARACT_PREDIO", "STATUS_INSPECCION", "TEST_DATA", "DATETIME",
  "PREDICTED_PROBAB", "PREDICTED_PROBAB_MEAN", "PREDICTED_COLOR",
  "LATITUDE", "LONGITUDE", "DATETIME1"
)

# Column subset; drop = FALSE guarantees a data frame result.
dt2 <- dt1[, keeps, drop = FALSE]
str(dt2)
## 'data.frame':    667 obs. of  14 variables:
##  $ UNICODE              : Factor w/ 755 levels "1.-1.1.2","1.1.1.1",..: 71 72 73 74 76 77 78 78 79 80 ...
##  $ USER_NAME            : Factor w/ 19 levels "CC","CCP_1V",..: 11 6 6 6 6 6 6 6 6 6 ...
##  $ GROUP_NAME           : Factor w/ 6 levels "MINISTERIO_DE_SALUD",..: 1 3 3 3 3 3 3 3 3 3 ...
##  $ DATA_ACTION          : Factor w/ 2 levels "INSPECTION_NEW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ CARACT_PREDIO        : Factor w/ 7 levels "CR","DES","LP",..: 2 6 6 6 6 6 6 6 6 6 ...
##  $ STATUS_INSPECCION    : Factor w/ 10 levels "","C","ENT","I",..: 2 9 2 2 2 9 10 7 9 10 ...
##  $ TEST_DATA            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DATETIME             : Factor w/ 661 levels "1/11/17 19:57",..: 610 391 390 389 387 386 415 385 384 383 ...
##  $ PREDICTED_PROBAB     : num  9.23e-05 5.84e-11 1.39e-10 1.39e-10 2.41e-11 ...
##  $ PREDICTED_PROBAB_MEAN: num  0.0142 0.0144 0.0144 0.0144 0.0144 ...
##  $ PREDICTED_COLOR      : Factor w/ 8 levels "#808080","#BD0026",..: 2 6 5 5 6 4 3 3 4 5 ...
##  $ LATITUDE             : num  -16.4 -16.4 -16.4 NA -16.4 ...
##  $ LONGITUDE            : num  -71.5 -71.5 -71.5 NA -71.5 ...
##  $ DATETIME1            : POSIXct, format: "2017-06-02 16:51:00" "2017-05-19 16:32:00" ...
# Per-column summary of the reduced table (level counts, quantiles, NA counts).
summary(dt2)
##          UNICODE      USER_NAME                 GROUP_NAME 
##  1.10.38.379A:  4   CCP_1V :187   MINISTERIO_DE_SALUD:265  
##  1.10.38.1110:  3   Rt1    :179   NULL               :  9  
##  1.10.38.888 :  3   OCA_3V :130   UPCH_GROUP         :355  
##  1.10.38.908 :  3   RT1    : 75   UPCH_GROUP_SOCABAYA: 19  
##  1.10.38.917 :  3   CC     : 19   data_sasha         :  0  
##  1.10.38.1005:  2   test1  : 19   real_model1        : 19  
##  (Other)     :649   (Other): 58                            
##          DATA_ACTION       CARACT_PREDIO  STATUS_INSPECCION   TEST_DATA
##  INSPECTION_NEW:658   CR          :  0   C         :259     Min.   :0  
##  NULL          :  9   DES         : 21   inspeccion:203     1st Qu.:0  
##                       LP          :  9   entrevista:185     Median :0  
##                       LV          :  0   R         : 12     Mean   :0  
##                       casa_reg    :  0   V         :  5     3rd Qu.:0  
##                       casa_regular:637   (Other)   :  0     Max.   :0  
##                       hola        :  0   NA's      :  3                
##           DATETIME   PREDICTED_PROBAB  PREDICTED_PROBAB_MEAN
##  1/11/17 19:57: 28   Min.   :0.00000   Min.   :0.01021      
##  3/6/17 15:32 :  4   1st Qu.:0.00000   1st Qu.:0.01425      
##  6/15/17 17:26:  4   Median :0.00001   Median :0.01438      
##  1/26/17 17:46:  3   Mean   :0.02230   Mean   :0.01788      
##  3/6/17 15:04 :  3   3rd Qu.:0.00206   3rd Qu.:0.01447      
##  1/16/17 16:17:  2   Max.   :0.69381   Max.   :0.08737      
##  (Other)      :623   NA's   :106       NA's   :106          
##  PREDICTED_COLOR    LATITUDE        LONGITUDE     
##  #BD0026:120     Min.   :-16.46   Min.   :-71.52  
##  #F03B20:112     1st Qu.:-16.41   1st Qu.:-71.51  
##  #FECC5C: 96     Median :-16.41   Median :-71.51  
##  #FD8D3C: 86     Mean   :-16.41   Mean   :-71.51  
##  #FFFFB2: 70     3rd Qu.:-16.41   3rd Qu.:-71.50  
##  (Other):  4     Max.   :-16.41   Max.   :-71.50  
##  NA's   :179     NA's   :25       NA's   :25      
##    DATETIME1                  
##  Min.   :2017-01-11 19:57:00  
##  1st Qu.:2017-04-12 16:50:00  
##  Median :2017-05-15 14:05:00  
##  Mean   :2017-04-24 08:54:27  
##  3rd Qu.:2017-05-24 17:27:00  
##  Max.   :2017-06-15 17:27:00  
## 
# Keep only rows where both coordinates are present.
dt3 <- dt2[!is.na(dt2$LATITUDE) & !is.na(dt2$LONGITUDE), ]
summary(dt3)
##          UNICODE      USER_NAME                 GROUP_NAME 
##  1.10.38.1110:  3   CCP_1V :178   MINISTERIO_DE_SALUD:254  
##  1.10.38.888 :  3   Rt1    :169   NULL               :  9  
##  1.10.38.908 :  3   OCA_3V :126   UPCH_GROUP         :341  
##  1.10.38.917 :  3   RT1    : 75   UPCH_GROUP_SOCABAYA: 19  
##  1.10.38.1005:  2   CC     : 19   data_sasha         :  0  
##  1.10.38.1014:  2   test1  : 19   real_model1        : 19  
##  (Other)     :626   (Other): 56                            
##          DATA_ACTION       CARACT_PREDIO  STATUS_INSPECCION   TEST_DATA
##  INSPECTION_NEW:633   CR          :  0   C         :245     Min.   :0  
##  NULL          :  9   DES         : 19   inspeccion:199     1st Qu.:0  
##                       LP          :  9   entrevista:179     Median :0  
##                       LV          :  0   R         : 11     Mean   :0  
##                       casa_reg    :  0   V         :  5     3rd Qu.:0  
##                       casa_regular:614   (Other)   :  0     Max.   :0  
##                       hola        :  0   NA's      :  3                
##           DATETIME   PREDICTED_PROBAB  PREDICTED_PROBAB_MEAN
##  1/11/17 19:57: 28   Min.   :0.00000   Min.   :0.01021      
##  3/6/17 15:04 :  3   1st Qu.:0.00000   1st Qu.:0.01425      
##  1/16/17 16:17:  2   Median :0.00001   Median :0.01438      
##  1/17/17 16:35:  2   Mean   :0.02096   Mean   :0.01792      
##  1/19/17 19:08:  2   3rd Qu.:0.00206   3rd Qu.:0.01447      
##  1/20/17 19:12:  2   Max.   :0.69381   Max.   :0.08737      
##  (Other)      :603   NA's   :101       NA's   :101          
##  PREDICTED_COLOR    LATITUDE        LONGITUDE     
##  #BD0026:117     Min.   :-16.46   Min.   :-71.52  
##  #F03B20:112     1st Qu.:-16.41   1st Qu.:-71.51  
##  #FECC5C: 88     Median :-16.41   Median :-71.51  
##  #FD8D3C: 84     Mean   :-16.41   Mean   :-71.51  
##  #FFFFB2: 67     3rd Qu.:-16.41   3rd Qu.:-71.50  
##  (Other):  4     Max.   :-16.41   Max.   :-71.50  
##  NA's   :170                                      
##    DATETIME1                  
##  Min.   :2017-01-11 19:57:00  
##  1st Qu.:2017-04-12 16:53:45  
##  Median :2017-05-14 03:55:00  
##  Mean   :2017-04-24 11:53:26  
##  3rd Qu.:2017-05-24 16:59:00  
##  Max.   :2017-06-15 17:27:00  
## 

Analyzing the Dataset

# Number of rows that have coordinates.
dim(dt3)[1L]
## [1] 642
# Number of distinct UNICODE values among those rows; fewer than the row
# count, so some UNICODE values occur more than once.
sum(!duplicated(dt3$UNICODE))
## [1] 598

Leaflet Trial

# Clustered marker map of the rows in dt3.
# Coordinates and popup text are given as formulas so they are evaluated
# against the data passed to leaflet(), rather than relying on column-name
# guessing or on a separate lookup of the global `dt3`.
dt3 %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(
    lng = ~LONGITUDE,
    lat = ~LATITUDE,
    # as.character() so the popup shows the code text even if UNICODE is a factor
    popup = ~as.character(UNICODE),
    clusterOptions = markerClusterOptions()
  )