Clean the data
- A. Dimensionality Reduction: We have created a new data set by making appropriate modifications to the source data. Now let us remove the unnecessary columns (attributes).
# Work on a copy so that the source data frame `dt` is left as it is.
dt1 <- dt
# Inspect the structure of the working copy before any columns are removed.
str(dt1)
## 'data.frame': 667 obs. of 40 variables:
## $ UNICODE : Factor w/ 755 levels "1.-1.1.2","1.1.1.1",..: 71 72 73 74 76 77 78 78 79 80 ...
## $ ID : int NA NA NA NA NA NA NA NA NA NA ...
## $ USER_NAME : Factor w/ 19 levels "CC","CCP_1V",..: 11 6 6 6 6 6 6 6 6 6 ...
## $ GROUP_NAME : Factor w/ 6 levels "MINISTERIO_DE_SALUD",..: 1 3 3 3 3 3 3 3 3 3 ...
## $ DATA_ACTION : Factor w/ 2 levels "INSPECTION_NEW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ OBS_UNICODE : int NA 2 NA NA NA NA NA NA NA NA ...
## $ OBS_TEXT : logi NA NA NA NA NA NA ...
## $ CARACT_PREDIO : Factor w/ 7 levels "CR","DES","LP",..: 2 6 6 6 6 6 6 6 6 6 ...
## $ TIPO_LP : Factor w/ 8 levels "","Banco BCP",..: NA NA NA NA NA NA NA NA NA NA ...
## $ STATUS_INSPECCION : Factor w/ 10 levels "","C","ENT","I",..: 2 9 2 2 2 9 10 7 9 10 ...
## $ MOTIVO_VOLVER : Factor w/ 1 level "Esta de salida": NA NA NA NA NA NA NA 1 NA NA ...
## $ INTRA_INSPECCION : int NA NA NA NA NA NA NA NA NA NA ...
## $ INTRA_CHIRIS : int NA NA NA NA NA NA NA NA NA NA ...
## $ INTRA_RASTROS : int NA NA NA NA NA NA NA NA NA NA ...
## $ PERI_INSPECCION : int NA NA NA NA NA NA NA NA NA NA ...
## $ PERI_CHIRIS : int NA NA NA NA NA NA NA NA NA NA ...
## $ PERI_RASTROS : int NA NA NA NA NA NA NA NA NA NA ...
## $ LUGAR_INSPECCION : Factor w/ 3 levels "intra","intra-peri",..: NA NA NA NA NA NA 3 NA NA 2 ...
## $ ENTREVISTA : Factor w/ 4 levels "","cree_no_tiene",..: NA 2 NA NA NA 2 NA NA 2 NA ...
## $ TOT_INTRA : Factor w/ 10 levels "0","1","10","14",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ TOT_PERI : Factor w/ 13 levels "0","1","10","100",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ RASTROS : Factor w/ 9 levels "","0","NULL",..: NA NA NA NA NA NA 2 NA NA 2 ...
## $ PERSONAS_PREDIO : Factor w/ 19 levels "0","1","10","11",..: NA NA NA NA NA NA 11 NA NA 9 ...
## $ CANT_PERROS : Factor w/ 14 levels "0","1","10","12",..: NA NA NA NA NA NA 7 NA NA 2 ...
## $ CANT_GATOS : Factor w/ 8 levels "0","1","10","12",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ CANT_AVES_CORRAL : Factor w/ 21 levels "0","1","10","100",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ CANT_CUYES : Factor w/ 17 levels "0","1","10","11",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ CANT_CONEJOS : Factor w/ 8 levels "0","1","15","2",..: NA NA NA NA NA NA 1 NA NA 1 ...
## $ TEXT_OTROS : Factor w/ 9 levels "0","Canarios",..: NA NA NA NA NA NA 2 NA NA 1 ...
## $ CANT_OTROS : Factor w/ 8 levels "0","1","12","2",..: NA NA NA NA NA NA 5 NA NA 1 ...
## $ TEST_DATA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATETIME : Factor w/ 661 levels "1/11/17 19:57",..: 610 391 390 389 387 386 415 385 384 383 ...
## $ INSPECTION_FLAG : logi NA NA NA NA NA NA ...
## $ PREDICTED_PROBAB : num 9.23e-05 5.84e-11 1.39e-10 1.39e-10 2.41e-11 ...
## $ PREDICTED_PROBAB_MEAN: num 0.0142 0.0144 0.0144 0.0144 0.0144 ...
## $ PREDICTED_COLOR : Factor w/ 8 levels "#808080","#BD0026",..: 2 6 5 5 6 4 3 3 4 5 ...
## $ ANIMALES : Factor w/ 24 levels "aves_corral",..: NA NA NA NA NA NA 24 NA NA 7 ...
## $ LATITUDE : num -16.4 -16.4 -16.4 NA -16.4 ...
## $ LONGITUDE : num -71.5 -71.5 -71.5 NA -71.5 ...
## $ DATETIME1 : POSIXct, format: "2017-06-02 16:51:00" "2017-05-19 16:32:00" ...
# Coerce to a plain data frame before selecting columns by name.
dt1 <- as.data.frame(dt1)
# Names of the columns to retain; every other column is dropped.
keeps <- c(
  "UNICODE", "USER_NAME", "GROUP_NAME", "DATA_ACTION", "CARACT_PREDIO",
  "STATUS_INSPECCION", "TEST_DATA", "DATETIME", "PREDICTED_PROBAB",
  "PREDICTED_PROBAB_MEAN", "PREDICTED_COLOR", "LATITUDE", "LONGITUDE",
  "DATETIME1"
)
# List-style indexing always returns a data frame, so `drop` is not needed.
dt2 <- dt1[keeps]
str(dt2)
## 'data.frame': 667 obs. of 14 variables:
## $ UNICODE : Factor w/ 755 levels "1.-1.1.2","1.1.1.1",..: 71 72 73 74 76 77 78 78 79 80 ...
## $ USER_NAME : Factor w/ 19 levels "CC","CCP_1V",..: 11 6 6 6 6 6 6 6 6 6 ...
## $ GROUP_NAME : Factor w/ 6 levels "MINISTERIO_DE_SALUD",..: 1 3 3 3 3 3 3 3 3 3 ...
## $ DATA_ACTION : Factor w/ 2 levels "INSPECTION_NEW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CARACT_PREDIO : Factor w/ 7 levels "CR","DES","LP",..: 2 6 6 6 6 6 6 6 6 6 ...
## $ STATUS_INSPECCION : Factor w/ 10 levels "","C","ENT","I",..: 2 9 2 2 2 9 10 7 9 10 ...
## $ TEST_DATA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATETIME : Factor w/ 661 levels "1/11/17 19:57",..: 610 391 390 389 387 386 415 385 384 383 ...
## $ PREDICTED_PROBAB : num 9.23e-05 5.84e-11 1.39e-10 1.39e-10 2.41e-11 ...
## $ PREDICTED_PROBAB_MEAN: num 0.0142 0.0144 0.0144 0.0144 0.0144 ...
## $ PREDICTED_COLOR : Factor w/ 8 levels "#808080","#BD0026",..: 2 6 5 5 6 4 3 3 4 5 ...
## $ LATITUDE : num -16.4 -16.4 -16.4 NA -16.4 ...
## $ LONGITUDE : num -71.5 -71.5 -71.5 NA -71.5 ...
## $ DATETIME1 : POSIXct, format: "2017-06-02 16:51:00" "2017-05-19 16:32:00" ...
- B. Handle Missing Values: Here we are mainly concerned with latitudes and longitudes, since we are focusing on route visualization. So we are going to drop the rows for which the latitude or longitude value is missing. We shall preserve all other rows, even though they may contain missing values in other attributes (i.e., attributes other than Latitude and Longitude).
# Summarise every retained column; the "NA's" entries report how many values
# are missing in each column before any rows are dropped.
summary(dt2)
## UNICODE USER_NAME GROUP_NAME
## 1.10.38.379A: 4 CCP_1V :187 MINISTERIO_DE_SALUD:265
## 1.10.38.1110: 3 Rt1 :179 NULL : 9
## 1.10.38.888 : 3 OCA_3V :130 UPCH_GROUP :355
## 1.10.38.908 : 3 RT1 : 75 UPCH_GROUP_SOCABAYA: 19
## 1.10.38.917 : 3 CC : 19 data_sasha : 0
## 1.10.38.1005: 2 test1 : 19 real_model1 : 19
## (Other) :649 (Other): 58
## DATA_ACTION CARACT_PREDIO STATUS_INSPECCION TEST_DATA
## INSPECTION_NEW:658 CR : 0 C :259 Min. :0
## NULL : 9 DES : 21 inspeccion:203 1st Qu.:0
## LP : 9 entrevista:185 Median :0
## LV : 0 R : 12 Mean :0
## casa_reg : 0 V : 5 3rd Qu.:0
## casa_regular:637 (Other) : 0 Max. :0
## hola : 0 NA's : 3
## DATETIME PREDICTED_PROBAB PREDICTED_PROBAB_MEAN
## 1/11/17 19:57: 28 Min. :0.00000 Min. :0.01021
## 3/6/17 15:32 : 4 1st Qu.:0.00000 1st Qu.:0.01425
## 6/15/17 17:26: 4 Median :0.00001 Median :0.01438
## 1/26/17 17:46: 3 Mean :0.02230 Mean :0.01788
## 3/6/17 15:04 : 3 3rd Qu.:0.00206 3rd Qu.:0.01447
## 1/16/17 16:17: 2 Max. :0.69381 Max. :0.08737
## (Other) :623 NA's :106 NA's :106
## PREDICTED_COLOR LATITUDE LONGITUDE
## #BD0026:120 Min. :-16.46 Min. :-71.52
## #F03B20:112 1st Qu.:-16.41 1st Qu.:-71.51
## #FECC5C: 96 Median :-16.41 Median :-71.51
## #FD8D3C: 86 Mean :-16.41 Mean :-71.51
## #FFFFB2: 70 3rd Qu.:-16.41 3rd Qu.:-71.50
## (Other): 4 Max. :-16.41 Max. :-71.50
## NA's :179 NA's :25 NA's :25
## DATETIME1
## Min. :2017-01-11 19:57:00
## 1st Qu.:2017-04-12 16:50:00
## Median :2017-05-15 14:05:00
## Mean :2017-04-24 08:54:27
## 3rd Qu.:2017-05-24 17:27:00
## Max. :2017-06-15 17:27:00
##
# Keep only the rows where both coordinates are present; missing values in
# any other column do not cause a row to be dropped.
dt3 <- dt2[!is.na(dt2$LATITUDE) & !is.na(dt2$LONGITUDE), ]
# Re-check the summaries to confirm that no coordinate NA's remain.
summary(dt3)
## UNICODE USER_NAME GROUP_NAME
## 1.10.38.1110: 3 CCP_1V :178 MINISTERIO_DE_SALUD:254
## 1.10.38.888 : 3 Rt1 :169 NULL : 9
## 1.10.38.908 : 3 OCA_3V :126 UPCH_GROUP :341
## 1.10.38.917 : 3 RT1 : 75 UPCH_GROUP_SOCABAYA: 19
## 1.10.38.1005: 2 CC : 19 data_sasha : 0
## 1.10.38.1014: 2 test1 : 19 real_model1 : 19
## (Other) :626 (Other): 56
## DATA_ACTION CARACT_PREDIO STATUS_INSPECCION TEST_DATA
## INSPECTION_NEW:633 CR : 0 C :245 Min. :0
## NULL : 9 DES : 19 inspeccion:199 1st Qu.:0
## LP : 9 entrevista:179 Median :0
## LV : 0 R : 11 Mean :0
## casa_reg : 0 V : 5 3rd Qu.:0
## casa_regular:614 (Other) : 0 Max. :0
## hola : 0 NA's : 3
## DATETIME PREDICTED_PROBAB PREDICTED_PROBAB_MEAN
## 1/11/17 19:57: 28 Min. :0.00000 Min. :0.01021
## 3/6/17 15:04 : 3 1st Qu.:0.00000 1st Qu.:0.01425
## 1/16/17 16:17: 2 Median :0.00001 Median :0.01438
## 1/17/17 16:35: 2 Mean :0.02096 Mean :0.01792
## 1/19/17 19:08: 2 3rd Qu.:0.00206 3rd Qu.:0.01447
## 1/20/17 19:12: 2 Max. :0.69381 Max. :0.08737
## (Other) :603 NA's :101 NA's :101
## PREDICTED_COLOR LATITUDE LONGITUDE
## #BD0026:117 Min. :-16.46 Min. :-71.52
## #F03B20:112 1st Qu.:-16.41 1st Qu.:-71.51
## #FECC5C: 88 Median :-16.41 Median :-71.51
## #FD8D3C: 84 Mean :-16.41 Mean :-71.51
## #FFFFB2: 67 3rd Qu.:-16.41 3rd Qu.:-71.50
## (Other): 4 Max. :-16.41 Max. :-71.50
## NA's :170
## DATETIME1
## Min. :2017-01-11 19:57:00
## 1st Qu.:2017-04-12 16:53:45
## Median :2017-05-14 03:55:00
## Mean :2017-04-24 11:53:26
## 3rd Qu.:2017-05-24 16:59:00
## Max. :2017-06-15 17:27:00
##