NOTE: The data munging and feature engineering operations (along with sample visualizations) involved 500+ lines of R code. Because this project was carried out for a client based in Arequipa (Peru), that code is intentionally hidden and only a sample of the dataset is used here to preserve anonymity.
# Workspace setup: clear existing objects, attach the packages used by the
# analysis, and load the sample dataset into `fdata`.
# Removes every object from the current environment before anything else runs.
rm(list=ls())
library(lubridate) # date-time parsing and manipulation
library(leaflet) # maps
library(dplyr) # data manipulation; also provides the %>% pipe
library(sp)
# NOTE(review): rgdal and rgeos were retired from CRAN in 2023 - confirm they
# are still installable and actually needed.
library(rgdal)
library(geosphere)
library(dismo)
library(rgeos)
library(fields)
#library(lpSolve) # for linear programming in R
# Machine-specific absolute path; "fdata.csv" below is resolved relative to it,
# so the script only runs where this directory exists.
setwd("/Users/Mughundhan/Data Science Intern/Chagas")
# Load the sample dataset; its structure is printed in the output that follows.
fdata <- read.csv("fdata.csv")
## 'data.frame': 642 obs. of 20 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ UNICODE : Factor w/ 598 levels "1.10.38.100",..: 1 2 3 4 5 6 6 7 8 9 ...
## $ USER_NAME : Factor w/ 8 levels "CC","CCP_1V",..: 6 4 4 4 4 4 4 4 4 4 ...
## $ GROUP_NAME : Factor w/ 5 levels "MINISTERIO_DE_SALUD",..: 1 3 3 3 3 3 3 3 3 3 ...
## $ DATA_ACTION : Factor w/ 2 levels "INSPECTION_NEW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CARACT_PREDIO : Factor w/ 3 levels "DES","LP","casa_regular": 1 3 3 3 3 3 3 3 3 3 ...
## $ STATUS_INSPECCION : Factor w/ 5 levels "C","R","V","entrevista",..: 1 4 1 1 4 5 3 4 5 5 ...
## $ TEST_DATA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DATETIME : Factor w/ 548 levels "01/11/17 19:57",..: 183 437 436 435 434 441 433 432 431 430 ...
## $ PREDICTED_PROBAB : num 9.23e-05 5.84e-11 1.39e-10 2.41e-11 8.48e-10 ...
## $ PREDICTED_PROBAB_MEAN: num 0.0142 0.0144 0.0144 0.0144 0.0144 ...
## $ PREDICTED_COLOR : Factor w/ 6 levels "#BD0026","#F03B20",..: 1 5 4 5 3 2 2 3 4 3 ...
## $ LATITUDE : num -16.4 -16.4 -16.4 -16.4 -16.4 ...
## $ LONGITUDE : num -71.5 -71.5 -71.5 -71.5 -71.5 ...
## $ DATETIME1 : Factor w/ 548 levels "01/06/17 15:59",..: 31 384 383 382 381 396 380 379 378 377 ...
## $ LOCAL_TIME : Factor w/ 548 levels "0001-06-17 16:49:36",..: 31 384 383 382 381 396 380 379 378 377 ...
## $ LOCAL_DATETIME_new : Factor w/ 548 levels "01/17/11 08:47 PM",..: 483 377 376 375 374 381 373 372 371 370 ...
## $ week : Factor w/ 5 levels "Friday","Monday",..: 1 1 1 1 1 2 1 1 1 1 ...
## $ date : Factor w/ 44 levels "2017-01-11","2017-01-16",..: 38 29 29 29 29 30 29 29 29 29 ...
## X V1 UNICODE
## 0 0 0
## USER_NAME GROUP_NAME DATA_ACTION
## 0 0 0
## CARACT_PREDIO STATUS_INSPECCION TEST_DATA
## 0 3 0
## DATETIME PREDICTED_PROBAB PREDICTED_PROBAB_MEAN
## 0 101 101
## PREDICTED_COLOR LATITUDE LONGITUDE
## 170 0 0
## DATETIME1 LOCAL_TIME LOCAL_DATETIME_new
## 0 0 0
## week date
## 0 0
##
## FALSE TRUE
## 12465 375
## id LATITUDE LONGITUDE
## 1 1 -16.41048 -71.50998
## 2 2 -16.40770 -71.50583
## 3 3 -16.40775 -71.50592
## 4 4 -16.40776 -71.50611
## 5 5 -16.40783 -71.50617
## 6 6 -16.40788 -71.50632
# Group the inspected houses into spatial clusters with k-means on their
# coordinates. `km$cluster` holds the cluster number assigned to each row of
# `fdata`, and `km$centers` holds the centroid of each cluster.
latitude <- fdata$LATITUDE
longitude <- fdata$LONGITUDE

# Number of clusters requested from k-means.
n_clusters <- 6

# kmeans() chooses its starting centres at random, so without a fixed seed the
# cluster numbering (and possibly the partition itself) changes on every run
# and the tables printed below cannot be reproduced.
set.seed(42)

# nstart > 1 runs several random initialisations and keeps the solution with
# the lowest total within-cluster sum of squares, instead of trusting one draw.
# NOTE(review): distances are Euclidean on raw latitude/longitude values -
# confirm this approximation is acceptable for the study area.
km <- kmeans(cbind(latitude, longitude), centers = n_clusters, nstart = 25)

# Scatter plot of the houses, coloured by their cluster number.
plot(longitude, latitude, col = km$cluster, pch = 20)
## id LATITUDE LONGITUDE km$cluster
## 1 1 -16.41048 -71.50998 1
## 2 2 -16.40770 -71.50583 3
## 3 3 -16.40775 -71.50592 3
## 4 4 -16.40776 -71.50611 3
## 5 5 -16.40783 -71.50617 3
## 6 6 -16.40788 -71.50632 3
## id LATITUDE LONGITUDE Cluster_No Type
## 639 639 -16.45664 -71.51649 6 Residence / House
## 640 640 -16.45675 -71.51659 6 Residence / House
## 641 641 -16.45688 -71.51669 6 Residence / House
## 642 642 -16.45677 -71.51682 6 Residence / House
## 1100 Cluster -16.40967 -71.50976 1 Health Facility
## 2100 Cluster -16.40768 -71.51135 2 Health Facility
## 3100 Cluster -16.40792 -71.50657 3 Health Facility
## 4100 Cluster -16.40738 -71.50393 4 Health Facility
## 5100 Cluster -16.40694 -71.49940 5 Health Facility
## 643 Cluster -16.45609 -71.51621 6 Health Facility
The optimal locations for installing the health facilities (one per cluster) are displayed below:
## id LATITUDE LONGITUDE Cluster_No Type
## 1100 Cluster -16.40967 -71.50976 1 Health Facility
## 2100 Cluster -16.40768 -71.51135 2 Health Facility
## 3100 Cluster -16.40792 -71.50657 3 Health Facility
## 4100 Cluster -16.40738 -71.50393 4 Health Facility
## 5100 Cluster -16.40694 -71.49940 5 Health Facility
## 643 Cluster -16.45609 -71.51621 6 Health Facility