Load data and summarize data

rm(list = ls())

### library
library(ggplot2)
library(magrittr)
library(reshape2)
library(caret)
## Loading required package: lattice
library(Boruta)
## Loading required package: ranger
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(grid)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(viridis)


### load data
rawdata = read.csv('AccessLog.csv', header = T, sep = ',')
saveRDS(rawdata, 'rawdata.rds')
#rawdata = readRDS('rawdata.rds')
summary(rawdata)
##                Date.Time              Employee.Name      Employee.ID   
##  11/19/2013 2:48:50 :   29   Gerardo Munoz   :17415   GMunoz3  :17415  
##  11/19/2013 12:22:31:   25   Dustin Steele   :13447   DSteele9 :13447  
##  11/26/2013 12:08:16:   25   Kathleen Wheeler:11521   KWheele10:11521  
##  12/2/2013 12:05:00 :   20   Richard Cummings:11484   RCummin0 :11484  
##  11/19/2013 12:39:40:   19   Taylor Carroll  :10760   TCarrol8 :10760  
##  11/20/2013 3:40:00 :   19   Lucas Chavez    : 6600   LChavez8 : 6600  
##  (Other)            :96473   (Other)         :25383   (Other)  :25383  
##                        EMR.Module             Patient.Name  
##  Report                     :43105   Pat Kelley     :  648  
##  Encounter                  : 7266   James Sanderson:  563  
##  Patient Lookup Form        : 5441   Sophie Hunt    :  561  
##  Notes Section              : 3277   Carolyn Mathis :  552  
##  Demographics               : 3177   Lucy Harrington:  551  
##  Chart Review Encounters tab: 2977   Michaele Bolton:  550  
##  (Other)                    :31367   (Other)        :93185  
##  Medical.Record.Number Patient.ID.Number
##  JH94173864:  648      Z93108 :  648    
##  JH76240207:  561      Z70957 :  561    
##  JH29532130:  552      Z73483 :  552    
##  JH22634601:  551      Z98693 :  551    
##  JH64607870:  550      Z36633 :  550    
##  JH91736583:  545      Z71324 :  545    
##  (Other)   :93203      (Other):93203
names(rawdata)
## [1] "Date.Time"             "Employee.Name"         "Employee.ID"          
## [4] "EMR.Module"            "Patient.Name"          "Medical.Record.Number"
## [7] "Patient.ID.Number"
head(rawdata)
##            Date.Time  Employee.Name Employee.ID
## 1 11/14/2013 6:42:58 Kristin Guzman    KGuzman9
## 2 11/14/2013 6:43:18 Kristin Guzman    KGuzman9
## 3 11/14/2013 6:43:19 Kristin Guzman    KGuzman9
## 4 11/14/2013 6:43:21 Kristin Guzman    KGuzman9
## 5 11/14/2013 7:03:14 Kristin Guzman    KGuzman9
## 6 11/14/2013 7:09:24 Kristin Guzman    KGuzman9
##                    EMR.Module  Patient.Name Medical.Record.Number
## 1         Patient Lookup Form Leonard Perez            JH90939388
## 2         Patient Lookup Form Leonard Perez            JH90939388
## 3 Chart Review Procedures tab Leonard Perez            JH90939388
## 4 Chart Review Encounters tab Leonard Perez            JH90939388
## 5         Patient Lookup Form Shelia Bailey            JH22901225
## 6                      Report Dustin Wagner            JH42520855
##   Patient.ID.Number
## 1          Z8755695
## 2          Z8755695
## 3          Z8755695
## 4          Z8755695
## 5             Z3442
## 6            Z36493
str(rawdata)
## 'data.frame':    96610 obs. of  7 variables:
##  $ Date.Time            : Factor w/ 59269 levels "11/14/2013 1:00:32",..: 2340 2341 2342 2343 2353 2354 2355 2356 2357 2358 ...
##  $ Employee.Name        : Factor w/ 16 levels "Candace Castro",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ Employee.ID          : Factor w/ 16 levels "CCastro5","CDaniel2",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ EMR.Module           : Factor w/ 75 levels "Addendum","Admin",..: 57 57 17 7 57 62 62 25 57 57 ...
##  $ Patient.Name         : Factor w/ 3510 levels "Abel Dunn","Abigail Allan",..: 2017 2017 2017 2017 2996 970 970 970 970 970 ...
##  $ Medical.Record.Number: Factor w/ 3523 levels "JH10003485","JH10025223",..: 3190 3190 3190 3190 514 1317 1317 1317 1317 1317 ...
##  $ Patient.ID.Number    : Factor w/ 3492 levels "Z10001","Z10020",..: 3025 3025 3025 3025 968 1036 1036 1036 1036 1036 ...
## look at NA
sum(is.na(rawdata))
## [1] 0
summary(rawdata$Patient.ID.Number)
##  Z93108  Z70957  Z73483  Z98693  Z36633  Z71324 Z476353  Z38181  Z63422 
##     648     561     552     551     550     545     531     503     492 
##  Z71829  Z52757  Z52838  Z50210  Z86000  Z68559  Z97242  Z84385  Z32293 
##     481     478     449     442     425     412     411     392     387 
##  Z97384  Z71927  Z85370  Z59178  Z16913  Z19863  Z51300  Z54363 Z849151 
##     384     382     379     373     365     350     340     338     337 
##  Z42308  Z19669  Z23379  Z93470  Z59216  Z85646  Z27585  Z46167  Z46903 
##     331     326     323     320     312     312     310     299     299 
##  Z64453  Z73122  Z86994  Z82742  Z10185  Z91438  Z34198  Z87299  Z73924 
##     298     296     296     295     291     285     283     279     276 
##  Z83207  Z22088  Z21252  Z95777  Z57980  Z59327  Z38586  Z27324  Z78003 
##     276     274     271     271     268     261     252     248     248 
##  Z51775   Z6469  Z83531  Z73363  Z37536  Z61387  Z54908  Z61206  Z74248 
##     247     245     245     244     243     243     237     236     236 
##  Z86939  Z97568  Z20539   Z1287  Z45810  Z74981  Z26490  Z99969  Z53190 
##     235     233     228     225     225     225     224     222     221 
##  Z25661  Z13048  Z88555  Z60160  Z61298  Z19707  Z67319   Z9873  Z44968 
##     219     217     215     213     210     208     208     208     206 
##  Z69432  Z83171  Z18216  Z56038  Z16483  Z79365  Z33827  Z68513  Z79882 
##     206     206     203     203     202     201     197     197     194 
##  Z50803  Z45591 Z222191  Z77125  Z11336  Z31906  Z31169   Z9406   Z1184 
##     193     191     190     189     188     188     187     187     186 
## (Other) 
##   67126
summary(rawdata$Employee.ID)
##  CCastro5  CDaniel2  DSteele9  EJennin9  GMatthe5   GMunoz3  KGuzman9 
##      1740      2087     13447      4589       335     17415      6446 
##  KKenned8 KWheele10  LChavez8  MEricks0  NGreene4  RCummin0  RKeller3 
##      1953     11521      6600      1657      1824     11484       532 
##    RRice4  TCarrol8 
##      4220     10760
##process time
date_sub = substr(rawdata$Date.Time, 1, 10)
date_data = as.Date(date_sub, format = '%m/%d/%Y')
time_data = as.POSIXct(rawdata$Date.Time, format="%m/%d/%Y %H:%M:%S")
                  

rawdata$date_data = date_data
rawdata$time_data = time_data

## look at each employee record over date
plot(rawdata$Employee.Name, rawdata$date_data)

# this is a table to look at record per day per employee
tapply(rawdata$Patient.ID.Number, FUN = length, INDEX = list(rawdata$Employee.ID,rawdata$date_data))
##           2013-11-14 2013-11-15 2013-11-18 2013-11-19 2013-11-20
## CCastro5         115         81        135        173        233
## CDaniel2         100        169        132         55        188
## DSteele9         993       1009        952        560        636
## EJennin9         234        289        126        238        412
## GMatthe5          50          6         NA         60         56
## GMunoz3         1301       1620       1211        254       1218
## KGuzman9         272        165        197        197        274
## KKenned8          88        108         71        210        144
## KWheele10         NA         NA        509        590        878
## LChavez8         109        298         NA        606        198
## MEricks0          54        117         79        179         84
## NGreene4          53         97        123         47        104
## RCummin0         375        661       1167        803        760
## RKeller3          NA         21         NA         29         NA
## RRice4           282        170        150        271        304
## TCarrol8        1068         NA        821        582        479
##           2013-11-21 2013-11-22 2013-11-24 2013-11-25 2013-11-26
## CCastro5          85         34         NA         65         NA
## CDaniel2         104         76         NA        133         44
## DSteele9         803       1192         NA        744       1101
## EJennin9         142        229         NA        100        283
## GMatthe5          NA         73         NA          5         NA
## GMunoz3         1185        141         NA         NA       1542
## KGuzman9         418        368         NA        260        372
## KKenned8          NA         NA         NA         NA         NA
## KWheele10        321        642         NA        931        471
## LChavez8         282        192         NA        206        520
## MEricks0          89         66         NA        141        111
## NGreene4         134         50         NA         64         90
## RCummin0         619         NA         NA        530        706
## RKeller3          19         NA         NA         NA         13
## RRice4           253         73         NA        210        270
## TCarrol8         604        420         68        773        830
##           2013-11-27 2013-11-29 2013-12-02 2013-12-03 2013-12-04
## CCastro5          NA         NA         67         60        211
## CDaniel2         112         NA        138         42        177
## DSteele9         476         NA         NA         NA         NA
## EJennin9         470         NA        118        353        383
## GMatthe5          NA         NA         24         NA         46
## GMunoz3         1024         NA        175        358       1846
## KGuzman9         461         NA        300        401        267
## KKenned8          NA         NA        214         73         NA
## KWheele10        708         NA        604        735        757
## LChavez8         403         59         NA        521        548
## MEricks0          46         NA         NA        151         51
## NGreene4          81        104         NA         NA        121
## RCummin0         398         NA        850        975        919
## RKeller3         114         NA          6        325          5
## RRice4           270         64        187        315        207
## TCarrol8         573         NA         NA        308        509
##           2013-12-05 2013-12-06 2013-12-07 2013-12-08 2013-12-09
## CCastro5           4         33         NA         NA         35
## CDaniel2         128         63         NA         NA         25
## DSteele9          NA        797         NA         NA        748
## EJennin9         210        208         NA         NA         NA
## GMatthe5          NA         NA         NA         NA          4
## GMunoz3           70         NA         NA         NA        173
## KGuzman9         442        147         NA         NA        274
## KKenned8         311        205         NA         NA         77
## KWheele10        987        658         NA         NA        387
## LChavez8         512        678         NA         NA         NA
## MEricks0          91         57         NA         NA         36
## NGreene4         196         77         NA         NA         NA
## RCummin0         819        460         NA         NA        530
## RKeller3          NA         NA         NA         NA         NA
## RRice4           184         54         NA         NA        125
## TCarrol8         774        425        231         21        339
##           2013-12-10 2013-12-11 2013-12-12 2013-12-13
## CCastro5         197         87         71         54
## CDaniel2          61        112        123        105
## DSteele9         570       1064        866        936
## EJennin9          NA        421         92        281
## GMatthe5          11         NA         NA         NA
## GMunoz3         1258       1400       1251       1388
## KGuzman9         350        301        400        580
## KKenned8         141        169        112         30
## KWheele10        374        719        661        589
## LChavez8         118        290        420        640
## MEricks0         118         33         26        128
## NGreene4         158        118        159         48
## RCummin0         482        295         NA        135
## RKeller3          NA         NA         NA         NA
## RRice4           200        276        288         67
## TCarrol8          NA        650        713        572

look up patients function

## function 1 ## 
## use a index time, search all the employee and return a list of all the employees durng that time
start = rawdata$time_data[1]
end = rawdata$time_data[1] + 60*24*60*60
# end and start difference : 10 hours

time_lookup <- function(start, end) {
    # start and end are date
    
    subset = which(rawdata$time_data > start)
    subset2 = which(rawdata$time_data < end)
    subset3 = intersect(subset, subset2)
    
    return(rawdata[subset3,])
    
}

## lets try
new_try = time_lookup(start, end)


##find the patients ID who has been looked up by more than one employee

commonpatient <- function(new_try){
    new_patient = matrix(0, nrow = 1000, ncol = 16)
    new_patient = data.frame(new_patient)
    colnames(new_patient) = levels(rawdata$Employee.ID)
    
    for (i in c(1:16)) {
        subset = which(new_try$Employee.ID == levels(new_try$Employee.ID)[i])
        if (length(subset) > 0){
            new_patient[1:length(subset), i] =
                new_try[subset,]$Patient.ID.Number
            } 
        }
    common_patient = Reduce(intersect, list(new_patient$CCastro5,
                       new_patient$CDaniel2,
                       new_patient$DSteele9,
                       new_patient$EJennin9,
                       new_patient$GMatthe5,
                       new_patient$GMunoz3,
                       new_patient$KGuzman9,
                       new_patient$KKenned8,
                       new_patient$KWheele10,
                       new_patient$LChavez8,
                       new_patient$MEricks0,
                       new_patient$NGreene4,
                       new_patient$RCummin0,
                       new_patient$RKeller3,
                       new_patient$RRice4,
                       new_patient$TCarrol8))
    return(common_patient)

}
matrix = time_lookup(start, end)
patient = commonpatient(matrix)
print(patient)
## numeric(0)
#now no patient are common


## function to return frequency 
find_freq <- function(patient, new_try) {
    for (name in levels(new_try$Employee.ID)) {
        freq = match(new_try$Patient.ID.Number[new_try$Employee.ID == name], patient)
        print('Employe: %', name)
        print('freq:%' + freq)
    }
}

Conclusion

  • The given data has one month of time duration. During this time, it seems that there’s no common records shared between these employees.
  • Based on my findings, these employees might not work in the same place.
  • These employee might from different hospitals.
  • It is possible that I searched the wrong variables (patient_ID). I might need search other variables such as record ID.

What features can be generated?

  • For each employee, generate a logical varible which indicates whether or nor his accessed data has been visited by other employee or not.
  • For each employee, generate the name of other employee who shared the same visit record at the same time.