Packages Install

library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(plotly)

Raw Data Import and Exploration

Adf = read.csv("F:/Appier Task Mission/Data Analyst Task/event_A_logs.csv")
Bdf = read.csv("F:/Appier Task Mission/Data Analyst Task/event_B_logs.csv")

A Area Exploration

## Observations: 3,288,726
## Variables: 4
## $ cluster     <fct> 6e65ccd2dcb418d0bdb844179b2418294e2292a0, 6e65ccd2...
## $ machine_id  <fct> c82e735d88458df9fe2b6981e8f62bcd6a07517e, 9cb02918...
## $ tracking_id <fct> 98a5cfd2579b59f14158f96a7ab1723a6b585677, a4f094e6...
## $ event_time  <int> 1498882699, 1498889612, 1498892915, 1498896375, 14...
## [1] "Numbers of Clusters"
## [1] 11
## [1] "Numbers of Machine ID"
## [1] 21756
## [1] "Numbers of Tracking ID"
## [1] 3288726

A Dataset Restructure

testAdf = Adf
testAdf = testAdf %>%
  mutate(cluster_label = factor(cluster, levels = levels(testAdf$cluster), labels = c(1:11)),
    event_time = as.POSIXct(as.numeric(event_time),origin="1970-01-01 00:00:00", tz="Asia/Taipei"))
glimpse(testAdf)
## Observations: 3,288,726
## Variables: 5
## $ cluster       <fct> 6e65ccd2dcb418d0bdb844179b2418294e2292a0, 6e65cc...
## $ machine_id    <fct> c82e735d88458df9fe2b6981e8f62bcd6a07517e, 9cb029...
## $ tracking_id   <fct> 98a5cfd2579b59f14158f96a7ab1723a6b585677, a4f094...
## $ event_time    <dttm> 2017-07-01 12:18:19, 2017-07-01 14:13:32, 2017-...
## $ cluster_label <fct> 5, 5, 5, 5, 5, 8, 5, 8, 5, 8, 5, 5, 5, 5, 3, 3, ...

Cluster’s Label list

##                                     cluster cluster_label
## 1  3208a150340467318528f6a5baeba873df36e4ee             1
## 2  404da7e4c191bdae90457fecc3999a99b737fac8             2
## 3  4922e4fda1521def7082751df900643a351f135b             3
## 4  68135d9c74290fef060b1b5d0ccb75126ad94f6f             4
## 5  6e65ccd2dcb418d0bdb844179b2418294e2292a0             5
## 6  9a996346e2c398ca7c12a0dd06e32237de36c1f0             6
## 7  a4962b7bf2d2954839a28c0058c5917cbee6891f             7
## 8  b3a012651bad958d7527618fc6a044654192b498             8
## 9  bd3113fe98b4d2dce8480ce74682e6a463b4b487             9
## 10 c38f5dfd43ac1a9f5a691c259900ae49920736c4            10
## 11 e581850f705724caebf21f35fdc1395da88a9388            11

B Dataset Exploration and Restructure

## [1] "Numbers of Tracking ID"
## [1] 24165
testBdf = Bdf
testBdf = testBdf %>%
  mutate(event_time = as.POSIXct(as.numeric(event_time),origin="1970-01-01 00:00:00", tz="Asia/Taipei"))
glimpse(testBdf)
## Observations: 24,166
## Variables: 2
## $ tracking_id <fct> 0223ce54dafeaad61826f2921ccd2eee871c675a, 3dd0ceda...
## $ event_time  <dttm> 2017-07-06 17:04:32, 2017-07-06 18:13:21, 2017-07...

A and B Combine Area

#Trigger DataSet
trigger_df = testAdf %>%
  filter(tracking_id %in% levels(Bdf$tracking_id)) %>%
  rename(A_tracking_id = tracking_id, A_event_time = event_time) %>%
  arrange(A_event_time)
trigger_df = as.data.frame(trigger_df)
head(trigger_df)
##                                    cluster
## 1 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 2 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 3 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 4 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 5 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 6 6e65ccd2dcb418d0bdb844179b2418294e2292a0
##                                 machine_id
## 1 9857653c2ad55c7abb695e5879089215ac473fd0
## 2 2738c54ecad124e2e58e0161aed1d371d1a102dd
## 3 64344e6fab8f85c92ab2e05f1c2e2fd1a376624d
## 4 69de1ada1b8b2ac63b48c9501630ff62c6373c9d
## 5 9ff1d98dafd6442bc6f48a11b8b917b6691b31ec
## 6 4633b09175e091e1f92c6179ba9ef29d1470d1d7
##                              A_tracking_id        A_event_time
## 1 771a72dfc7af4fd2fc02d0631819aed9ecade052 2017-07-01 10:51:12
## 2 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce 2017-07-01 10:51:35
## 3 72b63c9074d832aad7840125c724c5bc06a64c3b 2017-07-01 10:56:07
## 4 9ed321fe566099cfaab9ba2426e438d05e2c5a40 2017-07-01 10:57:45
## 5 ea21135f6445f6f2d8bbfa0aed9cea5f734d94bb 2017-07-01 10:58:52
## 6 ffa84616b8a81b6b0619b709fce91bc1d9a881b5 2017-07-01 10:59:10
##   cluster_label
## 1             5
## 2             5
## 3             5
## 4             5
## 5             5
## 6             5
#This is a record which is not matched with two dataset
test1Bdf = testBdf %>%
  filter(!tracking_id %in% trigger_df$A_tracking_id)
#Delete Duplicated Tracking_ID
test2Bdf = testBdf[!duplicated(testBdf$tracking_id),]
test3Bdf = test2Bdf %>%
  filter(tracking_id != test1Bdf$tracking_id) %>%
  rename(B_tracking_id = tracking_id, B_event_time = event_time) %>%
  arrange(B_event_time)

#New Dataset Combine for trigger situation
trigger_df = trigger_df[match(test3Bdf$B_tracking_id,trigger_df$A_tracking_id),]
new_trigger_df = cbind(trigger_df,test3Bdf)
#Duration Time unit is secs
new_trigger_df = new_trigger_df %>%
  mutate(Duration_Time = difftime(B_event_time,A_event_time,units="secs"))
head(new_trigger_df)
##                                    cluster
## 1 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 2 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 3 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 4 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 5 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 6 6e65ccd2dcb418d0bdb844179b2418294e2292a0
##                                 machine_id
## 1 2738c54ecad124e2e58e0161aed1d371d1a102dd
## 2 9857653c2ad55c7abb695e5879089215ac473fd0
## 3 64344e6fab8f85c92ab2e05f1c2e2fd1a376624d
## 4 4633b09175e091e1f92c6179ba9ef29d1470d1d7
## 5 69de1ada1b8b2ac63b48c9501630ff62c6373c9d
## 6 362fb0a5bcf6e55f23b6bb98ebc011b7b3beac2b
##                              A_tracking_id        A_event_time
## 1 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce 2017-07-01 10:51:35
## 2 771a72dfc7af4fd2fc02d0631819aed9ecade052 2017-07-01 10:51:12
## 3 72b63c9074d832aad7840125c724c5bc06a64c3b 2017-07-01 10:56:07
## 4 ffa84616b8a81b6b0619b709fce91bc1d9a881b5 2017-07-01 10:59:10
## 5 90ec5efbb9cb0f26f8c6c3affdc5b929d91939ac 2017-07-01 11:00:53
## 6 f0cd5d879518826313b2446951dc2cc918e5359e 2017-07-01 11:00:19
##   cluster_label                            B_tracking_id
## 1             5 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce
## 2             5 771a72dfc7af4fd2fc02d0631819aed9ecade052
## 3             5 72b63c9074d832aad7840125c724c5bc06a64c3b
## 4             5 ffa84616b8a81b6b0619b709fce91bc1d9a881b5
## 5             5 90ec5efbb9cb0f26f8c6c3affdc5b929d91939ac
## 6             5 f0cd5d879518826313b2446951dc2cc918e5359e
##          B_event_time Duration_Time
## 1 2017-07-01 10:51:45       10 secs
## 2 2017-07-01 10:51:45       33 secs
## 3 2017-07-01 10:56:53       46 secs
## 4 2017-07-01 11:00:40       90 secs
## 5 2017-07-01 11:01:01        8 secs
## 6 2017-07-01 11:01:32       73 secs

The list which clusters was been triggered

library(qicharts)
#Trigger's Cluster
T_Clusters = new_trigger_df$cluster_label[!duplicated(new_trigger_df$cluster_label)]
T_Clusters_df = data.frame(T_Clusters)
T_Clusters_df = as.list(T_Clusters_df %>% arrange(T_Clusters))

Quality Control by T-Chart

Using T-Chart to detect outliers which are false alarms.

## [1] "cluster Number:1"

## [1] "cluster Number:2"

## [1] "cluster Number:3"

## [1] "cluster Number:5"

## [1] "cluster Number:6"

## [1] "cluster Number:7"

## [1] "cluster Number:8"

## [1] "cluster Number:9"

[False Alarms (Outliers) VS Problems] Percentage of outliers, those outliers should be concluded as false alarms

It is obvious that the cluster 9 has the highest percentage of outliers. It means that it has weak relationship between A event and B event. However, it still has some problems inside it. On the other hand, other clusters have high percentage of in-control records, so it shows that they are our target clusters to be analyzed. In a short conclusion, there must be somethings wrong in them.

##   T_Clusters percentage_Outliers
## 1          1          0.06747638
## 2          2          0.11764706
## 3          3          0.14895470
## 4          5          0.11143955
## 5          6          0.15617128
## 6          7          0.15254237
## 7          8          0.05928345
## 8          9          0.54125413

Percentage of InControl

##   T_Clusters percentage_InControl
## 1          1            0.9325236
## 2          2            0.8823529
## 3          3            0.8510453
## 4          5            0.8885604
## 5          6            0.8438287
## 6          7            0.8474576
## 7          8            0.9407165
## 8          9            0.4587459

Point out each cluster’s top three ranking machine which happen most.

Below, it demonstrates the top three most likely machines by each cluster that happen this kind of problem frequently. Moreover, it provides average duration time for each machine. This detail information could support our client’s engineers to narrow down the problem searching area, and save time on debugging as well.

#Dataset of InControl Records
ture_wrong_df = do.call("rbind",ture_wrong_df)

#tracking Machine's situation
machine_record_count = ture_wrong_df %>%
  select(cluster_label,machine_id,Duration_Time) %>%
  group_by(cluster_label,machine_id) %>%
  summarise(record_count = sum(n()),
            avg_Duration_Time = mean(Duration_Time))
machine_record_count = as_tibble(as.data.frame(machine_record_count))
top_three_targetmachine = machine_record_count %>% 
  group_by(cluster_label) %>% 
  top_n(n=3, wt=record_count)

top_three_targetmachine
## # A tibble: 25 x 4
## # Groups:   cluster_label [8]
##    cluster_label machine_id                   record_count avg_Duration_Ti~
##    <fct>         <fct>                               <int> <time>          
##  1 1             1ed053d6e154ba48d613c97eae5~           31  149.03226 secs 
##  2 1             3d927bd6dae58ebe040f498e881~           44  135.06818 secs 
##  3 1             4c33d810e9536ca40165a2fa1d9~           28  104.60714 secs 
##  4 2             6eb42523b053546469333abcec8~           19  143.21053 secs 
##  5 2             8a36f9cf24a74938ee99a3fad8a~           11   45.36364 secs 
##  6 2             bc48c3cb1916ea8c94a67a56cbb~           23   15.30435 secs 
##  7 3             a040a47d959d2d923a454e530d2~          174  148.17816 secs 
##  8 3             a0844211b336ef095e93440df6f~          207  169.91787 secs 
##  9 3             d7d0b32c450c3d228c9ec70a5a1~          109  170.20183 secs 
## 10 5             69de1ada1b8b2ac63b48c950163~          495 4233.36364 secs 
## # ... with 15 more rows