Packages Install
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(plotly)
Raw Data Import and Exploration
Adf = read.csv("F:/Appier Task Mission/Data Analyst Task/event_A_logs.csv")
Bdf = read.csv("F:/Appier Task Mission/Data Analyst Task/event_B_logs.csv")
A Area Exploration
## Observations: 3,288,726
## Variables: 4
## $ cluster <fct> 6e65ccd2dcb418d0bdb844179b2418294e2292a0, 6e65ccd2...
## $ machine_id <fct> c82e735d88458df9fe2b6981e8f62bcd6a07517e, 9cb02918...
## $ tracking_id <fct> 98a5cfd2579b59f14158f96a7ab1723a6b585677, a4f094e6...
## $ event_time <int> 1498882699, 1498889612, 1498892915, 1498896375, 14...
## [1] "Numbers of Clusters"
## [1] 11
## [1] "Numbers of Machine ID"
## [1] 21756
## [1] "Numbers of Tracking ID"
## [1] 3288726
A Dataset Restructure
testAdf = Adf
testAdf = testAdf %>%
mutate(cluster_label = factor(cluster, levels = levels(testAdf$cluster), labels = c(1:11)),
event_time = as.POSIXct(as.numeric(event_time),origin="1970-01-01 00:00:00", tz="Asia/Taipei"))
glimpse(testAdf)
## Observations: 3,288,726
## Variables: 5
## $ cluster <fct> 6e65ccd2dcb418d0bdb844179b2418294e2292a0, 6e65cc...
## $ machine_id <fct> c82e735d88458df9fe2b6981e8f62bcd6a07517e, 9cb029...
## $ tracking_id <fct> 98a5cfd2579b59f14158f96a7ab1723a6b585677, a4f094...
## $ event_time <dttm> 2017-07-01 12:18:19, 2017-07-01 14:13:32, 2017-...
## $ cluster_label <fct> 5, 5, 5, 5, 5, 8, 5, 8, 5, 8, 5, 5, 5, 5, 3, 3, ...
Cluster’s Label list
## cluster cluster_label
## 1 3208a150340467318528f6a5baeba873df36e4ee 1
## 2 404da7e4c191bdae90457fecc3999a99b737fac8 2
## 3 4922e4fda1521def7082751df900643a351f135b 3
## 4 68135d9c74290fef060b1b5d0ccb75126ad94f6f 4
## 5 6e65ccd2dcb418d0bdb844179b2418294e2292a0 5
## 6 9a996346e2c398ca7c12a0dd06e32237de36c1f0 6
## 7 a4962b7bf2d2954839a28c0058c5917cbee6891f 7
## 8 b3a012651bad958d7527618fc6a044654192b498 8
## 9 bd3113fe98b4d2dce8480ce74682e6a463b4b487 9
## 10 c38f5dfd43ac1a9f5a691c259900ae49920736c4 10
## 11 e581850f705724caebf21f35fdc1395da88a9388 11
B Dataset Exploration and Restructure
## [1] "Numbers of Tracking ID"
## [1] 24165
testBdf = Bdf
testBdf = testBdf %>%
mutate(event_time = as.POSIXct(as.numeric(event_time),origin="1970-01-01 00:00:00", tz="Asia/Taipei"))
glimpse(testBdf)
## Observations: 24,166
## Variables: 2
## $ tracking_id <fct> 0223ce54dafeaad61826f2921ccd2eee871c675a, 3dd0ceda...
## $ event_time <dttm> 2017-07-06 17:04:32, 2017-07-06 18:13:21, 2017-07...
A and B Combine Area
#Trigger DataSet
trigger_df = testAdf %>%
filter(tracking_id %in% levels(Bdf$tracking_id)) %>%
rename(A_tracking_id = tracking_id, A_event_time = event_time) %>%
arrange(A_event_time)
trigger_df = as.data.frame(trigger_df)
head(trigger_df)
## cluster
## 1 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 2 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 3 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 4 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 5 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 6 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## machine_id
## 1 9857653c2ad55c7abb695e5879089215ac473fd0
## 2 2738c54ecad124e2e58e0161aed1d371d1a102dd
## 3 64344e6fab8f85c92ab2e05f1c2e2fd1a376624d
## 4 69de1ada1b8b2ac63b48c9501630ff62c6373c9d
## 5 9ff1d98dafd6442bc6f48a11b8b917b6691b31ec
## 6 4633b09175e091e1f92c6179ba9ef29d1470d1d7
## A_tracking_id A_event_time
## 1 771a72dfc7af4fd2fc02d0631819aed9ecade052 2017-07-01 10:51:12
## 2 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce 2017-07-01 10:51:35
## 3 72b63c9074d832aad7840125c724c5bc06a64c3b 2017-07-01 10:56:07
## 4 9ed321fe566099cfaab9ba2426e438d05e2c5a40 2017-07-01 10:57:45
## 5 ea21135f6445f6f2d8bbfa0aed9cea5f734d94bb 2017-07-01 10:58:52
## 6 ffa84616b8a81b6b0619b709fce91bc1d9a881b5 2017-07-01 10:59:10
## cluster_label
## 1 5
## 2 5
## 3 5
## 4 5
## 5 5
## 6 5
#This is a record which is not matched with two dataset
test1Bdf = testBdf %>%
filter(!tracking_id %in% trigger_df$A_tracking_id)
#Delete Duplicated Tracking_ID
test2Bdf = testBdf[!duplicated(testBdf$tracking_id),]
test3Bdf = test2Bdf %>%
filter(tracking_id != test1Bdf$tracking_id) %>%
rename(B_tracking_id = tracking_id, B_event_time = event_time) %>%
arrange(B_event_time)
#New Dataset Combine for trigger situation
trigger_df = trigger_df[match(test3Bdf$B_tracking_id,trigger_df$A_tracking_id),]
new_trigger_df = cbind(trigger_df,test3Bdf)
#Duration Time unit is secs
new_trigger_df = new_trigger_df %>%
mutate(Duration_Time = difftime(B_event_time,A_event_time,units="secs"))
head(new_trigger_df)
## cluster
## 1 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 2 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 3 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 4 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 5 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## 6 6e65ccd2dcb418d0bdb844179b2418294e2292a0
## machine_id
## 1 2738c54ecad124e2e58e0161aed1d371d1a102dd
## 2 9857653c2ad55c7abb695e5879089215ac473fd0
## 3 64344e6fab8f85c92ab2e05f1c2e2fd1a376624d
## 4 4633b09175e091e1f92c6179ba9ef29d1470d1d7
## 5 69de1ada1b8b2ac63b48c9501630ff62c6373c9d
## 6 362fb0a5bcf6e55f23b6bb98ebc011b7b3beac2b
## A_tracking_id A_event_time
## 1 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce 2017-07-01 10:51:35
## 2 771a72dfc7af4fd2fc02d0631819aed9ecade052 2017-07-01 10:51:12
## 3 72b63c9074d832aad7840125c724c5bc06a64c3b 2017-07-01 10:56:07
## 4 ffa84616b8a81b6b0619b709fce91bc1d9a881b5 2017-07-01 10:59:10
## 5 90ec5efbb9cb0f26f8c6c3affdc5b929d91939ac 2017-07-01 11:00:53
## 6 f0cd5d879518826313b2446951dc2cc918e5359e 2017-07-01 11:00:19
## cluster_label B_tracking_id
## 1 5 4ce71c19a146bd49aa6fb27b79b37bb7f6376cce
## 2 5 771a72dfc7af4fd2fc02d0631819aed9ecade052
## 3 5 72b63c9074d832aad7840125c724c5bc06a64c3b
## 4 5 ffa84616b8a81b6b0619b709fce91bc1d9a881b5
## 5 5 90ec5efbb9cb0f26f8c6c3affdc5b929d91939ac
## 6 5 f0cd5d879518826313b2446951dc2cc918e5359e
## B_event_time Duration_Time
## 1 2017-07-01 10:51:45 10 secs
## 2 2017-07-01 10:51:45 33 secs
## 3 2017-07-01 10:56:53 46 secs
## 4 2017-07-01 11:00:40 90 secs
## 5 2017-07-01 11:01:01 8 secs
## 6 2017-07-01 11:01:32 73 secs
The list which clusters was been triggered
library(qicharts)
#Trigger's Cluster
T_Clusters = new_trigger_df$cluster_label[!duplicated(new_trigger_df$cluster_label)]
T_Clusters_df = data.frame(T_Clusters)
T_Clusters_df = as.list(T_Clusters_df %>% arrange(T_Clusters))
Quality Control by T-Chart
Using T-Chart to detect outliers which are false alarms.
## [1] "cluster Number:1"

## [1] "cluster Number:2"

## [1] "cluster Number:3"

## [1] "cluster Number:5"

## [1] "cluster Number:6"

## [1] "cluster Number:7"

## [1] "cluster Number:8"

## [1] "cluster Number:9"

[False Alarms (Outliers) VS Problems] Percentage of outliers, those outliers should be concluded as false alarms
It is obvious that the cluster 9 has the highest percentage of outliers. It means that it has weak relationship between A event and B event. However, it still has some problems inside it. On the other hand, other clusters have high percentage of in-control records, so it shows that they are our target clusters to be analyzed. In a short conclusion, there must be somethings wrong in them.
## T_Clusters percentage_Outliers
## 1 1 0.06747638
## 2 2 0.11764706
## 3 3 0.14895470
## 4 5 0.11143955
## 5 6 0.15617128
## 6 7 0.15254237
## 7 8 0.05928345
## 8 9 0.54125413
Percentage of InControl
## T_Clusters percentage_InControl
## 1 1 0.9325236
## 2 2 0.8823529
## 3 3 0.8510453
## 4 5 0.8885604
## 5 6 0.8438287
## 6 7 0.8474576
## 7 8 0.9407165
## 8 9 0.4587459
Point out each cluster’s top three ranking machine which happen most.