timestamp                      client             query            query_type         answer_ip          domain_l1          domain_l2        
 Min.   :2017-02-20 23:12:13   Length:10000000    Length:10000000    Length:10000000    Length:10000000    Length:10000000    Length:10000000   
 1st Qu.:2017-03-07 19:37:18   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character  
 Median :2017-03-08 22:39:36   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Mean   :2017-03-09 07:33:45                                                                                                                    
 3rd Qu.:2017-03-11 12:45:21                                                                                                                    
 Max.   :2017-03-15 07:16:55                                                                                                                    
                                                                                                                                                
      ttl               answer          dga.probability    dga.class     
 Min.   :0.000e+00   Length:10000000    Min.   :0        Min.   :0       
 1st Qu.:0.000e+00   Class :character   1st Qu.:0        1st Qu.:0       
 Median :3.000e+01   Mode  :character   Median :0        Median :0       
 Mean   :4.872e+03                      Mean   :0        Mean   :0       
 3rd Qu.:3.000e+02                      3rd Qu.:0        3rd Qu.:0       
 Max.   :2.147e+09                      Max.   :1        Max.   :1       
                                        NA's   :388724   NA's   :388724  

Feature Generation

Feature file available at https://www.dropbox.com/s/vl9y5p32e2umxu1/wbone-client-profile-1h-last8days.csv?dl=1

# Print summary statistics for every column of the client-profile table.
summary(clients_profiles)
    client           tot_requests      tot_detected     ratio_detected       tot_nx           ratio_nx           tot_mx        
 Length:15870       Min.   :    1.0   Min.   :  0.000   Min.   :0.0000   Min.   :    0.0   Min.   :0.00000   Min.   :  0.0000  
 Class :character   1st Qu.:    2.0   1st Qu.:  0.000   1st Qu.:0.0000   1st Qu.:    0.0   1st Qu.:0.00000   1st Qu.:  0.0000  
 Mode  :character   Median :    6.0   Median :  0.000   Median :0.0000   Median :    0.0   Median :0.00000   Median :  0.0000  
                    Mean   :  516.7   Mean   :  3.822   Mean   :0.0082   Mean   :  149.8   Mean   :0.10165   Mean   :  0.6547  
                    3rd Qu.:   43.0   3rd Qu.:  0.000   3rd Qu.:0.0000   3rd Qu.:    3.0   3rd Qu.:0.07692   3rd Qu.:  0.0000  
                    Max.   :61787.0   Max.   :609.000   Max.   :1.0000   Max.   :54037.0   Max.   :1.00000   Max.   :170.0000  
    ratio_mx           tot_fail         ratio_fail       tot_reverse      ratio_reverse       max_sameip         tot_sameip     
 Min.   :0.000000   Min.   :    0.0   Min.   :0.00000   Min.   :   0.00   Min.   :0.00000   Min.   :   0.000   Min.   :   0.00  
 1st Qu.:0.000000   1st Qu.:    0.0   1st Qu.:0.00000   1st Qu.:   0.00   1st Qu.:0.00000   1st Qu.:   0.000   1st Qu.:   0.00  
 Median :0.000000   Median :    0.0   Median :0.00000   Median :   0.00   Median :0.00000   Median :   0.000   Median :   0.00  
 Mean   :0.002088   Mean   :  105.5   Mean   :0.03087   Mean   :  15.35   Mean   :0.02532   Mean   :   4.256   Mean   :  55.59  
 3rd Qu.:0.000000   3rd Qu.:    0.0   3rd Qu.:0.00000   3rd Qu.:   0.00   3rd Qu.:0.00000   3rd Qu.:   2.000   3rd Qu.:   2.00  
 Max.   :1.000000   Max.   :12510.0   Max.   :1.00000   Max.   :2177.00   Max.   :1.00000   Max.   :1610.000   Max.   :5692.00  
   avg_sameip        sd_sameip       tot_samedomain    max_samedomain   avg_samedomain  sd_samedomain    ratio_tot_samedomain
 Min.   : 0.0000   Min.   : 0.0000   Min.   :   0.00   Min.   : 0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000      
 1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:   0.00   1st Qu.: 0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000      
 Median : 0.0000   Median : 0.0000   Median :   0.00   Median : 0.000   Median :0.000   Median :0.0000   Median :0.0000      
 Mean   : 0.7989   Mean   : 0.4673   Mean   :  68.87   Mean   : 2.177   Mean   :1.158   Mean   :0.2819   Mean   :0.1419      
 3rd Qu.: 2.0000   3rd Qu.: 0.0000   3rd Qu.:   7.00   3rd Qu.: 3.000   3rd Qu.:2.444   3rd Qu.:0.4754   3rd Qu.:0.2208      
 Max.   :41.7500   Max.   :70.1429   Max.   :5310.00   Max.   :84.000   Max.   :9.000   Max.   :5.6569   Max.   :1.0000      
 ratio_max_samedomain ratio_avg_samedomain ratio_sd_samedomain ratio_tot_sameip  ratio_max_sameip   ratio_avg_sameip    ratio_sd_sameip   
 Min.   :0.00000      Min.   :0.00000      Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.00000      1st Qu.:0.00000      1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000  
 Median :0.00000      Median :0.00000      Median :0.0000000   Median :0.00000   Median :0.000000   Median :0.0000000   Median :0.000000  
 Mean   :0.08588      Mean   :0.08116      Mean   :0.0037496   Mean   :0.03802   Mean   :0.020587   Mean   :0.0181529   Mean   :0.001514  
 3rd Qu.:0.05039      3rd Qu.:0.03571      3rd Qu.:0.0001406   3rd Qu.:0.02563   3rd Qu.:0.005566   3rd Qu.:0.0006352   3rd Qu.:0.000000  
 Max.   :1.00000      Max.   :1.00000      Max.   :0.3535534   Max.   :1.00000   Max.   :1.000000   Max.   :1.0000000   Max.   :0.489536  
   profilenum      currenttm                  
 Min.   :  0.0   Min.   :2017-03-07 07:16:51  
 1st Qu.: 59.0   1st Qu.:2017-03-08 20:16:51  
 Median :113.0   Median :2017-03-10 14:16:51  
 Mean   :105.8   Mean   :2017-03-10 21:25:51  
 3rd Qu.:155.0   3rd Qu.:2017-03-12 20:16:51  
 Max.   :192.0   Max.   :2017-03-15 07:16:51  

Selecting features

# Build the feature table used for correlation analysis and clustering.
# ratio_requests scales each profile's request count by the largest request
# count observed in the whole table.
#
# Alternative feature sets that were tried and left disabled:
#   select(client,tot_requests, tot_detected,tot_nx,tot_mx,tot_fail,tot_reverse,tot_samedomain,tot_sameip)
#   select(client,tot_requests,tot_detected,max_sameip,avg_sameip,sd_sameip,max_samedomain,avg_samedomain,max_samedomain,ratio_nx)
client_profile_selected_features <- clients_profiles %>%
  mutate(ratio_requests = tot_requests / max(tot_requests)) %>%
  select(
    client, tot_requests, ratio_requests, ratio_detected,
    ratio_nx, ratio_mx, ratio_fail, ratio_reverse,
    ratio_tot_samedomain, ratio_tot_sameip,
    ratio_avg_samedomain, ratio_avg_sameip,
    ratio_sd_samedomain, ratio_sd_sameip
  )

# De-duplicated copy of the features. The original statement read
# client_profile_ratio before it had ever been assigned, which fails in a
# fresh session; it is now derived from the table built just above.
client_profile_ratio <- unique(client_profile_selected_features)

client_profile_selected_features

Correlation matrix heatmap

library(d3heatmap)

# Pairwise correlations between the ratio features. Columns 1 and 2
# (client, tot_requests) are excluded; rows containing NA are dropped.
corr_client_profile_selected_features <- cor(
  client_profile_selected_features[, c(-1, -2)],
  use = "complete.obs"
)

# Interactive heatmap of the correlation matrix. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
d3heatmap(
  corr_client_profile_selected_features,
  symm = TRUE,
  colors = "Reds",
  Rowv = TRUE,
  width = 600,
  height = 400,
  yaxis_font_size = "8pt",
  xaxis_font_size = "8pt"
)

Per Feature Scatter plot

# Scatter-plot matrix of every feature except the first two columns
# (client, tot_requests).
pairs(
  client_profile_selected_features[, -(1:2)],
  cex.labels = 1.6
)

# ratio_nx against ratio_detected, one point per row; point transparency
# follows tot_requests and a linear fit is overlaid.
ggplot(client_profile_ratio, aes(x = ratio_nx, y = ratio_detected)) +
  geom_point(aes(alpha = tot_requests), color = "skyblue") +
  geom_smooth(method = "lm", color = "orange") +
  theme_bw()

Kmeans clustering

# kmeans() starts from randomly chosen centres, so cluster numbering changes
# from run to run unless the RNG is seeded. The notes below refer to clusters
# by number, so fix the seed to keep those numbers stable.
set.seed(1234)

# Cluster on every feature except client and tot_requests (columns 1 and 2).
kmeans_model <- kmeans(
  client_profile_selected_features[, c(-1, -2)],
  centers = 5,
  nstart = 40
)

# Attach the assigned cluster label to each profile row.
client_profile_ratio <- cbind(
  client_profile_selected_features,
  cluster = as.factor(kmeans_model$cluster)
)

# Number of profiles per cluster.
client_profile_ratio %>%
  group_by(cluster) %>%
  summarise(n = n())

# Persist the fitted model.
save(kmeans_model, file = "../models/kmeans_model_1h_last8days.Rdata")

Some basic per cluster information

# Per-cluster means of the main ratio features, computed over distinct rows
# once tot_requests (column 2) has been removed. n is the number of distinct
# profiles in the cluster and ipdist the number of distinct clients.
cluster_summary <- unique(client_profile_ratio[, c(-2)]) %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    nx = mean(ratio_nx),
    mx = mean(ratio_mx),
    sameip = mean(ratio_tot_sameip),
    samedomain = mean(ratio_tot_samedomain),
    reverse = mean(ratio_reverse),
    detected = mean(ratio_detected),
    requests = mean(ratio_requests),
    fail = mean(ratio_fail),
    ipdist = n_distinct(client)
  )

cluster_summary

# Heatmap of the per-cluster means only: columns 1, 2 and 11
# (cluster, n, ipdist) are dropped. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
d3heatmap(
  cluster_summary[, c(-1, -2, -11)],
  colors = "Reds",
  Rowv = TRUE,
  width = 700,
  height = 400,
  yaxis_font_size = "8pt",
  xaxis_font_size = "8pt"
)

Clustering Results

According to clustering results

  1. Cluster n. 1 groups clients with high NX replies as well as some reverse queries among others. (possible DGA)
  2. Cluster n. 2 groups those clients not showing DGA or other malware behavior (Possible NORMAL??)
  3. Cluster n. 3 groups those clients with the highest samedomains values (possible Fastflux?)
  4. Cluster n. 4 groups those clients with a value increment for sameip and samedomains (Possible DGA/Fastflux? Not really)
  5. Cluster n. 5 groups those clients with high values in FAIL queries (Possible what??)

Some IPs to check

213.191.105.210 shows some possible random generated domains

213.191.105.242 Shows a lot of NX records. Some queries contain the kk. TLD (??)

t-SNE 2D representation with clusters

Limitations of PCA

PCA is a linear algorithm, so it cannot capture complex polynomial relationships between features. t-SNE, on the other hand, is based on probability distributions with random walks on neighborhood graphs to find the structure within the data.

# t-SNE starts from a random initial layout; seed the RNG so the embedding
# (and the plot built from it) is the same on every run.
set.seed(1234)

# Embed the distinct profiles in 2D. Column 2 (tot_requests) and the last
# column (the cluster label) are removed from the input.
# NOTE(review): column 1 (client, a character column) is still part of the
# input. The plotting code pairs tsne_model$Y row by row with
# unique(client_profile_ratio[, c(-2)]), so dropping client here would change
# the row count. Confirm whether feeding client to Rtsne() is intended.
# NOTE(review): pca_center / pca_scale are passed together with pca = FALSE;
# presumably they have no effect in that case, verify against the Rtsne docs.
tsne_model <- Rtsne(
  unique(client_profile_ratio[, c(-2, -ncol(client_profile_ratio))]),
  pca = FALSE,
  pca_center = TRUE,
  pca_scale = TRUE,
  check_duplicates = FALSE,
  perplexity = 400,
  max_iter = 1000
)

To remember: It seems that if we generate 1h profiles, the 2D representation tends to be more difficult to analyze. There are many very similar points corresponding to the same client profile at different time periods. Using a higher perplexity value seems to solve that issue.

Each point represents a 1-hour profile. The size of a given point/profile corresponds to the request ratio for that profile. Finally, color represents the different clusters. Remember you can select/deselect a cluster by clicking the legend.

# Distinct profiles with tot_requests (column 2) removed.
# NOTE(review): assumed to line up row by row with the rows given to Rtsne().
client_profile_unique <- unique(client_profile_ratio[, c(-2)])

# 2D coordinates plus the metadata needed for colouring, sizing and tooltips.
tsne_client_profile <- data.frame(
  tsne_model$Y,
  cluster = client_profile_unique$cluster,
  client = client_profile_unique$client,
  requests = client_profile_unique$ratio_requests
)

# Random permutation of the rows (presumably so that drawing order does not
# favour any cluster).
sampleid <- sample(nrow(tsne_client_profile), nrow(tsne_client_profile))

# Axis labels now match the mapped aesthetics: the original labelled the
# x axis "X2" and the y axis "X1" while plotting x = X1, y = X2.
g <- ggplot(tsne_client_profile[sampleid, ], aes(x = X1, y = X2)) +
  geom_point(aes(color = as.factor(cluster), size = requests, text = paste("ipddr", client))) +
  #geom_point(aes(shape=asignacion),size=3)+
  xlab("X1") + ylab("X2") +
  theme_classic() +
  #scale_shape_manual(values=c(8,6))+
  guides(color = FALSE, alpha = FALSE)

# Convert to an interactive plotly widget; the text aesthetic feeds the tooltip.
ggplotly(g)

Decision Tree Analysis

library(rpart.plot)
library(rpart)
library(rpart.utils)

# Model the k-means cluster label as a function of all remaining columns.
formula <- as.formula(cluster ~ .)

# Columns 1-4 (client, tot_requests, ratio_requests, ratio_detected) are
# removed from the data before fitting. `<-` and TRUE replace the original
# `=` assignment and the reassignable T shorthand.
tree <- rpart(
  formula,
  data = client_profile_ratio[, c(-1, -2, -3, -4)],
  control = rpart.control(minsplit = 500, cp = 0.005, xval = 10),
  model = TRUE,
  x = TRUE,
  y = TRUE
)

# Draw the fitted tree.
rpart.plot(
  tree,
  extra = 4,
  box.palette = "GnBu",
  branch.lty = 5,
  shadow.col = 0,
  nn = TRUE,
  cex = 0.9,
  under = TRUE
)

# Complexity-parameter table for the fitted tree.
printcp(tree)

Clients in cluster 1

# Clients with at least one profile in cluster 1, most profiles first.
tsne_client_profile %>%
  filter(cluster == 1) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)

Clients in cluster 2

# Clients with at least one profile in cluster 2, most profiles first.
tsne_client_profile %>%
  filter(cluster == 2) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)

Clients in cluster 3

# Clients with at least one profile in cluster 3, most profiles first.
tsne_client_profile %>%
  filter(cluster == 3) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)

Clients in cluster 4

# Clients with at least one profile in cluster 4, most profiles first.
tsne_client_profile %>%
  filter(cluster == 4) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)

Clients in cluster 5

# Clients with at least one profile in cluster 5, most profiles first.
tsne_client_profile %>%
  filter(cluster == 5) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)

Clients in cluster 6

# Clients with at least one profile in cluster 6, most profiles first.
# NOTE(review): the k-means model in this file is fitted with centers = 5,
# so no profile carries label 6 and this query returns zero rows.
tsne_client_profile %>%
  filter(cluster == 6) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  arrange(desc(profiles)) %>%
  filter(profiles > 0) %>%
  select(client)
