timestamp                      client             query            query_type         answer_ip          domain_l1          domain_l2        
 Min.   :2017-02-20 23:12:13   Length:10000000    Length:10000000    Length:10000000    Length:10000000    Length:10000000    Length:10000000   
 1st Qu.:2017-03-07 19:37:18   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character  
 Median :2017-03-08 22:39:36   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Mean   :2017-03-09 07:33:45                                                                                                                    
 3rd Qu.:2017-03-11 12:45:21                                                                                                                    
 Max.   :2017-03-15 07:16:55                                                                                                                    
                                                                                                                                                
      ttl               answer          dga.probability    dga.class     
 Min.   :0.000e+00   Length:10000000    Min.   :0        Min.   :0       
 1st Qu.:0.000e+00   Class :character   1st Qu.:0        1st Qu.:0       
 Median :3.000e+01   Mode  :character   Median :0        Median :0       
 Mean   :4.872e+03                      Mean   :0        Mean   :0       
 3rd Qu.:3.000e+02                      3rd Qu.:0        3rd Qu.:0       
 Max.   :2.147e+09                      Max.   :1        Max.   :1       
                                        NA's   :388724   NA's   :388724  

Feature Generation

Feature file available at https://www.dropbox.com/s/pbz2iqcedn4kef5/wbone-client-profile-24hs-last3weeks.csv?dl=1

# Feature generation: build one behavioural profile per client for each
# 24-hour window, walking backwards in time from `initial_tm`. Each iteration
# yields a data frame with one row per client seen in that window; the
# per-window data frames are stacked with rbind into `clients_profiles`.
max_num_profiles <- 21
windowsize <- 60 * 24 # window length in minutes (24 hours)
# Reference time: the timestamp of the second-to-last row of `domains`.
initial_tm <- domains$timestamp[nrow(domains) - 1]
# NOTE(review): 0:max_num_profiles produces max_num_profiles + 1 windows
# (22 with the value above) -- confirm this is intended.
clients_profiles <- foreach(
  profilenum = 0:max_num_profiles,
  .combine = "rbind",
  .multicombine = FALSE,
  # Attach the packages used in the body on every worker so the loop also
  # runs on backends that do not inherit the parent session.
  .packages = c("dplyr", "lubridate")
) %dopar% {
  # End of this window; window 0 ends at initial_tm.
  currenttm <- initial_tm - (minutes(windowsize) * profilenum)
  window_start <- currenttm - minutes(windowsize)
  # Half-open interval (window_start, currenttm]: a record stamped exactly on
  # a boundary belongs to exactly one window instead of being dropped by both.
  domains_window <- domains %>%
    filter(timestamp > window_start & timestamp <= currenttm)

  # Per-client counters and their ratio over the client's total requests.
  # Everything is computed in a single summarise so each ratio is derived
  # from the same row as its counter, not by positional alignment of tables.
  client_counts <- domains_window %>%
    group_by(client) %>%
    summarise(
      # Total amount of domains requested
      tot_requests = n(),
      # Requests flagged by the classifier (dga.class == 1); NAs are ignored
      tot_detected = sum(ifelse(dga.class == 1, 1, 0), na.rm = TRUE),
      ratio_detected = tot_detected / tot_requests,
      # Answers containing NXDOMAIN
      tot_nx = sum(grepl("NXDOMAIN", answer)),
      ratio_nx = tot_nx / tot_requests,
      # MX queries
      tot_mx = sum(query_type == "MX"),
      ratio_mx = tot_mx / tot_requests,
      # Answers containing SERVFAIL
      tot_fail = sum(grepl("SERVFAIL", answer)),
      ratio_fail = tot_fail / tot_requests,
      # Reverse lookups. fixed = TRUE matches the literal text; as a regular
      # expression the dots would match any character.
      tot_reverse = sum(grepl("in-addr.arpa.", query, fixed = TRUE)),
      ratio_reverse = tot_reverse / tot_requests
    )

  # A queries that returned something other than the NXDOMAIN / SERVFAIL
  # markers; shared by the two feature families below.
  resolved_a <- domains_window %>%
    filter(query_type == "A" & answer_ip != "NXDOMAIN" & answer_ip != "SERVFAIL")

  # Domains sharing the same IP: per (client, answer_ip), the number of
  # distinct queries. Only IPs seen with more than one query are kept, then
  # max / sum / mean / sd are taken per client.
  sameip_stats <- resolved_a %>%
    group_by(client, answer_ip) %>%
    summarise(tot_distquery = n_distinct(query)) %>%
    filter(tot_distquery > 1) %>%
    group_by(client) %>%
    summarise(
      max_sameip = max(tot_distquery),
      tot_sameip = sum(tot_distquery),
      avg_sameip = mean(tot_distquery),
      sd_sameip = sd(tot_distquery)
    )

  # Domains with more than one IP (fast flux): per (client, query), the
  # number of distinct answer IPs. Only queries seen with more than one IP
  # are kept, then sum / max / mean / sd are taken per client.
  samedomain_stats <- resolved_a %>%
    group_by(client, query) %>%
    summarise(tot_distdomain = n_distinct(answer_ip)) %>%
    filter(tot_distdomain > 1) %>%
    group_by(client) %>%
    summarise(
      tot_samedomain = sum(tot_distdomain),
      max_samedomain = max(tot_distdomain),
      avg_samedomain = mean(tot_distdomain),
      sd_samedomain = sd(tot_distdomain)
    )

  # Profile Database Creation ------
  # Clients without any qualifying IP/domain group get NA from the joins.
  client_profile <- client_counts %>%
    full_join(sameip_stats, by = "client") %>%
    full_join(samedomain_stats, by = "client")

  client_profile <- client_profile %>%
    mutate(
      # replace NAs with zeros (also covers sd of a single group, which is NA)
      max_sameip = ifelse(is.na(max_sameip), 0, max_sameip),
      tot_sameip = ifelse(is.na(tot_sameip), 0, tot_sameip),
      avg_sameip = ifelse(is.na(avg_sameip), 0, avg_sameip),
      sd_sameip = ifelse(is.na(sd_sameip), 0, sd_sameip),
      max_samedomain = ifelse(is.na(max_samedomain), 0, max_samedomain),
      tot_samedomain = ifelse(is.na(tot_samedomain), 0, tot_samedomain),
      avg_samedomain = ifelse(is.na(avg_samedomain), 0, avg_samedomain),
      sd_samedomain = ifelse(is.na(sd_samedomain), 0, sd_samedomain),
      # normalise every statistic by the client's request volume
      ratio_tot_samedomain = tot_samedomain / tot_requests,
      ratio_max_samedomain = max_samedomain / tot_requests,
      ratio_avg_samedomain = avg_samedomain / tot_requests,
      ratio_sd_samedomain = sd_samedomain / tot_requests,
      ratio_tot_sameip = tot_sameip / tot_requests,
      ratio_max_sameip = max_sameip / tot_requests,
      ratio_avg_sameip = avg_sameip / tot_requests,
      ratio_sd_sameip = sd_sameip / tot_requests
    )

  # Tag every row with the window index and the window end time.
  client_profile$profilenum <- profilenum
  client_profile$currenttm <- currenttm

  # Value of the block: this window's profiles, combined by foreach.
  client_profile
}

Selecting features

Matrix Correlation heatmap

library(d3heatmap)
# Correlation matrix of the selected features (the first two columns are
# dropped); only rows without missing values enter the computation.
corr_client_profile_selected_features <- cor(
  client_profile_selected_features[, -c(1, 2)],
  use = "complete.obs"
)
# Interactive heatmap of the correlation matrix.
d3heatmap(
  corr_client_profile_selected_features,
  symm = TRUE,
  colors = "Reds",
  Rowv = TRUE,
  width = 600,
  height = 400,
  yaxis_font_size = "8pt",
  xaxis_font_size = "8pt"
)

Per Feature Scatter plot

# Scatter-plot matrix of every pair of selected features (the first two
# columns are dropped).
pairs(client_profile_selected_features[, -c(1, 2)], cex.labels = 1.6)

# NXDOMAIN ratio against detected ratio; point transparency follows the
# number of requests and a linear fit is overlaid.
ggplot(
  data = client_profile_ratio,
  mapping = aes(x = ratio_nx, y = ratio_detected)
) +
  geom_point(mapping = aes(alpha = tot_requests), color = "skyblue") +
  geom_smooth(method = "lm", color = "orange") +
  theme_bw()

Kmeans clustering

# Cluster the profiles into 5 groups on the selected features (first two
# columns dropped), using 40 random starts.
# NOTE(review): no seed is set in this chunk, so cluster numbering may differ
# between runs -- confirm whether a seed is set elsewhere.
kmeans_model <- kmeans(
  client_profile_selected_features[, -c(1, 2)],
  centers = 5,
  nstart = 40
)
# Attach the assigned cluster to every profile as a factor column.
client_profile_ratio <- cbind(
  client_profile_selected_features,
  cluster = as.factor(kmeans_model$cluster)
)
# Number of profiles per cluster.
summarise(group_by(client_profile_ratio, cluster), n = n())
# Persist the fitted model.
save(kmeans_model, file = "../models/kmeans_model_24hs_last3weeks-2.Rdata")

Some basic per cluster information

# Per-cluster summary: profile count, mean of each ratio feature and number
# of distinct clients. Column 2 is dropped and duplicate rows removed before
# aggregating.
cluster_summary <- client_profile_ratio[, -2] %>%
  unique() %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    nx = mean(ratio_nx),
    mx = mean(ratio_mx),
    sameip = mean(ratio_tot_sameip),
    samedomain = mean(ratio_tot_samedomain),
    reverse = mean(ratio_reverse),
    detected = mean(ratio_detected),
    requests = mean(ratio_requests),
    fail = mean(ratio_fail),
    ipdist = n_distinct(client)
  )

cluster_summary
# Heatmap of the mean feature values only (columns 1, 2 and 11 -- cluster,
# n and ipdist -- are left out).
d3heatmap(
  cluster_summary[, -c(1, 2, 11)],
  colors = "Reds",
  Rowv = TRUE,
  width = 700,
  height = 400,
  yaxis_font_size = "8pt",
  xaxis_font_size = "8pt"
)

Clustering Results

According to clustering results

  1. Cluster n. 1 groups those clients with the highest samedomains values (possible Fastflux?)
  2. Cluster n. 2 groups those clients with a value increment for sameip and samedomains (Possible DGA/Fastflux? Not really)
  3. Cluster n. 3 groups clients with high NX replies as well as some reverse queries among others. (possible DGA)
  4. Cluster n. 4 groups those clients not showing DGA or other malware behavior (Possible NORMAL??)
  5. Cluster n. 5 groups those clients with high values in FAIL queries (Possible what??)

some IPS to check

213.191.105.210 shows some possibly randomly generated domains

213.191.105.242 Shows a lot of NX records. Some queries contain the kk. TLD (??)

t-SNE 2D representation with clusters

Limitations of PCA

PCA is a linear algorithm. It will not be able to interpret complex polynomial relationships between features. On the other hand, t-SNE is based on probability distributions with random walk on neighborhood graphs to find the structure within the data.

# t-SNE embedding of the de-duplicated profiles. Column 2 and the last
# column (the cluster label appended after k-means) are excluded from the
# input; the initial PCA step is disabled.
tsne_model <- client_profile_ratio[, -c(2, ncol(client_profile_ratio))] %>%
  unique() %>%
  Rtsne(
    pca = FALSE,
    pca_center = TRUE,
    pca_scale = TRUE,
    check_duplicates = FALSE,
    perplexity = 400,
    max_iter = 1000
  )

To remember: It seems that if we generate 1h profiles, the 2D representation tends to be more difficult to analyze. There are many very similar points corresponding to the same client profile at different time periods. Using a higher perplexity value seems to solve that issue.

Each point represents a 1 hour profile. The size of a given point/profile corresponds to the request ratio for that profile. Finally, color represents the different clusters. Remember you can select/deselect a cluster by clicking the legend.

# De-duplicated profiles (column 2 dropped), used to label the t-SNE points.
client_profile_unique <- unique(client_profile_ratio[, -2])
# Embedding coordinates (X1, X2) together with cluster, client and request
# ratio of each profile.
tsne_client_profile <- data.frame(
  tsne_model$Y,
  cluster = client_profile_unique$cluster,
  client = client_profile_unique$client,
  requests = client_profile_unique$ratio_requests
)
# Random permutation of the rows, so the drawing order is shuffled.
sampleid <- sample(nrow(tsne_client_profile), nrow(tsne_client_profile))
g <- ggplot(tsne_client_profile[sampleid, ], aes(x = X1, y = X2)) +
  geom_point(aes(
    color = as.factor(cluster),
    size = requests,
    text = paste("ipddr", client)
  )) +
  #geom_point(aes(shape=asignacion),size=3)+
  # Axis titles now match the mapped variables (they were swapped before:
  # the x axis showed "X2" and the y axis "X1").
  xlab("X1") + ylab("X2") +
  theme_classic() +
  #scale_shape_manual(values=c(8,6))+
  guides(color = FALSE, alpha = FALSE)
# Interactive version of the plot.
ggplotly(g)

Decision Tree Analysis

library(rpart.plot)
library(rpart)
library(rpart.utils)

# Fit a classification tree that predicts the cluster label from all
# remaining columns (the first four columns are dropped from the data).
formula <- as.formula(cluster ~ .)
tree <- rpart(
  formula,
  data = client_profile_ratio[, -c(1, 2, 3, 4)],
  control = rpart.control(minsplit = 500, cp = 0.005, xval = 10),
  model = TRUE,
  x = TRUE,
  y = TRUE
)
# Draw the fitted tree.
rpart.plot(
  tree,
  extra = 4,
  box.palette = "GnBu",
  branch.lty = 5,
  shadow.col = 0,
  nn = TRUE,
  cex = 0.9,
  under = TRUE
)
# Complexity-parameter table of the fit.
printcp(tree)

Clients in cluster 1

# Clients with at least one profile in cluster 1, most profiles first.
tsne_client_profile %>%
  filter(cluster == 1) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)

Clients in cluster 2

# Clients with at least one profile in cluster 2, most profiles first.
tsne_client_profile %>%
  filter(cluster == 2) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)

Clients in cluster 3

# Clients with at least one profile in cluster 3, most profiles first.
tsne_client_profile %>%
  filter(cluster == 3) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)

Clients in cluster 4

# Clients with at least one profile in cluster 4, most profiles first.
tsne_client_profile %>%
  filter(cluster == 4) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)

Clients in cluster 5

# Clients with at least one profile in cluster 5, most profiles first.
tsne_client_profile %>%
  filter(cluster == 5) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)

Clients in cluster 6

# Clients with at least one profile in cluster 6, most profiles first.
# NOTE(review): the k-means call in this file uses centers = 5, so this is
# expected to return no rows -- confirm whether the section is still needed.
tsne_client_profile %>%
  filter(cluster == 6) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
