timestamp client query query_type answer_ip domain_l1 domain_l2
Min. :2017-02-20 23:12:13 Length:10000000 Length:10000000 Length:10000000 Length:10000000 Length:10000000 Length:10000000
1st Qu.:2017-03-07 19:37:18 Class :character Class :character Class :character Class :character Class :character Class :character
Median :2017-03-08 22:39:36 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
Mean :2017-03-09 07:33:45
3rd Qu.:2017-03-11 12:45:21
Max. :2017-03-15 07:16:55
ttl answer dga.probability dga.class
Min. :0.000e+00 Length:10000000 Min. :0 Min. :0
1st Qu.:0.000e+00 Class :character 1st Qu.:0 1st Qu.:0
Median :3.000e+01 Mode :character Median :0 Median :0
Mean :4.872e+03 Mean :0 Mean :0
3rd Qu.:3.000e+02 3rd Qu.:0 3rd Qu.:0
Max. :2.147e+09 Max. :1 Max. :1
NA's :388724 NA's :388724
Feature Generation
Feature file available at https://www.dropbox.com/s/vl9y5p32e2umxu1/wbone-client-profile-1h-last8days.csv?dl=1
# Print per-column summary statistics for the client profile table.
# NOTE(review): clients_profiles is presumably loaded from the feature file
# linked above -- the loading step is not shown here; confirm.
summary(clients_profiles)
client tot_requests tot_detected ratio_detected tot_nx ratio_nx tot_mx
Length:15870 Min. : 1.0 Min. : 0.000 Min. :0.0000 Min. : 0.0 Min. :0.00000 Min. : 0.0000
Class :character 1st Qu.: 2.0 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 0.0 1st Qu.:0.00000 1st Qu.: 0.0000
Mode :character Median : 6.0 Median : 0.000 Median :0.0000 Median : 0.0 Median :0.00000 Median : 0.0000
Mean : 516.7 Mean : 3.822 Mean :0.0082 Mean : 149.8 Mean :0.10165 Mean : 0.6547
3rd Qu.: 43.0 3rd Qu.: 0.000 3rd Qu.:0.0000 3rd Qu.: 3.0 3rd Qu.:0.07692 3rd Qu.: 0.0000
Max. :61787.0 Max. :609.000 Max. :1.0000 Max. :54037.0 Max. :1.00000 Max. :170.0000
ratio_mx tot_fail ratio_fail tot_reverse ratio_reverse max_sameip tot_sameip
Min. :0.000000 Min. : 0.0 Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.000 Min. : 0.00
1st Qu.:0.000000 1st Qu.: 0.0 1st Qu.:0.00000 1st Qu.: 0.00 1st Qu.:0.00000 1st Qu.: 0.000 1st Qu.: 0.00
Median :0.000000 Median : 0.0 Median :0.00000 Median : 0.00 Median :0.00000 Median : 0.000 Median : 0.00
Mean :0.002088 Mean : 105.5 Mean :0.03087 Mean : 15.35 Mean :0.02532 Mean : 4.256 Mean : 55.59
3rd Qu.:0.000000 3rd Qu.: 0.0 3rd Qu.:0.00000 3rd Qu.: 0.00 3rd Qu.:0.00000 3rd Qu.: 2.000 3rd Qu.: 2.00
Max. :1.000000 Max. :12510.0 Max. :1.00000 Max. :2177.00 Max. :1.00000 Max. :1610.000 Max. :5692.00
avg_sameip sd_sameip tot_samedomain max_samedomain avg_samedomain sd_samedomain ratio_tot_samedomain
Min. : 0.0000 Min. : 0.0000 Min. : 0.00 Min. : 0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
Median : 0.0000 Median : 0.0000 Median : 0.00 Median : 0.000 Median :0.000 Median :0.0000 Median :0.0000
Mean : 0.7989 Mean : 0.4673 Mean : 68.87 Mean : 2.177 Mean :1.158 Mean :0.2819 Mean :0.1419
3rd Qu.: 2.0000 3rd Qu.: 0.0000 3rd Qu.: 7.00 3rd Qu.: 3.000 3rd Qu.:2.444 3rd Qu.:0.4754 3rd Qu.:0.2208
Max. :41.7500 Max. :70.1429 Max. :5310.00 Max. :84.000 Max. :9.000 Max. :5.6569 Max. :1.0000
ratio_max_samedomain ratio_avg_samedomain ratio_sd_samedomain ratio_tot_sameip ratio_max_sameip ratio_avg_sameip ratio_sd_sameip
Min. :0.00000 Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.0000000 Min. :0.000000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.000000
Median :0.00000 Median :0.00000 Median :0.0000000 Median :0.00000 Median :0.000000 Median :0.0000000 Median :0.000000
Mean :0.08588 Mean :0.08116 Mean :0.0037496 Mean :0.03802 Mean :0.020587 Mean :0.0181529 Mean :0.001514
3rd Qu.:0.05039 3rd Qu.:0.03571 3rd Qu.:0.0001406 3rd Qu.:0.02563 3rd Qu.:0.005566 3rd Qu.:0.0006352 3rd Qu.:0.000000
Max. :1.00000 Max. :1.00000 Max. :0.3535534 Max. :1.00000 Max. :1.000000 Max. :1.0000000 Max. :0.489536
profilenum currenttm
Min. : 0.0 Min. :2017-03-07 07:16:51
1st Qu.: 59.0 1st Qu.:2017-03-08 20:16:51
Median :113.0 Median :2017-03-10 14:16:51
Mean :105.8 Mean :2017-03-10 21:25:51
3rd Qu.:155.0 3rd Qu.:2017-03-12 20:16:51
Max. :192.0 Max. :2017-03-15 07:16:51
Selecting features
# Build the feature table used by the correlation, clustering and tree steps.
# ratio_requests rescales each profile's request count by the largest request
# count in the data, so the busiest profile gets a value of 1.
client_profile_selected_features <- clients_profiles %>%
  mutate(ratio_requests = tot_requests / max(tot_requests)) %>%
  select(
    client, tot_requests, ratio_requests,
    ratio_detected, ratio_nx, ratio_mx, ratio_fail, ratio_reverse,
    ratio_tot_samedomain, ratio_tot_sameip,
    ratio_avg_samedomain, ratio_avg_sameip,
    ratio_sd_samedomain, ratio_sd_sameip
  )

# Alternative feature sets tried earlier (kept for reference):
# select(client,tot_requests, tot_detected,tot_nx,tot_mx,tot_fail,tot_reverse,tot_samedomain,tot_sameip)
# select(client,tot_requests,tot_detected,max_sameip,avg_sameip,sd_sameip,max_samedomain,avg_samedomain,max_samedomain,ratio_nx)

# A stray `client_profile_ratio <- unique(client_profile_ratio)` used to sit
# here. That object is only created by the k-means step further down, so the
# statement failed with "object not found" in a fresh session and only worked
# when a previous run had left the object in the workspace. It was removed.

# Print the selected features for inspection.
client_profile_selected_features
Matrix Correlation heatmap
library(d3heatmap)

# Pairwise correlation matrix of the ratio features. Columns 1 and 2
# (client, tot_requests) are excluded; rows containing NA are dropped
# via use = "complete.obs".
corr_client_profile_selected_features <- cor(
  client_profile_selected_features[, -c(1, 2)],
  use = "complete.obs"
)

# Interactive heatmap of the correlation matrix.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
d3heatmap(
  corr_client_profile_selected_features,
  symm = TRUE,
  colors = "Reds",
  Rowv = TRUE,
  width = 600,
  height = 400,
  yaxis_font_size = "8pt",
  xaxis_font_size = "8pt"
)
Per Feature Scatter plot
# Scatter-plot matrix of every pair of ratio features
# (columns 1 and 2, client and tot_requests, are left out).
pairs(client_profile_selected_features[, -c(1, 2)], cex.labels = 1.6)

# ratio_nx against ratio_detected with a linear fit; point transparency
# follows tot_requests.
# The plot reads client_profile_selected_features because
# client_profile_ratio does not exist yet at this point of the script (it is
# created by the k-means step below). All three plotted columns are already
# present in client_profile_selected_features.
ggplot(client_profile_selected_features, aes(ratio_nx, ratio_detected)) +
  geom_point(aes(alpha = tot_requests), color = "skyblue") +
  geom_smooth(method = "lm", color = "orange") +
  theme_bw()
Kmeans clustering
# kmeans() starts from randomly chosen centres, so without a fixed seed the
# cluster numbering (and the model saved below) changes between runs, which
# would invalidate any per-cluster interpretation written against the labels.
set.seed(42)

# Cluster the ratio features (client and tot_requests excluded) into 5 groups,
# keeping the best of 40 random starts.
kmeans_model <- kmeans(
  client_profile_selected_features[, -c(1, 2)],
  centers = 5,
  nstart = 40
)

# Attach the cluster label to every profile as a factor column.
client_profile_ratio <- cbind(
  client_profile_selected_features,
  cluster = as.factor(kmeans_model$cluster)
)

# Number of profiles per cluster.
client_profile_ratio %>%
  group_by(cluster) %>%
  summarise(n = n())

# Persist the fitted model for later reuse.
save(kmeans_model, file = "../models/kmeans_model_1h_last8days.Rdata")
Clustering Results
According to the clustering results:
- Cluster n. 1 groups clients with high NX replies as well as some reverse queries among others. (possible DGA)
- Cluster n. 2 groups those clients not showing DGA or other malware behavior (Possible NORMAL??)
- Cluster n. 3 groups those clients with the highest samedomains values (possible Fastflux?)
- Cluster n. 4 groups those clients with a value increment for sameip and samedomains (Possible DGA/Fastflux? Not really)
- Cluster n. 5 groups those clients with high values in FAIL queries (Possible what??)
Some IPs to check
213.191.105.210 shows some possible random generated domains
213.191.105.242 Shows a lot of NX records. Some queries contain the kk. TLD (??)
t-SNE 2D representation with clusters
Limitations of PCA
PCA is a linear algorithm. It will not be able to interpret complex polynomial relationships between features. On the other hand, t-SNE is based on probability distributions with random walks on neighborhood graphs to find the structure within the data.
# 2D t-SNE embedding of the de-duplicated client profiles.
# Column 2 (tot_requests) and the last column (cluster) are removed before
# de-duplicating, so check_duplicates can be switched off.
# NOTE(review): column 1 (client, a character column) is still part of the
# input; confirm that feeding it to Rtsne is intended. It cannot be dropped
# here alone without breaking the row alignment with client_profile_unique,
# which is built from the same columns plus cluster.
# NOTE(review): pca_center / pca_scale presumably only matter when
# pca = TRUE -- confirm against the Rtsne documentation.
tsne_model <- Rtsne(
  unique(client_profile_ratio[, -c(2, ncol(client_profile_ratio))]),
  pca = FALSE,
  pca_center = TRUE,
  pca_scale = TRUE,
  check_duplicates = FALSE,
  perplexity = 400,
  max_iter = 1000
)
To remember: It seems that if we generate 1h profiles, the 2D representation tends to be more difficult to analyze. There are many very similar points corresponding to the same client profile at different time periods. Using a higher perplexity value seems to solve that issue.
Each point represents a 1-hour profile. The size of a given point/profile corresponds to the request ratio for that profile. Finally, color represents the different clusters. Remember you can select/deselect a cluster by clicking the legend.
# De-duplicated profiles (tot_requests dropped), used to label the t-SNE
# coordinates row by row.
client_profile_unique <- unique(client_profile_ratio[, -2])

# Combine the 2D embedding (columns X1, X2) with cluster, client and the
# request ratio of each profile.
tsne_client_profile <- data.frame(
  tsne_model$Y,
  cluster = client_profile_unique$cluster,
  client = client_profile_unique$client,
  requests = client_profile_unique$ratio_requests
)

# Random permutation of the rows so that the drawing order is shuffled.
sampleid <- sample(nrow(tsne_client_profile), nrow(tsne_client_profile))

g <- ggplot(tsne_client_profile[sampleid, ], aes(x = X1, y = X2)) +
  geom_point(aes(
    color = as.factor(cluster),
    size = requests,
    text = paste("ipddr", client)
  )) +
  # geom_point(aes(shape=asignacion),size=3)+
  # Axis labels now match the mapped variables; they were swapped before
  # (x axis labelled "X2", y axis labelled "X1").
  xlab("X1") + ylab("X2") +
  theme_classic() +
  # scale_shape_manual(values=c(8,6))+
  # "none" is the supported way to suppress a guide; FALSE is deprecated.
  guides(color = "none", alpha = "none")

# Interactive version of the plot.
ggplotly(g)
Decision Tree Analysis
library(rpart.plot)
library(rpart)
library(rpart.utils)

# Explain the k-means cluster label from the remaining features with a
# classification tree.
formula <- as.formula(cluster ~ .)

# Columns 1-4 (client, tot_requests, ratio_requests, ratio_detected) are
# excluded from the predictors.
# `<-` and TRUE replace the original `=` assignment and T shorthand.
tree <- rpart(
  formula,
  data = client_profile_ratio[, -(1:4)],
  control = rpart.control(minsplit = 500, cp = 0.005, xval = 10),
  model = TRUE,
  x = TRUE,
  y = TRUE
)

# Draw the fitted tree.
rpart.plot(
  tree,
  extra = 4,
  box.palette = "GnBu",
  branch.lty = 5,
  shadow.col = 0,
  nn = TRUE,
  cex = 0.9,
  under = TRUE
)

# Complexity-parameter table of the fitted tree.
printcp(tree)
Clients in cluster 1
# Clients with profiles assigned to cluster 1, most profiles first.
tsne_client_profile %>%
  filter(cluster == 1) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
Clients in cluster 2
# Clients with profiles assigned to cluster 2, most profiles first.
tsne_client_profile %>%
  filter(cluster == 2) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
Clients in cluster 3
# Clients with profiles assigned to cluster 3, most profiles first.
tsne_client_profile %>%
  filter(cluster == 3) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
Clients in cluster 4
# Clients with profiles assigned to cluster 4, most profiles first.
tsne_client_profile %>%
  filter(cluster == 4) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
Clients in cluster 5
# Clients with profiles assigned to cluster 5, most profiles first.
tsne_client_profile %>%
  filter(cluster == 5) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
Clients in cluster 6
# Clients with profiles assigned to cluster 6, most profiles first.
# NOTE(review): the k-means step above is fitted with centers=5, so no
# profile carries label 6 and this query comes back empty -- confirm whether
# it is a leftover from a run with six clusters.
tsne_client_profile %>%
  filter(cluster == 6) %>%
  group_by(client) %>%
  summarise(profiles = n()) %>%
  filter(profiles > 0) %>%
  arrange(desc(profiles)) %>%
  select(client)
