The data contains 1554 binary features of 3278 images, each image of class ‘ad’ or ‘nonad’. There is data on 458 advertisements and 2820 non-advertisements.

Load Data and Organise

Data is loaded and the names file is used to name data columns.

library(adabag)

ad.names<-readLines('ad.names')
ad.data<-read.csv('ad2.csv')

ad.data[1:5,1:5]
##   X0 X0.1 X0.2 X0.3 X0.4
## 1  0    0    0    0    0
## 2  0    0    0    0    0
## 3  0    0    0    0    0
## 4  0    0    0    0    0
## 5  0    0    0    0    0
# Add heading names
names=ad.names[-(1:4)]

#remove description lines
names=names[!grepl('terms',names)]
#add final column
names[1555]='ad.nonad'
colnames(ad.data)=names

ad.data[1:5,1:5]
##   url*images+buttons: 0,1. url*likesbooks.com: 0,1.
## 1                        0                        0
## 2                        0                        0
## 3                        0                        0
## 4                        0                        0
## 5                        0                        0
##   url*www.slake.com: 0,1. url*hydrogeologist: 0,1. url*oso: 0,1.
## 1                       0                        0             0
## 2                       0                        0             0
## 3                       0                        0             0
## 4                       0                        0             0
## 5                       0                        0             0
# Get features dataframe and ad/nonad boolian response
features=ad.data[,-1555]
ad.nonad=ad.data[,1555]=='ad.'

Compute Summary Stats

Features are split into 5 groups - URL, original URL, anchor text, Caption and alt text. A function was written to gather the total features present per group for an image and applied to all images.

#function to get summary by term
summary.ad_data=function(data){
  terms=sub("\\*.*$", "", colnames(data))
  a=aggregate(t(data[,1:dim(data)[2]]),by=list(terms[1:length(terms)]),sum)
  a=t(a)
  names=a[1,]
  colnames(a)=names
  a=a[-1,]
  
  m=matrix(sapply(a[,],as.numeric),ncol=5)
  colnames(m)=names
  return(m)
}

data.summary=summary.ad_data(features)

head(data.summary)
##      alt ancurl caption origurl url
## [1,]   1      4       0       1   4
## [2,]   1     11       0       2   7
## [3,]   3     13       0       2   7
## [4,]   0     13       0       2   8
## [5,]   0     13       0       3   7
## [6,]   0     15       0       3   8

Preliminary Analysis

First we examine summary statistics by group for adverts and non-adverts.

#boxplots of totals  by group
boxplot(data.summary[ad.nonad,],ylim=c(0,40),main='Ads')
grid()

boxplot(data.summary[!ad.nonad,],ylim=c(0,40),main='Non Ads')
grid()

At first glance it appears that the presence of anchor text is a sign that an image is an advert.

The number of features by group is plotted for all advert images and a similarly sized sample of non-advert images below.

matplot(main = "Ad", ylab = "Number of features", xlab = "Image", data.summary[sample(1:457, 
    200, replace = F, ), ], type = "h", col = 1:5, lty = 1)
legend("topright", legend = c(colnames(data.summary)), col = 1:5, pch = 20)

matplot(main = "Non ad", ylab = "Number of features", xlab = "Image", data.summary[sample(458:3278, 
    200, replace = F), ], type = "h", col = 1:5, ylim = c(0, 35))
legend("topright", legend = c(colnames(data.summary)), col = 1:5, pch = 20)

The above graphs help to confirm suspicions that a large number of anchor text features is a strong indicator that an image is an advert. Also a high number of origurl features may also be a sign.

Classification

A number of classifiers were tried including logistic regression, random forests and boosting. Boosting performed best on training and test data and is demonstrated below.

Boosting on full set of features

A test set of 20% of images was taken and a model trained on the remaining 80%.

# Get training/test sets
N=nrow(ad.data)
train=sample(1:N,0.8*N)
test=setdiff(1:N,train)

set.seed(100)

# Fit model
fit.boost=boosting(ad.nonad~.,data=ad.data,subset = train,mfinal = 1)

# Make predictions on test data
pred.boost=predict.boosting(fit.boost,newdata = ad.data[test,])

# Check accuracy of model on test data
sum(fit.boost$class[test]==ad.data$ad.nonad[test])/length(test)
## [1] 0.9740854

The accuracy appears very good but as this is a class imbalance problem then accuracy is not a suitable metric to test performance. Other metrics must be checked. A function was written to compute specitivity, sensitivity etc.

table.summary=function(truth,predicted){
  t=table(truth,predicted)
  acc=(t[1,1]+t[2,2])/sum(t)
  TPR=t[1,1]/(t[1,1]+t[1,2])
  TNR=t[2,2]/(t[2,2]+t[2,1])
  PPV=t[1,1]/(t[1,1]+t[2,1])
  NPV=t[2,2]/(t[1,2]+t[2,2])
  print(t)  
  cat('\nAccuracy: ',acc)
  cat('\nTrue positive rate: ',TPR)
  cat('\nTrue negative rate: ',TNR)
  cat('\nPositive Predictive Value: ',PPV)
  cat('\nNegative Predictive Value: ',NPV)
  cat('\n\n')
  

}

# Check performance on training data
table.summary(ad.data$ad.nonad[train],fit.boost$class[train])
##         predicted
## truth     ad. nonad.
##   ad.     295     79
##   nonad.   29   2219
## 
## Accuracy:  0.9588101
## True positive rate:  0.7887701
## True negative rate:  0.9870996
## Positive Predictive Value:  0.9104938
## Negative Predictive Value:  0.9656223
# Check performance on test data 
table.summary(ad.data$ad.nonad[test],pred.boost$class)
##         predicted
## truth    ad. nonad.
##   ad.     72     12
##   nonad.   5    567
## 
## Accuracy:  0.9740854
## True positive rate:  0.8571429
## True negative rate:  0.9912587
## Positive Predictive Value:  0.9350649
## Negative Predictive Value:  0.9792746

It appears that overfitting has been avoided as all metrics vary very little between the training and test data. When using boosting for classification it is important to be aware of the potential of overfitting the model to training data.

Boosting on summary features

In order to produce a more generalisable model it was investigated whether a model trained using only total group statistics could predict adverts effectively.

# Set up dataframe for fitting, 5 features and a response variable
boost_summary=as.data.frame(data.summary)
boost_summary=cbind(boost_summary,as.factor(ad.data$ad.nonad))
colnames(boost_summary)[6]='ad.nonad'

set.seed(100)

# Can model be fitted on the following data?
head(boost_summary)
##   alt ancurl caption origurl url ad.nonad
## 1   1      4       0       1   4      ad.
## 2   1     11       0       2   7      ad.
## 3   3     13       0       2   7      ad.
## 4   0     13       0       2   8      ad.
## 5   0     13       0       3   7      ad.
## 6   0     15       0       3   8      ad.
# Fit model
fit.boost=boosting(ad.nonad~.,data=boost_summary[train,],mfinal = 100)

# Predict test data
pred.boost=predict.boosting(fit.boost,newdata = boost_summary[test,])

# Check performance on training data
table.summary(boost_summary$ad.nonad[train],fit.boost$class)
##         predicted
## truth     ad. nonad.
##   ad.     263    111
##   nonad.   21   2227
## 
## Accuracy:  0.9496568
## True positive rate:  0.7032086
## True negative rate:  0.9906584
## Positive Predictive Value:  0.9260563
## Negative Predictive Value:  0.9525235
# Check performance on test data
table.summary(ad.data$ad.nonad[test],pred.boost$class)
##         predicted
## truth    ad. nonad.
##   ad.     59     25
##   nonad.   9    563
## 
## Accuracy:  0.9481707
## True positive rate:  0.702381
## True negative rate:  0.9842657
## Positive Predictive Value:  0.8676471
## Negative Predictive Value:  0.957483
# Examine feature importance
fit.boost$importance
##       alt    ancurl   caption   origurl       url 
## 11.068806 49.887351  4.241535 16.478729 18.323578

This model performs surprisingly well considering the nuber of features has been reduced from 1554 to only 5! There is room for improvement in true positive rate (probably the most important metric), but it is an encouraging start. As expected, the presence on anchor text is by far the most substantial indicator of an image being an advert as demonstrated by the importance as deterined by the boosting model.