Problem 0: Find, download and load the dataset

First, we’ll load the dataset. I chose Spam Base Dataset because it has decent number of attributes (not to many, not too few) and medium number of observations. First we need to load all the data from the UCI repository. This is a zip file, so we need to download and unzip it.

url ='http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.zip'
temp = tempfile()
download.file(url,temp, mode="wb")
unzip(zipfile=temp, exdir = './l1')

Let’s read the data and the column names. Processing column names took me a while since they were not in a suitable format. To convert them to real column names I read it ignoring the comments, then converted it to matrix and removed all the punctuation.

setwd('l1')
sbase = read.table("spambase.data", sep=",")
cnames = read.table("spambase.names", comment.char="|", header=F)[1]
cnames = gsub("[[:punct:]]", "", as.matrix(cnames))
cnames = c(cnames[c(2:nrow(cnames))],"target")
colnames(sbase) = cnames

Before doing any processing we need to take a look at the data. The following lines are commented out because the output takes too much space.

#head(sbase)
#summary(sbase)

Problem 1: Split the dataset

There are two common approaches to split the dataset in R. The first one is to take a random sample from the list of indices and then take a subset of the dataset based on the indices. We’ll start with that. And since we’re asked to write a procedure we’ll do that.

splitDataset = function(dataset, rep = FALSE) {
        if (rep) {set.seed(123)}
        #check how many rows we have
        rs = nrow(dataset)
        train_ind = sample.int(rs, size = round(0.7*rs), replace = FALSE)
        train = dataset[train_ind,]
        test = dataset[-train_ind,]
        list(train=train,test=test)
        }

Or we can use even more sophisticated approach that takes into consideration target variable and makes test and train set balanced with respect to target variable.

splitDataset2 = function(dataset, target_col, rep = FALSE) {
        if (rep) {set.seed(123)}
        #check how many rows we have
        rs = ncol(dataset)
        # find all the differet values of the target variable
        all_classes = unique(sbase[,target_col])
        train = data.frame(matrix(ncol = rs, nrow = 0))
        test = data.frame(matrix(ncol = rs, nrow = 0))
        colnames(train) = colnames(dataset)
        colnames(test) = colnames(dataset)
        for(class in all_classes) {
                subase = dataset[dataset[,target_col]==class,]
                train_ind = sample.int(nrow(subase), size = round(0.7*nrow(subase)), replace = FALSE)
                train = rbind(train,subase[train_ind,])
                test = rbind(test,subase[-train_ind,])
                }
        list(train=train,test=test)
        }

Check how our functions works. Let’s start with simple unbalance split:

splittedDS = splitDataset(sbase)
train = splittedDS$train
test = splittedDS$test
splittedDS2 = splitDataset2(sbase,ncol(sbase))
train2 = splittedDS2$train
test2 = splittedDS2$test
nrow(train)
## [1] 3221
nrow(test)
## [1] 1380
nrow(train2)
## [1] 3221
nrow(test2)
## [1] 1380

And check the percentages:

sprintf("Train set (simple split): %d%%", round((nrow(train)/nrow(sbase))*100)) 
## [1] "Train set (simple split): 70%"
sprintf("Test set (simple split): %d%%", round((nrow(test)/nrow(sbase))*100)) 
## [1] "Test set (simple split): 30%"
sprintf("Train set (balanced split): %d%%", round((nrow(train2)/nrow(sbase))*100)) 
## [1] "Train set (balanced split): 70%"
sprintf("Test set (balanced split): %d%%", round((nrow(test2)/nrow(sbase))*100)) 
## [1] "Test set (balanced split): 30%"

And let’s show that the second data set is balanced with respect to target variable. To do that it’s enough to show that the percentage of each class is approximately the same in both sets.

sum(train2[,ncol(train2)])/nrow(train2)
## [1] 0.393977
sum(test2[,ncol(test2)])/nrow(test2)
## [1] 0.3942029

But there is a much easier and cleaner way to do that. We can use caTools package.

library(caTools)
split = sample.split(sbase$target, SplitRatio = 0.7)
train = subset(sbase, split==T)
test = subset(sbase, split==F)

Let’s check that the library function also gives us the balanced test and train sets:

sum(train[,ncol(train)])/nrow(train)
## [1] 0.393977
sum(test[,ncol(test)])/nrow(test)
## [1] 0.3942029

Problem 2: Replace factor variables with numerical

Since the original data set contains only numerical values I will use another data set for this problem. Let’s take classical iris dataset. Our target variable is Species. Let’s take a look at the types of possible values of the target variable:

levels(iris$Species)
## [1] "setosa"     "versicolor" "virginica"

The easiest way to assign number to classes is through levels. And since we need to keep the original column we just copy it before adding levels and name it “Class”:

ds = iris
cnames = colnames(ds)
ds = cbind(ds, ds[,ncol(ds)])
colnames(ds) = c(cnames,"Class")
levels(ds$Class) = c(1:length(levels(iris$Species)))

Check the results by picking 6 random rows from the dataset:

ds[sample(nrow(ds),6),]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species Class
## 17           5.4         3.9          1.3         0.4    setosa     1
## 8            5.0         3.4          1.5         0.2    setosa     1
## 44           5.0         3.5          1.6         0.6    setosa     1
## 47           5.1         3.8          1.6         0.2    setosa     1
## 28           5.2         3.5          1.5         0.2    setosa     1
## 147          6.3         2.5          5.0         1.9 virginica     3

Problem 3: Missing values

For this problem I chose Tennis Major Tournaments Match Statistics.

#url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00300/Tennis-Major-Tournaments-Match#-Statistics.zip'
#temp = tempfile()
#download.file(url,temp, mode="wb")
#unzip(zipfile=temp, exdir = '.')

There are multiple datasets in this zip, but for the sake of this problem we will take only one. Let it be Wimbledon-men-2013.csv:

ds2 = read.csv("Wimbledon-men-2013.csv")

To see the missing values let’s take a look at the data:

summary(ds2)
##         Player1          Player2       Round           Result      
##  N.Djokovic : 7   A.Murray   : 7   Min.   :1.000   Min.   :0.0000  
##  D.Ferrer   : 4   J.Del Potro: 6   1st Qu.:1.000   1st Qu.:0.0000  
##  F.Verdasco : 4   T.Berdych  : 5   Median :1.000   Median :0.0000  
##  K.Anderson : 3   I.Dodig    : 3   Mean   :1.904   Mean   :0.4825  
##  K.Nishikori: 3   J.Chardy   : 3   3rd Qu.:2.000   3rd Qu.:1.0000  
##  L.Kubot    : 3   J.Janowicz : 3   Max.   :7.000   Max.   :1.0000  
##  (Other)    :90   (Other)    :87                                   
##      FNL.1           FNL.2           FSP.1           FSW.1       
##  Min.   :0.000   Min.   :0.000   Min.   :43.00   Min.   : 12.00  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:60.00   1st Qu.: 40.25  
##  Median :2.000   Median :3.000   Median :65.00   Median : 48.00  
##  Mean   :1.719   Mean   :1.825   Mean   :64.25   Mean   : 52.25  
##  3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:68.00   3rd Qu.: 62.75  
##  Max.   :3.000   Max.   :3.000   Max.   :85.00   Max.   :102.00  
##                                                                  
##      SSP.1           SSW.1           ACE.1           DBF.1       
##  Min.   :15.00   Min.   : 3.00   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.:32.00   1st Qu.:15.25   1st Qu.: 5.25   1st Qu.: 2.000  
##  Median :35.00   Median :20.00   Median : 9.00   Median : 3.000  
##  Mean   :35.75   Mean   :20.39   Mean   :11.12   Mean   : 3.478  
##  3rd Qu.:40.00   3rd Qu.:25.00   3rd Qu.:17.00   3rd Qu.: 5.000  
##  Max.   :57.00   Max.   :38.00   Max.   :28.00   Max.   :11.000  
##                                                  NA's   :1       
##      WNR.1           UFE.1           BPC.1            BPW.1      
##  Min.   : 8.00   Min.   : 3.00   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:31.00   1st Qu.:16.25   1st Qu.: 4.250   1st Qu.:1.000  
##  Median :40.00   Median :22.00   Median : 8.000   Median :2.000  
##  Mean   :41.11   Mean   :23.85   Mean   : 8.667   Mean   :2.789  
##  3rd Qu.:53.00   3rd Qu.:29.75   3rd Qu.:12.000   3rd Qu.:4.000  
##  Max.   :80.00   Max.   :60.00   Max.   :24.000   Max.   :9.000  
##                                                                  
##      NPA.1           NPW.1        TPW.1             ST1.1      
##  Min.   : 5.00   Min.   : 3.00   Mode:logical   Min.   :0.000  
##  1st Qu.:22.00   1st Qu.:14.00   NA's:114       1st Qu.:4.000  
##  Median :28.50   Median :20.00                  Median :6.000  
##  Mean   :32.62   Mean   :21.53                  Mean   :4.974  
##  3rd Qu.:41.75   3rd Qu.:28.00                  3rd Qu.:6.000  
##  Max.   :96.00   Max.   :61.00                  Max.   :7.000  
##                                                                
##      ST2.1           ST3.1           ST4.1           ST5.1      
##  Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:3.250  
##  Median :6.000   Median :6.000   Median :6.000   Median :5.500  
##  Mean   :5.202   Mean   :4.965   Mean   :4.795   Mean   :4.778  
##  3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :7.000   Max.   :7.000   Max.   :7.000   Max.   :8.000  
##                                  NA's   :70      NA's   :96     
##      FSP.2           FSW.2            SSP.2           SSW.2      
##  Min.   :47.00   Min.   : 12.00   Min.   :19.00   Min.   : 8.00  
##  1st Qu.:60.00   1st Qu.: 41.00   1st Qu.:31.00   1st Qu.:16.00  
##  Median :64.00   Median : 51.00   Median :36.00   Median :20.00  
##  Mean   :63.96   Mean   : 52.81   Mean   :36.04   Mean   :21.04  
##  3rd Qu.:69.00   3rd Qu.: 63.00   3rd Qu.:40.00   3rd Qu.:25.00  
##  Max.   :81.00   Max.   :101.00   Max.   :53.00   Max.   :47.00  
##                                                                  
##      ACE.2           DBF.2            WNR.2           UFE.2      
##  Min.   : 0.00   Min.   : 0.000   Min.   :16.00   Min.   : 6.00  
##  1st Qu.: 6.00   1st Qu.: 2.000   1st Qu.:29.00   1st Qu.:14.25  
##  Median :10.00   Median : 3.000   Median :39.50   Median :21.00  
##  Mean   :11.32   Mean   : 3.425   Mean   :40.44   Mean   :24.05  
##  3rd Qu.:15.00   3rd Qu.: 4.000   3rd Qu.:49.75   3rd Qu.:30.00  
##  Max.   :36.00   Max.   :13.000   Max.   :95.00   Max.   :74.00  
##                  NA's   :1                                       
##      BPC.2            BPW.2           NPA.2           NPW.2      
##  Min.   : 0.000   Min.   :0.000   Min.   : 4.00   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:21.25   1st Qu.:14.00  
##  Median : 8.000   Median :3.000   Median :30.00   Median :20.00  
##  Mean   : 7.912   Mean   :2.868   Mean   :32.40   Mean   :21.21  
##  3rd Qu.:11.000   3rd Qu.:5.000   3rd Qu.:41.75   3rd Qu.:29.00  
##  Max.   :25.000   Max.   :9.000   Max.   :82.00   Max.   :50.00  
##                                                                  
##   TPW.2             ST1.2           ST2.2           ST3.2      
##  Mode:logical   Min.   :0.000   Min.   :1.000   Min.   :1.000  
##  NA's:114       1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
##                 Median :6.000   Median :6.000   Median :6.000  
##                 Mean   :5.123   Mean   :5.246   Mean   :5.114  
##                 3rd Qu.:6.000   3rd Qu.:6.000   3rd Qu.:6.000  
##                 Max.   :7.000   Max.   :7.000   Max.   :7.000  
##                                                                
##      ST4.2         ST5.2      
##  Min.   :1.0   Min.   :1.000  
##  1st Qu.:3.0   1st Qu.:4.000  
##  Median :6.0   Median :4.000  
##  Mean   :4.5   Mean   :4.778  
##  3rd Qu.:6.0   3rd Qu.:6.000  
##  Max.   :7.0   Max.   :9.000  
##  NA's   :70    NA's   :96

As we can see, TPW.2, ST4.2 and ST5.2 columns have a lot of missing values. Moreover, TPW.1 and TPW.2 attribute has no values at all so it might me a good idea to just get rid of them:

ds2$TPW.2 = NULL
ds2$TPW.1 = NULL

Now let’s actually fill in the missing values. It’s interesting to note two things: * There are more NA values than real values. So normally it’s not a good idea at all to fill in this gaps. * The median is very close to the mean which is a good thing to note because using mean might be reasonable.

ds2$ST5.2[is.na(ds2$ST5.2)]=mean(ds2$ST5.2,na.rm=T)

The same procedure can be carried out for all other columns.

Problem 4. Graphics in R

The final part of this problem set asks to build graphs for the iris dataset.

plot(iris$Sepal.Width,iris$Sepal.Length, main="Scatterplot", xlab="Sepal Width", ylab="Sepal Length", pch=18, col = "royalblue4")

And the histogram:

hist(iris$Petal.Length, col = "royalblue4", main = "Petal Length Histogram",xlab="Petal Length", breaks=30)

Additional notes.

While talking to my coursemates at the university I was asked about solving different problems. For example, what if the original dataset contains some other symbols that denote the missing values. For example, the question mark (“?”). In this case the easiest way to deal with that is to just tell R what our missing values look like right at the moment when we’re loading the data.

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
temp = tempfile()
download.file(url,temp, mode="wb")
auto = read.table(temp, sep=",",na.strings="?")

The next problem encountered by most people was by group imputation. This can be tough but with plyr package it a piece of cake. Let’s first introduce NA to the iris dataset. I chose this one because it’s simple, low-dimensional and it’s easy to observe the results of the operations. So first we need to generate random boolean matrix to indicate where to put NAs. We want NA’s with probability 0.05. Note that we intorduce NA’s just to the attribues:

data(iris)
m = dim(iris)[1]
n = dim(iris)[2]-1
set.seed(123)
selmatrix = matrix(sample(c(F,T), m*n, replace = TRUE, prob=c(0.95,0.05)), m,n)
selmatrix = cbind(selmatrix,rep(F,m))
iris[selmatrix] = NA

Let’s check if we have NA’s now:

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.500   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.300   Median :1.300  
##  Mean   :5.831   Mean   :3.072   Mean   :3.729   Mean   :1.208  
##  3rd Qu.:6.400   3rd Qu.:3.400   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##  NA's   :9       NA's   :8       NA's   :13      NA's   :7      
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##                 
## 

It can be useful to check some of the NAs before and after imputation, so let’s do that:

iris[rowSums(is.na(iris)) > 0,]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 11            NA         3.7          1.5          NA     setosa
## 20            NA         3.8          1.5         0.3     setosa
## 24            NA         3.3          1.7         0.5     setosa
## 27           5.0         3.4           NA         0.4     setosa
## 31            NA         3.1          1.6         0.2     setosa
## 40           5.1         3.4           NA         0.2     setosa
## 41           5.0         3.5          1.3          NA     setosa
## 43           4.4          NA          1.3         0.2     setosa
## 46           4.8         3.0          1.4          NA     setosa
## 47           5.1         3.8           NA         0.2     setosa
## 52           6.4          NA           NA         1.5 versicolor
## 56           5.7         2.8           NA         1.3 versicolor
## 72           6.1          NA          4.0         1.3 versicolor
## 76           6.6         3.0           NA         1.4 versicolor
## 79           6.0         2.9          4.5          NA versicolor
## 80           5.7         2.6           NA         1.0 versicolor
## 87            NA         3.1          4.7         1.5 versicolor
## 91           5.5         2.6           NA         1.2 versicolor
## 96           5.7         3.0          4.2          NA versicolor
## 98           6.2          NA          4.3         1.3 versicolor
## 99           5.1          NA          3.0         1.1 versicolor
## 100          5.7         2.8           NA         1.3 versicolor
## 101          6.3         3.3           NA         2.5  virginica
## 104           NA         2.9          5.6         1.8  virginica
## 114          5.7          NA          5.0         2.0  virginica
## 117          6.5         3.0           NA         1.8  virginica
## 118           NA         3.8          6.7         2.2  virginica
## 126           NA         3.2          6.0         1.8  virginica
## 127          6.2          NA          4.8         1.8  virginica
## 131          7.4         2.8           NA         1.9  virginica
## 134          6.3         2.8           NA         1.5  virginica
## 139           NA         3.0          4.8          NA  virginica
## 143          5.8         2.7          5.1          NA  virginica
## 147          6.3          NA          5.0         1.9  virginica

Let’s take the Petal.Width to test the results of imputation. Let’s look at the rows with missing Petal.Widht and save the boolean vector of these just to remember them.

checkvector = is.na(iris$Petal.Width)
iris[checkvector,]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 11            NA         3.7          1.5          NA     setosa
## 41           5.0         3.5          1.3          NA     setosa
## 46           4.8         3.0          1.4          NA     setosa
## 79           6.0         2.9          4.5          NA versicolor
## 96           5.7         3.0          4.2          NA versicolor
## 139           NA         3.0          4.8          NA  virginica
## 143          5.8         2.7          5.1          NA  virginica

We also want to get the idea of what we expect to see instead of NAs, so let’s take a look at the means by group:

library(plyr)
means = ddply(iris, .(Species), colwise(mean,na.rm=T))
means[,2:5] = round(means[,2:5],1)
means
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          5.0         3.4          1.5         0.2
## 2 versicolor          5.9         2.8          4.3         1.3
## 3  virginica          6.6         3.0          5.5         2.0

Now to impute the missing values we will do two things: * write a procedure to actuly replace the mean in the given column * use ddply to apply this procedure per group (ddply is a part of plyr package and is really awesome)

library(plyr)
replace_by_mean <- function(x) {
        x[is.na(x)]=mean(x,na.rm = T)
        round(x,1)
        }
iris2 = ddply(iris, .(Species), colwise(replace_by_mean))
iris2[checkvector,]
##        Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 11      setosa          5.0         3.7          1.5         0.2
## 41      setosa          5.0         3.5          1.3         0.2
## 46      setosa          4.8         3.0          1.4         0.2
## 79  versicolor          6.0         2.9          4.5         1.3
## 96  versicolor          5.7         3.0          4.2         1.3
## 139  virginica          6.6         3.0          4.8         2.0
## 143  virginica          5.8         2.7          5.1         2.0