# #Exploratory graphs listing
# quantile()
# var()
# hist()
# plot(density())
# table()
# pie(table())
# barplot(table())
# cov(); cor()
# aggregate(Sepal.Length ~ Species, summary, data=iris)
#
# boxplot(Sepal.Length ~ Species, data=iris, xlab="Species",
# ylab="Sepal.Length")
# with(iris, plot(Sepal.Length, Sepal.Width, col=Species,
# pch=as.numeric(Species)))
# plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width))
# smoothScatter(iris$Sepal.Length, iris$Sepal.Width)
# pairs(iris)
#
# library(scatterplot3d)
# scatterplot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width) #1
# library(rgl)
# plot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width) # 2
# distMatrix <- as.matrix(dist(iris[,1:4])); heatmap(distMatrix) # 3
# library(lattice)
# levelplot(Petal.Width~Sepal.Length*Sepal.Width, iris, cuts=9,
# col.regions=grey.colors(10)[10:1]) # 4
# filled.contour(volcano, color=terrain.colors, asp=1,
# plot.axes=contour(volcano, add=T)) #5
# persp(volcano, theta=25, phi=30, expand=0.5, col="lightblue") #6
# library(MASS)
# parcoord(iris[1:4], col=iris$Species) #7
# library(lattice)
# parallelplot(~iris[1:4] | Species, data=iris) # 8
# library(ggplot2)
# qplot(Sepal.Length, Sepal.Width, data=iris, facets=Species ~.)
#
# pdf("myPlot.pdf") # save charts to a PDF file; call dev.off() afterwards to close it
#
# #Function randomForest() has two limitations. First, it cannot handle
# #data with missing values, and users have to impute data before feeding
# #them into the function. Second, there is a limit of 32 to the maximum
# #number of levels of each categorical attribute. Attributes with more
# #than 32 levels have to be transformed first before using randomForest().
#
#
# #OUTLIER DETECTION
# #At first, it demonstrates univariate outlier detection.
# #After that, an example of outlier detection with LOF (Local Outlier Factor) is given,
# #followed by examples on outlier detection by clustering.
# #At last, it demonstrates outlier detection from time series data.
#
# # outliers
# boxplot.stats(x)$out
#
# #Function lofactor() calculates local outlier factors using the LOF algorithm,
# #and it is available in packages DMwR [Torgo, 2010] and dprep.
# library(DMwR)
# # remove "Species", which is a categorical column
# iris2 <- iris[,1:4]
# outlier.scores <- lofactor(iris2, k=5)
# plot(density(outlier.scores))
# # pick top 5 as outliers
# outliers <- order(outlier.scores, decreasing=T)[1:5]
# # who are outliers
# print(outliers)
#
# > print(iris2[outliers,])
# > n <- nrow(iris2)
# > labels <- 1:n
# > labels[-outliers] <- "."
# > biplot(prcomp(iris2), cex=.8, xlabs=labels)
# In the above code, prcomp() performs a principal component analysis,
# and biplot() plots the data with its first two principal components.
# In Figure 7.5, the x- and y-axis are respectively the first and
# second principal components, the arrows show the original columns
# (variables), and the five outliers are labeled with their row numbers.
#
# pch <- rep(".", n)
# > pch[outliers] <- "+"
# > col <- rep("black", n)
# > col[outliers] <- "red"
# > pairs(iris2, pch=pch, col=col)
#
#
#
# Package Rlof [Hu et al., 2015] provides function lof(),
# a parallel implementation of the LOF algorithm.
#
#
# > library(Rlof)
# > outlier.scores <- lof(iris2, k=5)
# > # try with different number of neighbors (k = 5,6,7,8,9 and 10)
# > outlier.scores <- lof(iris2, k=c(5:10))
#
#
# Outlier Detection by Clustering
# > # remove species from the data to cluster
# > iris2 <- iris[,1:4]
# > kmeans.result <- kmeans(iris2, centers=3)
# > # cluster centers
# > kmeans.result$centers
# > # cluster IDs
# > kmeans.result$cluster
# > # calculate distances between objects and cluster centers
# > centers <- kmeans.result$centers[kmeans.result$cluster, ]
# > distances <- sqrt(rowSums((iris2 - centers)^2))
# > # pick top 5 largest distances
# > outliers <- order(distances, decreasing=T)[1:5]
# > # who are outliers
# > print(outliers)
#
# print(iris2[outliers,])
# > # plot clusters
# > plot(iris2[,c("Sepal.Length", "Sepal.Width")], pch="o",
# + col=kmeans.result$cluster, cex=0.3)
# > # plot cluster centers
# > points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col=1:3,
# + pch=8, cex=1.5)
# > # plot outliers
# > points(iris2[outliers, c("Sepal.Length", "Sepal.Width")], pch="+", col=4, cex=1.5)
#
#
# Outlier Detection from Time Series
#
# > # use robust fitting
# > f <- stl(AirPassengers, "periodic", robust=TRUE)
# > (outliers <- which(f$weights<1e-8))
# > # set layout
# > op <- par(mar=c(0, 4, 0, 3), oma=c(5, 0, 4, 0), mfcol=c(4, 1))
# > plot(f, set.pars=NULL)
# > sts <- f$time.series
# > # plot outliers
# > points(time(sts)[outliers], 0.8*sts[,"remainder"][outliers], pch="x", col="red")
# > par(op) # reset layout
# The LOF algorithm is good at detecting local outliers, but it works on
# numeric data only. Package Rlof relies on the multicore package, which
# does not work under Windows. A fast and scalable outlier detection
# strategy for categorical data is the Attribute Value Frequency (AVF)
# algorithm.
#
# Some other R packages for outlier detection are:
# Package extremevalues [van der Loo, 2010]: univariate outlier detection;
# Package mvoutlier [Filzmoser and Gschwandtner, 2015]: multivariate outlier detection based on robust methods; and
# Package outliers [Komsta, 2011]: tests for outliers.