tmp_MarkDown

# #Exploratory graphs listing
# quantile()
#     var()
#     hist()
#     plot(density())
#     table()
#     pie(table())
#     barplot(table())
#     cov(); cor()
#     aggregate(Sepal.Length ~ Species, summary, data=iris)
#     
# boxplot(Sepal.Length ~ Species, data=iris, xlab="Species", 
#             ylab="Sepal.Length")
#     with(iris, plot(Sepal.Length, Sepal.Width, col=Species, 
#             pch=as.numeric(Species)))
#     plot(jitter(iris$Sepal.Length), jitter(iris$Sepal.Width))
#     smoothScatter(iris$Sepal.Length, iris$Sepal.Width)
#     pairs(iris)
# 
# library(scatterplot3d)
# scatterplot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width) #1
# library(rgl)
# plot3d(iris$Petal.Width, iris$Sepal.Length, iris$Sepal.Width) # 2
# distMatrix <- as.matrix(dist(iris[,1:4])); heatmap(distMatrix) # 3
# library(lattice)
# levelplot(Petal.Width~Sepal.Length*Sepal.Width, iris, cuts=9,
#           col.regions=grey.colors(10)[10:1]) # 4
# filled.contour(volcano, color=terrain.colors, asp=1,
#         plot.axes=contour(volcano, add=T)) #5
# persp(volcano, theta=25, phi=30, expand=0.5, col="lightblue") #6
# library(MASS)
# parcoord(iris[1:4], col=iris$Species) #7
# library(lattice)
# parallelplot(~iris[1:4] | Species, data=iris) # 8
# library(ggplot2)
# qplot(Sepal.Length, Sepal.Width, data=iris, facets=Species ~.)
# 
# pdf("myPlot.pdf") # save charts
# 
# #randomForest() has two limitations. First, it cannot handle data with 
# #missing values, and users have to impute data before feeding them into 
# #the function. Second, there is a limit of 32 to the maximum number of 
# #levels of each categorical attribute. Attributes with more than 32 levels 
# #have to be transformed first before using randomForest().
# 
# 
# #OUTLIER DETECTION
# #First, it demonstrates univariate outlier detection. 
# #After that, an example of outlier detection with LOF (Local Outlier Factor) is given, 
# #followed by examples on outlier detection by clustering. 
# #Finally, it demonstrates outlier detection from time series data.
# 
# # outliers
# boxplot.stats(x)$out
# 
# #Function lofactor() calculates local outlier factors using the LOF algorithm, 
# #and it is available in packages DMwR [Torgo, 2010] and dprep. 
# library(DMwR)
# # remove "Species", which is a categorical column
# iris2 <- iris[,1:4]
# outlier.scores <- lofactor(iris2, k=5)
# plot(density(outlier.scores))
# # pick top 5 as outliers
# outliers <- order(outlier.scores, decreasing=T)[1:5]
# # who are outliers
# print(outliers)
# 
# > print(iris2[outliers,])
# > n <- nrow(iris2)
# > labels <- 1:n
# > labels[-outliers] <- "."
# > biplot(prcomp(iris2), cex=.8, xlabs=labels)
# In the above code, prcomp() performs a principal component analysis, 
# and biplot() plots the data with its first two principal components.
# In Figure 7.5, the x- and y-axis are respectively the first and 
# second principal components, the arrows show the original columns 
# (variables), and the five outliers are labeled with their row numbers.
# 
# pch <- rep(".", n)
# > pch[outliers] <- "+"
# > col <- rep("black", n)
# > col[outliers] <- "red"
# > pairs(iris2, pch=pch, col=col)
# 
# 
# 
# Package Rlof [Hu et al., 2015] provides function lof(), 
# a parallel implementation of the LOF algorithm.
# 
# 
# > library(Rlof)
# > outlier.scores <- lof(iris2, k=5)
# > # try with different number of neighbors (k = 5,6,7,8,9 and 10)
#     > outlier.scores <- lof(iris2, k=c(5:10))
# 
# 
# Outlier Detection by Clustering
# > # remove species from the data to cluster
#     > iris2 <- iris[,1:4]
# > kmeans.result <- kmeans(iris2, centers=3)
# > # cluster centers
#     > kmeans.result$centers
# > # cluster IDs
#     > kmeans.result$cluster
# > # calculate distances between objects and cluster centers
#     > centers <- kmeans.result$centers[kmeans.result$cluster, ]
# > distances <- sqrt(rowSums((iris2 - centers)^2))
# > # pick top 5 largest distances
#     > outliers <- order(distances, decreasing=T)[1:5]
# > # who are outliers
#     > print(outliers)
# 
# print(iris2[outliers,])
# > # plot clusters
#     > plot(iris2[,c("Sepal.Length", "Sepal.Width")], pch="o",
#            + col=kmeans.result$cluster, cex=0.3)
# > # plot cluster centers
#     > points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col=1:3,
#              + pch=8, cex=1.5)
# > # plot outliers
#     > points(iris2[outliers, c("Sepal.Length", "Sepal.Width")], pch="+", col=4, cex=1.5)
# 
# 
# Outlier Detection from Time Series
# 
# > # use robust fitting
#     > f <- stl(AirPassengers, "periodic", robust=TRUE)
# > (outliers <- which(f$weights<1e-8))
# > # set layout
#     > op <- par(mar=c(0, 4, 0, 3), oma=c(5, 0, 4, 0), mfcol=c(4, 1))
# > plot(f, set.pars=NULL)
# > sts <- f$time.series
# > # plot outliers
#     > points(time(sts)[outliers], 0.8*sts[,"remainder"][outliers], pch="x", col="red")
# > par(op) # reset layout
# The LOF algorithm is good at detecting local outliers, but it works on
# numeric data only. Package Rlof relies on the multicore package, which does
# not work under Windows. A fast and scalable outlier detection strategy for
# categorical data is the Attribute Value Frequency (AVF) algorithm.
# 
# Some other R packages for outlier detection are:
# Package extremevalues [van der Loo, 2010]: univariate outlier detection;
# Package mvoutlier [Filzmoser and Gschwandtner, 2015]: multivariate outlier detection based on robust methods; and
# Package outliers [Komsta, 2011]: tests for outliers.