Ch3Exercise3-01.R

# Applied Predictive Modeling Chapter 3: Exercise 3.1
#
# This web page provides answers to Exercise 3.1 of Applied Predictive Modeling.
# That exercise says:
#
# "The UC Irvine Machine Learning Repository contains a data set related
# to glass identification.  The data consists of 214 glass samples labled
# as one of seven glass categories.  There are nine predictors, including
# refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca,
# Ba, and Fe."  
#
#
# EXERCISE 3.1(a) DATA VISUALISATIONS
# Using visualizations, explore the predictor variables to
# understand their distributions as well as the relationships between 
# predictors.
# 
# To address 3.1(a), I load the some programs and data and then create graphics.  
#
# Step 1: I load the libraries whose programs are used in the analysis
suppressMessages(library(mlbench))      # contains data sets used in the analysis
suppressMessages(library(caret))        # APM pre-procoessing program and others
suppressMessages(library(dplyr))        # used for some data manipulation
suppressMessages(library(e1071))        # for skewness computation
suppressMessages(library(tidyr))        # reshape a correlation matrix for use in heat map
suppressMessages(library(knitr))        # used to create output for web
suppressMessages(library(GGally))       # used for ggpairs correlation graph graph
#
# Step2: Load the GLASS dataset
#
data(Glass)
#
## 
#
# Step 3: Create a histogram of the Y variable (i.e., Type of Glass)
#
ggplot(Glass,aes(x=Type))+
  geom_histogram(fill="light blue",colour="black")+
  ggtitle("Number of Observations by Type of Glass")

#
# The above analysis informs me that most of the glass Types are either Type 1 or Type 2.
#
# Step 4(a): I explore the X variables by regarding distributions and outliers,
# first by using box plots and then by using histograms.  The following  analysis
# loops through each X variable using "plot," which defaults in this case to a box plot.  
#
# Note: The boxplot analysis also addresses 3.1(b) (below) because the box & whiskers
# shows outliers.
#
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  plot(Glass$Type,Glass[,i],col="light blue",
     xlab="Type of Glass",ylab=names(Glass)[i],main=paste("Series Chart",i,": Box Plot of Chemical Elements",
          names(Glass)[i],"by Type of Glass"))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

# 
# Step 4(b): Here is the same exercise using ggplot with geom_histogram.  I include
# some accompanying notes that describe how I modified the usual (interactive) ggplot
# commands due to the fact that ggplot in this example is inside a function (a for loop):
# Modification 1. I used aes_string instead of aes when aes is inside a for loop.
# Modification 2. I embedded the ggplot function inside of a print() function.  This is 
# necessary to get ggplot to actually plot when it is inside another function (such as the
# for loop).  
#
elem<-names(Glass)
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  print(ggplot(Glass,aes_string(x=elem[10],y=elem[i]))+
   geom_boxplot(fill="light blue")+
   ggtitle(paste("Series Chart",i,": BoxPlot of Trace Chemical Element",elem[i],"\n by Type of Glass"))+
   xlab("Type of Glass")) # in a for loop you must explicitly print ggplot
  }

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

#
# Here, I maintain the work space:
rm(elem,i)
#
# Step 5: In the next exploratory exercise, I compute mean and t.stat of each X by Y.  I created
# files for each component (mean, var, n).  There may be a way to improve the
# efficiency of this analysis.  I also look at the variability of each of the X variables.
#
stat.bytype<-Glass%>%
  group_by(Type)%>%
  summarise_each(funs(n=n(),mean=mean,sd=sd,skew=skewness))%>%
  select(1:2,11:37)
#
# Notice that summarise_each applies sapply(summarise). Must preface include funs(...).
#
# Step 6: I also want to see whether the number of non-zeros varies by Type for each of the chemical elements. 
# I compute the percentage of records (by Glass Type) that have non-zero amounts by chemical element.
#
nz1<-data.frame(sapply(Glass[,1:9],function(x)ifelse(x!=0,1,0)))
nz1$Type<-Glass[,10]
nonzeros.bytype<-nz1%>%
  group_by(Type)%>%
  summarise_each(funs(mean))
nonzeros.bytype

## Source: local data frame [6 x 10]
## 
##     Type    RI    Na        Mg    Al    Si         K    Ca         Ba
##   (fctr) (dbl) (dbl)     (dbl) (dbl) (dbl)     (dbl) (dbl)      (dbl)
## 1      1     1     1 1.0000000     1     1 0.9857143     1 0.04285714
## 2      2     1     1 0.8815789     1     1 0.9605263     1 0.07894737
## 3      3     1     1 1.0000000     1     1 0.9411765     1 0.05882353
## 4      5     1     1 0.4615385     1     1 1.0000000     1 0.15384615
## 5      6     1     1 0.6666667     1     1 0.0000000     1 0.00000000
## 6      7     1     1 0.2068966     1     1 0.4482759     1 0.89655172
## Variables not shown: Fe (dbl)

# There must be a more elegant way!  
# Some tentative conclusions:
#   If no Mg, maybe Type 7, since only 21% of Type 7 Glass had Mg.
#   If no K, maybe type 6 or 7, since 0% of Type 6 and 45% of Type 7 had non-zeros for K.
#   If no Ba, maybe NOT type 7, since 90% of Type 7 Glass had Ba.
#   If no Fe, maybe Type 6, since NOT type 2, since 42% of Type 2 had Fe.  
# Then, you can think of combinations: 
#   If you have no Mg and no Fe, but you have some Ba, then you may have Type 7.
# 
rm(nz1)
#
# Plot the results of the non-zeros analysis.
nz2<-gather(nonzeros.bytype,elem,value,-Type)
#
ggplot(nz2,aes(x=Type,y=value,fill=elem,colour=elem))+
  geom_point(size=8,position=position_jitter(width=0.4, height=.05))+
  labs(xlab("Type of Glass"))+
  labs(ylab("Proportion of Observations with Non-Zero Values"))+
  ggtitle("Percent of Non-Zero Values of Each Element \n (By Type of Glass)")+
  geom_hline(yintercept=.25,colour="dark grey",size=1)+
  geom_hline(yintercept=.75,colour="dark grey",size=1)

# The darker grey lineS shows visually aids the identification of those elements (by Type) for which 
# fewer than 25% of the data is non-zero and for which more than 75% of the data are non-zero.
#
# The chart shows that if you have little or no Ba, you might be anything but Type 7. 
#
# No Mg: maybe Type 7.
# No Mg and No Fe: maybe Type 7. 
# Mg but no Ba: probably not Type 7. 
#
rm(nz2,nonzeros.bytype)
#
# Step 6(b).  I want to see if the variables with numerous zeros have "near zero variance."  If so, then they will be
# unlikely to help classify/predict the type of glass.  I use the filtering functions explained on p. 55.
nearZeroVar(Glass)

## integer(0)

#
# In this case, none of the variables should be removed due to insufficient variability.
#
#
# Step 7(a): In the next exploratory anaysis, I examine graphically the correlations between
# explanatory variables.
# I do this graphically using the pairs() function, which is part of the {base} package.  
pairs(Glass[,1:9],col="black",lower.panel=panel.smooth,pch=20)

#
# Step 7(b): The results are not very appealing, so I do the same exercise using ggpairs, which is part
# of the GGally package.  The GGally package provides some canned programs that call on ggplot2.
#
ggpairs(Glass,columns=1:9,
        title="XY Plot of Trace Elements",
        upper=list(params=list(size=4)))+
  theme_grey(base_size=8)

#
# You can see from the above plot that Mg has a lot of zeros, as do Fe and Ba.  
# It's not clear whether centering/scaling is appropriate given the truncation of these X variables.
#
# Step 7:(c) I do the same correlation exercise using another function from GGally called ggscatmat()
# ggscatmat() shows the pairwise scatter plots of Xs in lower triangle,
# a density plot of the individual Xs on diagonal (i.e. a smoothed histogram 
# of one of the chemical elements) and it reports the pairwise correlation coefficients 
# in upper triangle.  (At this point, I do not know how to adjust the font size
# of the correlation coefficients in the upper triangle.)
#
ggscatmat(Glass,columns=1:9,color=NULL)+
  theme_grey(base_size=10)+
  geom_point(colour="lightseagreen",na.rm=TRUE)+ #na.rm=remove records for which the data are missing
  labs(title="XY Plot of Trace Elements \n (with Density and Correlation)")+
  theme(plot.title = element_text(size = rel(1.5)))+
  labs(xlab(" "))+
  labs(ylab(" "))

#
# It is not clear to me why I had to put the xlab() and ylab() functions on separate lines.
#
# Step 7(d): I compute the Correlation matrix of X variables & associated t-statistics
# Here is the matrix of correlation coefficients:
#
glass.cor<-cor(Glass[,1:9])
glass.cordf<-data.frame(glass.cor) # converts corr matrix to a df
# 
# This computes the correlation coefficient t-stat:
glass.cort<-glass.cor/sqrt((1-glass.cor^2)/(nrow(Glass)-2))
# 
# To improve readability, I hide the upper triangles and print the correlation coefficients and their
# related t-statistics.
index<-upper.tri(glass.cor)
cor.g<-round(glass.cor,4)
cor.g[index]<-""
cor.g<-data.frame(cor.g)
cor.gt<-round(glass.cort,4)
cor.gt[index]<-""
cor.gt<-data.frame(cor.gt)
cor.g

##         RI      Na      Mg      Al      Si       K      Ca      Ba Fe
## RI       1                                                           
## Na -0.1919       1                                                   
## Mg -0.1223 -0.2737       1                                           
## Al -0.4073  0.1568 -0.4818       1                                   
## Si -0.5421 -0.0698 -0.1659 -0.0055       1                           
## K  -0.2898 -0.2661  0.0054   0.326 -0.1933       1                   
## Ca  0.8104 -0.2754 -0.4438 -0.2596 -0.2087 -0.3178       1           
## Ba  -4e-04  0.3266 -0.4923  0.4794 -0.1022 -0.0426 -0.1128       1   
## Fe   0.143 -0.2413  0.0831 -0.0744 -0.0942 -0.0077   0.125 -0.0587  1

cor.gt

##         RI      Na      Mg      Al      Si       K      Ca     Ba  Fe
## RI     Inf                                                           
## Na -2.8468     Inf                                                   
## Mg -1.7938 -4.1439     Inf                                           
## Al -6.4939  2.3115 -8.0055     Inf                                   
## Si -9.3919 -1.0189 -2.4499 -0.0804     Inf                           
## K  -4.4093 -4.0192  0.0786  5.0202 -2.8691     Inf                   
## Ca 20.1403 -4.1719 -7.2098 -3.9139 -3.1076 -4.8809     Inf           
## Ba -0.0056  5.0313 -8.2342  7.9538 -1.4952 -0.6211 -1.6536    Inf    
## Fe  2.1039 -3.6211  1.2136 -1.0863 -1.3777 -0.1124  1.8339 -0.856 Inf

#
# Remove temp datasets
rm(glass.cor,glass.cort,index)
#
# Step 7(e): Here is yet another visualization of correlation: a Heatmap using ggplot.
# This is an attempt to replicate Figure 3.10 in the book.
#
# To create the heatmap, I started with the matrix of correlation coefficients.
# THen, I used the gather() function of the tidyr package to convert the
# correlation table into a format that ggplot could use.  The new
# table has a row for each observation and a column for each variable.  This
# is sometimes called the "long" format.
# Finally, I plotted the heatmap.
#
glass.cor2<-gather(glass.cordf) # This creates the long format.
glass.cor2$names<- names(glass.cordf) # I add a names column.  
glass.cor2$names<-factor(glass.cor2$names,
  levels=rev(names(glass.cordf))) #this controls *order* of y-axis in plot
ggplot(glass.cor2,aes(x=key,y=names))+
  geom_tile(aes(fill = value),colour = "white")+
  scale_fill_gradient2(limits=c(-1,1),low = "red",high = "darkblue")+
  labs(list(title="Heatmap of Correlations of Independent Variables",x="Trace Element",y="Trace Element"))+
  theme(plot.title = element_text(size = rel(1.5)))

#
# The heatmap shows that RI and Ca are positively correlated (0.81) and that RI and Si are negatively correlated,
# (-0.54) as is Ba and Mg (-0.49) and Al and Mg (-.048).
# Remove unneeded data
rm(glass.cordf,glass.cor2)
# 
# By using the ggplot option called scale_fill_gradient2 instead of 
# scale_fill_gradient, I ensured that correlations around 0.00 were white.
# In other words, the option centers whiteness at 0.00.
#
# 
# Step 8: Here are the scatterplots of the five pairs whose correlation coefficients
# are at least abs(0.48).  (Separate colors by Type of Glass):
#
x.var<-c("RI","RI","Mg","Mg","Al")
y.var<-c("Si","Ca","Al","Ba","Ba")
by.var<-"Type"
for(i in 1:length(x.var)){
  x2<-x.var[i]
  y2<-y.var[i]
  readline(prompt="Hit return to see next plot>> ")
  x.pos<-diff(range(Glass[,x2]))*.25+min(Glass[,x2])
  y.pos<-diff(range(Glass[,y2]))*.85+min(Glass[,y2])
  coef<-coef(lm(Glass[,y2]~Glass[,x2]))  
  print(ggplot(Glass,aes_string(x=x2,y=y2,colour=by.var,fill=by.var))+
    geom_point()+
    labs(title=paste("Series Chart",i,": Scatterplot of",x2,"and",y2,"\n (Color by",by.var,"of Glass)"))+
    geom_abline(intercept=coef[1],slope=coef[2],colour="darkgrey")+
    annotate("text",x=x.pos,y=y.pos,label=paste("Correlation=",
          round(cor(Glass[x2],Glass[y2]),3)))
    )
  }

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

  rm(by.var,coef,x.pos,x.var,y.pos,y.var,x2,y2,i)
#
# The graphs show that some of the correlations are heavily influenced by the large numbers of
# zero values in one or the other (or both) of the variables.  
#
# EXERCISE 3.1(B): OUTLIERS ANALYSIS
# NOTE: SEE ABOVE FOR BOX PLOT, WHICH SHOWS OUTLIERS
# ANOTHER ANALYSIS TO IDENTIFY OUTLIERS: 
#
#
# In this exercise, I plot the histograms of all of the Xs to visualize skewness
# I then compute and display skewness on the histograms.  This is the
# same info that is on the diagonal of the XY plots.
# (Note: No skew=0.  -1<=skew<=1 is prob ok. skew<-1 or >1 is highly 
# skewed.)
#
# The annotation in the chart presents the skewness coefficient.  The ifelse 
# statement in the code places the annotation to the right if positively 
# skewed and to the left if negatively skewed.  I have not figured out how to vary 
# the vertical distance, which is hard-wired at a count of 25.
#
skew.g<-sapply(Glass[,1:9],function(x){round(skewness(x),4)})
elem<-names(Glass)
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  quad.pos<-diff(range(Glass[,i]))*.75+min(Glass[,i])
  quad.neg<-diff(range(Glass[,i]))*.25+min(Glass[,i])
  x.g<-ifelse(skew.g[i]>=0,quad.pos,quad.neg) # use midpoint to compute offset for annotation
  binw.g<-diff(range(Glass[,i]))/20 # compute bin width
  suppressWarnings(print(
    ggplot(Glass,aes_string(x=elem[i],fill="Type"))+
      geom_histogram(stat="bin",binwidth=binw.g)+
      ggtitle(paste("Series Chart",i,": Histogram of Trace Chemical",elem[i],"\n(Color by Type of Glass)"))+
      annotate("text",label=paste("Skewness=",skew.g[i]),x=x.g,y=25)
  )
  )
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

rm(quad.neg,quad.pos,skew.g,x.g,elem,binw.g,i)
#
# Warning messages in the above have been suppressed.  To see the justification, see:
# http://stackoverflow.com/questions/14476961/why-do-i-get-position-dodge-requires-constant-width-even-though-widths-are-con
# 
#
# Instead of creating histograms that stack by Type of Glass, we can facet wrap, and display each chart
# separately.  In this case, it is easier to see how the counts of the trace chemical elements differ by type 
# of glass.
#
skew.g<-sapply(Glass[,1:9],function(x){round(skewness(x),3)})
elem<-names(Glass)
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  quad.pos<-diff(range(Glass[,i]))*.75+min(Glass[,i])
  quad.neg<-diff(range(Glass[,i]))*.25+min(Glass[,i])
  x.g<-ifelse(skew.g[i]>=0,quad.pos,quad.neg) # use midpoint to compute offset for annotation
  binw.g<-diff(range(Glass[,i]))/20 # compute bin width
  suppressWarnings(print(
    ggplot(Glass,aes_string(x=elem[i]))+
      geom_histogram(data=Glass,stat="bin",
                     binwidth=binw.g,colour="black",fill="light blue")+
      facet_wrap(~Type)+
      ggtitle(paste("Series Chart",i,": Histogram of Trace Element",elem[i],"\n (by Type of Glass)"))+
      theme(strip.text.x=element_text(color="blue",face="bold"))
  ))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

rm(binw.g,elem,i,quad.neg,quad.pos,x.g,skew.g)
#
# The faceted graphs really highlight how certain trace elements are not found (i.e., many observations
# of zero) for certain types of glass.  For example:
# Chart 7 shows Mg has numerous zeros for Type 7.
# Chart 8 shows Ba has numerous zeros possibly for Type 7.
# Chart 9 shows Fe has non zeros indicating possibly 1 or 2 (though Fe has zeros for all types)
#
# EXERCISE 3.1(C): POSSIBLE TRANSORMATIONS
# Step 1: Since most of the Xs are highly skewed, I want to center/scale/de-skew.
# I used caret's preProcess function to make the transformations.  I performed this on all variables, even
# though the variables with numerous zero values are not good candidates for centering (the logic of which
# escapes me).
# 
# BoxCox determines the transformation that would best normalize the 
# distribution.  Centering and scaling transform the variable to have
# a mean zero and standard deviation of 1.0. 
#
trans<-preProcess(Glass[,1:9],method=c("BoxCox","center","scale"))
# Trans is an object that has the resuls of the computations: it must be
# applied to the raw data to create transformed data.
# Apply the above transformation equation to the raw data
glass.trans<-predict(trans,Glass[,1:9])
glass.trans$Type<-Glass[,10]
#
# Step 2: Let's look at the histograms of the transformed data
#
skew.g<-sapply(glass.trans[,1:9],function(x){round(skewness(x),4)})
elem<-names(Glass)
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  quad.pos<-diff(range(glass.trans[,i]))*.75+min(glass.trans[,i])
  quad.neg<-diff(range(glass.trans[,i]))*.25+min(glass.trans[,i])
  x.g<-ifelse(skew.g[i]>=0,quad.pos,quad.neg) # use midpoint to compute offset for annotation
  binw.g<-diff(range(glass.trans[,i]))/20 # compute bin width
  suppressWarnings(print(
    ggplot(glass.trans,aes_string(x=elem[i],fill="Type"))+
      geom_histogram(stat="bin",binwidth=binw.g)+
      ggtitle(paste("Series Chart",i,": Histogram of Transformed Trace Chemical Element",elem[i],"\n(Color by Type of Glass)"))+
      annotate("text",label=paste("Skewness=",skew.g[i]),x=x.g,y=25)
  ))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

rm(skew.g,elem,i,quad.pos,quad.neg,x.g,binw.g)
#
# The transformation results for Ba and Fe (Charts )
# Step 3: Instead of stacking by type of glass, we can look at the transformed data by Type in individual
# windows (or "facets").
#
skew.g<-sapply(glass.trans[,1:9],function(x){round(skewness(x),3)})
elem<-names(Glass)
for(i in 1:9){
  readline(prompt="Hit return to see next plot>> ")
  quad.pos<-diff(range(glass.trans[,i]))*.75+min(glass.trans[,i])
  quad.neg<-diff(range(glass.trans[,i]))*.25+min(glass.trans[,i])
  x.g<-ifelse(skew.g[i]>=0,quad.pos,quad.neg) # use midpoint to compute offset for annotation
  binw.g<-diff(range(glass.trans[,i]))/20 # compute bin width
  suppressWarnings(print(
    ggplot(glass.trans,aes_string(x=elem[i]))+
      geom_histogram(data=glass.trans,stat="bin",
          binwidth=binw.g,colour="black",fill="light yellow")+
      facet_wrap(~Type)+
      ggtitle(paste("Series Chart",i,": Histogram of Transformed Trace Chemical Element",elem[i],"\n (by Type of Glass)"))
      ))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

rm(i,skew.g,quad.pos,quad.neg,x.g,binw.g)
#
# For example, Chart 3 (Mg) shows the transformed distribution has a cluster of zeros near -2 for Type 7.
# This has less intuition than an untransformed cluster at zero.
#
# Step 4: Here's a comparison of the untransformed and transformed data by Type.  Other than 
# centering the data at a mean of zero, there does not appear to be an obvious normalization of the
# variables. 
# 
for(i in 1:9){
  dat<-Glass%>%
    select(i,10)
  dat$trans<-glass.trans[,i]
  dat<-gather(dat,elem,value,-Type) #this is from tidyr.  It "melts" the data frame so that it can be more easily subsetted.
  readline(prompt="Hit return to see next plot>> ")
  binw.g<-diff(range(dat[,3]))/20 # compute bin width
  suppressWarnings(print(
    ggplot(dat,aes(x=value))+
      geom_histogram(data=subset(dat,elem==names(Glass)[i]),
                    binwidth=binw.g,colour="black",fill="blue",alpha=.2)+
      geom_histogram(data=subset(dat,elem=="trans"),
                    binwidth=binw.g,colour="black",fill="yellow",alpha=.3)+
      facet_wrap(~Type)+
      ggtitle(paste("Histogram of Actual (Blue) and Transformed (Yellow) Trace Chemical Element",
                    elem[i],"\n (by Type of Glass)"))
  ))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

# Step 5: And here is a comparison of the untransformed and transformed data overall.
# To do this, I removed the line "facet_wrap(~Type)" from the above code.
#
for(i in 1:9){
  dat<-Glass%>%
    select(i,10)
  dat$trans<-glass.trans[,i]
  dat<-gather(dat,elem,value,-Type) #this is from tidyr.  It "melts" the data frame so that it can be more easily subsetted.
    readline(prompt="Hit return to see next plot>> ")
  binw.g<-diff(range(dat[,3]))/50 # compute bin width
  suppressWarnings(print(
    ggplot(dat,aes(x=value))+
      geom_histogram(data=subset(dat,elem==names(Glass)[i]),
                     binwidth=binw.g,colour="black",fill="blue",alpha=.2)+
      geom_histogram(data=subset(dat,elem=="trans"),
                     binwidth=binw.g,colour="black",fill="yellow",alpha=.3)+
      ggtitle(paste("Series Chart",i,": Histogram of Actual (Blue) and Transformed (Yellow) Trace Chemical Element",
                    elem[i]))
  ))
}

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

## Hit return to see next plot>>

# It seems that the following are meaningful candidates for centering/scaling:
# Series Chart 2 (Na)
# Chart 4 (Al)
# Chart 5 (Si)
# Chart 7 (Ca)
rm(binw.g,elem,i,dat)
rm(trans,cor.gt,cor.g)
#
#
# EXERCISE 3-1(C) Other relevant transformations of one or more predictors that
# might improve the classification model.
#
# (In process)

Ch3Exercise3-01.R

FPAMPUSH

Tue Oct 27 10:28:47 2015