#3.1 The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

#(a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

# Attach what this section needs so it runs from a clean session:
# Glass ships with mlbench, pivot_longer() comes from tidyr and
# ggplot()/geom_*()/facet_wrap() come from ggplot2.
library(mlbench)
library(tidyr)
library(ggplot2)
data(Glass)

# Predictors only: drop the class label by name instead of by position, so
# the plots and correlations below never include the outcome.
glass_x <- Glass[, names(Glass) != "Type"]

# Scatterplot matrix of every pair of predictors.
pairs(glass_x)

#visual distributions

# One histogram per predictor; free scales because the predictors are
# measured on very different ranges.
Glass |>
  pivot_longer(-Type) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20, fill = "blue", color = "white") +
  facet_wrap(~ name, scales = "free") +
  theme_minimal()

#boxplots

# One boxplot per predictor to make extreme values easy to spot.
Glass |>
  pivot_longer(-Type) |>
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(~ name, scales = "free") +
  theme_minimal()

#correlation

# Pairwise Pearson correlations between the nine predictors.
cor(glass_x)
##               RI          Na           Mg          Al          Si            K
## RI  1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790  1.00000000 -0.273731961  0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196  1.000000000 -0.48179851 -0.16592672  0.005395667
## Al -0.4073260341  0.15679367 -0.481798509  1.00000000 -0.00552372  0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372  1.00000000 -0.193330854
## K  -0.2898327111 -0.26608650  0.005395667  0.32595845 -0.19333085  1.000000000
## Ca  0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189  0.32660288 -0.492262118  0.47940390 -0.10215131 -0.042618059
## Fe  0.1430096093 -0.24134641  0.083059529 -0.07440215 -0.09420073 -0.007719049
##            Ca            Ba           Fe
## RI  0.8104027 -0.0003860189  0.143009609
## Na -0.2754425  0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178  0.083059529
## Al -0.2595920  0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K  -0.3178362 -0.0426180594 -0.007719049
## Ca  1.0000000 -0.1128409671  0.124968219
## Ba -0.1128410  1.0000000000 -0.058691755
## Fe  0.1249682 -0.0586917554  1.000000000

#(b) Do there appear to be any outliers in the data? Are any predictors skewed?

#Several predictors are skewed. Ba and Fe are the most extreme: most of their observations are at or near zero, with a few much larger values. K is also right-skewed. The isolated large values in these distributions are potential outliers.

#(c) Are there any relevant transformations of one or more predictors that might improve the classification model?

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Keep the nine numeric predictors by dropping the class label column.
glass_predictors <- Glass[, setdiff(names(Glass), "Type")]

# Estimate a preprocessing recipe on the predictors: Yeo-Johnson
# transformation, centering, scaling and PCA. The recipe is only stored in
# `trans` here; nothing in this section applies it to the data.
trans <- preProcess(
  glass_predictors,
  method = c("YeoJohnson", "center", "scale", "pca")
)

#Ba, Fe, and K exhibit substantial right skewness, so a Yeo-Johnson transformation would help reduce skewness and stabilize variance. Additionally, centering and scaling the predictors would improve performance for models that are sensitive to differences in scale.

library(mlbench)
# Load the Soybean data frame from mlbench into the workspace.
data(Soybean, package = "mlbench")

# Show the data set's help page only when someone is at the console.
# The previous `??soybean` ran a full-text search over every installed
# package each time the script was sourced, which is slow and has no use in
# a non-interactive run.
if (interactive()) {
  print(help("Soybean", package = "mlbench"))
}

#3.2 The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.

# Level counts for three of the environmental predictors. table() leaves
# NAs out by default, so each set of counts sums to fewer than 683 rows.
table(Soybean[["plant.stand"]])
## 
##   0   1 
## 354 293
table(Soybean[["precip"]])
## 
##   0   1   2 
##  74 112 459
table(Soybean[["temp"]])
## 
##   0   1   2 
##  80 374 199
# The same plant.stand counts expressed as shares of the non-missing values.
proportions(table(Soybean[["plant.stand"]]))
## 
##         0         1 
## 0.5471406 0.4528594
# Per-column level counts (including NA counts) for the whole data frame.
summary(Soybean)
##                  Class          date     plant.stand  precip      temp    
##  brown-spot         : 92   5      :149   0   :354    0   : 74   0   : 80  
##  alternarialeaf-spot: 91   4      :131   1   :293    1   :112   1   :374  
##  frog-eye-leaf-spot : 91   3      :118   NA's: 36    2   :459   2   :199  
##  phytophthora-rot   : 88   2      : 93               NA's: 38   NA's: 30  
##  anthracnose        : 44   6      : 90                                    
##  brown-stem-rot     : 44   (Other):101                                    
##  (Other)            :233   NA's   :  1                                    
##    hail     crop.hist  area.dam    sever     seed.tmt     germ     plant.growth
##  0   :435   0   : 65   0   :123   0   :195   0   :305   0   :165   0   :441    
##  1   :127   1   :165   1   :227   1   :322   1   :222   1   :213   1   :226    
##  NA's:121   2   :219   2   :145   2   : 45   2   : 35   2   :193   NA's: 16    
##             3   :218   3   :187   NA's:121   NA's:121   NA's:112               
##             NA's: 16   NA's:  1                                                
##                                                                                
##                                                                                
##  leaves  leaf.halo  leaf.marg  leaf.size  leaf.shread leaf.malf  leaf.mild 
##  0: 77   0   :221   0   :357   0   : 51   0   :487    0   :554   0   :535  
##  1:606   1   : 36   1   : 21   1   :327   1   : 96    1   : 45   1   : 20  
##          2   :342   2   :221   2   :221   NA's:100    NA's: 84   2   : 20  
##          NA's: 84   NA's: 84   NA's: 84                          NA's:108  
##                                                                            
##                                                                            
##                                                                            
##    stem     lodging    stem.cankers canker.lesion fruiting.bodies ext.decay 
##  0   :296   0   :520   0   :379     0   :320      0   :473        0   :497  
##  1   :371   1   : 42   1   : 39     1   : 83      1   :104        1   :135  
##  NA's: 16   NA's:121   2   : 36     2   :177      NA's:106        2   : 13  
##                        3   :191     3   : 65                      NA's: 38  
##                        NA's: 38     NA's: 38                                
##                                                                             
##                                                                             
##  mycelium   int.discolor sclerotia  fruit.pods fruit.spots   seed    
##  0   :639   0   :581     0   :625   0   :407   0   :345    0   :476  
##  1   :  6   1   : 44     1   : 20   1   :130   1   : 75    1   :115  
##  NA's: 38   2   : 20     NA's: 38   2   : 14   2   : 57    NA's: 92  
##             NA's: 38                3   : 48   4   :100              
##                                     NA's: 84   NA's:106              
##                                                                      
##                                                                      
##  mold.growth seed.discolor seed.size  shriveling  roots    
##  0   :524    0   :513      0   :532   0   :539   0   :551  
##  1   : 67    1   : 64      1   : 59   1   : 38   1   : 86  
##  NA's: 92    NA's:106      NA's: 92   NA's:106   2   : 15  
##                                                  NA's: 31  
##                                                            
##                                                            
## 
# Flag near-zero-variance predictors. The outcome column is excluded so only
# predictors are screened, and names = TRUE reports column names rather than
# positions, which would shift whenever a column is added or dropped.
nearZeroVar(Soybean[, names(Soybean) != "Class"], names = TRUE)
## [1] "leaf.mild" "mycelium"  "sclerotia"

#(a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

#Several categorical predictors exhibit highly imbalanced frequency distributions, with some levels occurring much more frequently than others. In certain variables, one category dominates the majority of observations, suggesting near-zero variance. nearZeroVar flags three such predictors: leaf.mild, mycelium, and sclerotia (columns 19, 26, and 28); mycelium, for example, has 639 observations at level 0 and only 6 at level 1. These degenerate predictors may provide little discriminatory information and could negatively affect some classification models.

#(b) Roughly 18 % of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

# Number of samples with no recorded plant.stand value.
sum(is.na(Soybean[["plant.stand"]]))
## [1] 36
# Cross-tabulate disease class against "plant.stand is missing" to see
# which classes the missing values fall in.
table(Soybean[["Class"]], is.na(Soybean[["plant.stand"]]))
##                              
##                               FALSE TRUE
##   2-4-d-injury                    0   16
##   alternarialeaf-spot            91    0
##   anthracnose                    44    0
##   bacterial-blight               20    0
##   bacterial-pustule              20    0
##   brown-spot                     92    0
##   brown-stem-rot                 44    0
##   charcoal-rot                   20    0
##   cyst-nematode                   0   14
##   diaporthe-pod-&-stem-blight     9    6
##   diaporthe-stem-canker          20    0
##   downy-mildew                   20    0
##   frog-eye-leaf-spot             91    0
##   herbicide-injury                8    0
##   phyllosticta-leaf-spot         20    0
##   phytophthora-rot               88    0
##   powdery-mildew                 20    0
##   purple-seed-stain              20    0
##   rhizoctonia-root-rot           20    0
# Share of all cells that are missing. is.na() on a data frame returns a
# logical matrix, so mean() divides by rows * columns. The earlier
# `/ length(Soybean)` divided by the column count only (36) and reported
# NA cells per column (64.9), not a proportion.
mean(is.na(Soybean))
## [1] 0.09504636
# Share of samples with at least one missing value; this is the
# "roughly 18 %" quoted in the exercise.
mean(!complete.cases(Soybean))
## [1] 0.1771596

#Several predictors contain substantial missing data. The amount of missingness varies across predictors, indicating that some variables are more prone to missing values than others. When comparing missing values by class, there appears to be variation in missingness across outcome categories, suggesting that the missing data pattern may not be completely random.

#(c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.

# Mode imputation for the Soybean predictors.
#
# caret's "knnImpute" only works on numeric columns. Every Soybean column is
# a factor, so preProcess() dropped the method with a warning and
# `Soybean_imputed` ended up as an unimputed copy of `Soybean`. Replacing
# each missing value with the column's most frequent observed level is a
# dependency-free alternative that keeps the predictors categorical.

# Replace the NAs in `x` with its most frequent non-missing value (ties go
# to the value seen first). Returns `x` unchanged when it has no NAs or no
# observed values to choose from.
impute_mode <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L || length(observed) == length(x)) {
    return(x)
  }
  candidates <- unique(observed)
  modal <- candidates[which.max(tabulate(match(observed, candidates)))]
  x[is.na(x)] <- modal
  x
}

# Impute the predictors only; the outcome column is left untouched.
soy_predictor_cols <- names(Soybean) != "Class"
Soybean_imputed <- Soybean
Soybean_imputed[soy_predictor_cols] <- lapply(
  Soybean[soy_predictor_cols],
  impute_mode
)

# Fail loudly if any predictor value is still missing.
stopifnot(!anyNA(Soybean_imputed[soy_predictor_cols]))

#A reasonable strategy would involve removing predictors with excessive missingness and applying imputation to the remaining variables. For categorical predictors, mode imputation or k-nearest neighbors imputation could be appropriate.