library(mlbench)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
data(Soybean)
#Q3.1 A
Glass_long <- Glass |>
pivot_longer(-Type, names_to = "Predictor", values_to = "Value")
ggplot(Glass_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "red", color = "white") +
facet_wrap(~ Predictor, scales = "free")
ggplot(Glass_long, aes(x = Type, y = Value, fill = Type)) +
geom_boxplot() +
facet_wrap(~ Predictor, scales = "free")
#PCA
x <- Glass[, 1:9]
y <- Glass$Type
pca_model <- prcomp(x,
center = TRUE,
scale. = TRUE)
summary(pca_model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 1.585 1.4318 1.1853 1.0760 0.9560 0.72639 0.6074 0.25269
## Proportion of Variance 0.279 0.2278 0.1561 0.1286 0.1016 0.05863 0.0410 0.00709
## Cumulative Proportion 0.279 0.5068 0.6629 0.7915 0.8931 0.95173 0.9927 0.99982
## PC9
## Standard deviation 0.04011
## Proportion of Variance 0.00018
## Cumulative Proportion 1.00000
plot(pca_model, type = "l")
pca_model$rotation
## PC1 PC2 PC3 PC4 PC5 PC6
## RI -0.5451766 0.28568318 0.0869108293 0.14738099 -0.073542700 0.11528772
## Na 0.2581256 0.27035007 -0.3849196197 0.49124204 0.153683304 -0.55811757
## Mg -0.1108810 -0.59355826 0.0084179590 0.37878577 0.123509124 0.30818598
## Al 0.4287086 0.29521154 0.3292371183 -0.13750592 0.014108879 -0.01885731
## Si 0.2288364 -0.15509891 -0.4587088382 -0.65253771 0.008500117 0.08609797
## K 0.2193440 -0.15397013 0.6625741197 -0.03853544 -0.307039842 -0.24363237
## Ca -0.4923061 0.34537980 -0.0009847321 -0.27644322 -0.188187742 -0.14866937
## Ba 0.2503751 0.48470218 0.0740547309 0.13317545 0.251334261 0.65721884
## Fe -0.1858415 -0.06203879 0.2844505524 -0.23049202 0.873264047 -0.24304431
## PC7 PC8 PC9
## RI -0.08186724 -0.75221590 -0.02573194
## Na -0.14858006 -0.12769315 0.31193718
## Mg 0.20604537 -0.07689061 0.57727335
## Al 0.69923557 -0.27444105 0.19222686
## Si -0.21606658 -0.37992298 0.29807321
## K -0.50412141 -0.10981168 0.26050863
## Ca 0.09913463 0.39870468 0.57932321
## Ba -0.35178255 0.14493235 0.19822820
## Fe -0.07372136 -0.01627141 0.01466944
pca_scores <- as.data.frame(pca_model$x)
pca_scores$Type <- y
ggplot(pca_scores, aes(PC1, PC2, color = Type)) +
geom_point(size = 3, alpha = 0.7) +
theme_minimal()
#b.
#Yes there are clearly outliers in several of the percentages such as (Barium)Ba, (Iron)Fe, and (Potassium)K. Barium has a substantial number of 0 values except for glass type 7 which is the exception. There were outlier values in other types for barium but clearly rare. Iron showed a lot of 0 values in types 5,6, and 7 but had less in the other glass types. Despite this Iron had very few values or distributions above 0.3 with outliers only rising to about 0.5 which isn't substatnial. Potassium had a lot more of an interesting spread as most distributions had non-zero values less than 1 but there were some rare huge ouliers including a 6 in glass type 5 which would be very substantial. Calcium is also one that shows some substantial outliers in glass type 2 with with values above 15 which is substantial as well though not as extreme as potassium's one outlier.
#C.
#Since there are some disparities and a large amount of 0 values, it may be valuable to transform the data with a log +1 transformation to help with some of heavily skewed variables influence. Below is an example done on a few of the elements and it does show how when something is severely right tailed. I don't think this is a transformation that helps the accuracy of our PCA, but I do think it is useful for observing the values of these heavily skewed elements to not dismiss their value. Our PCA test did show that in PC5 where Fe was giving a very small impact on our variance.
Glass_log <- Glass %>%
mutate(
Ba_log = log(Ba + 1),
Fe_log = log(Fe + 1),
K_log = log(K + 1)
)
hist(Glass$Ba, main = "Original Ba")
hist(log(Glass$Ba + 1), main = "Log-Transformed Ba")
hist(Glass$K, main = "Original K")
hist(log(Glass$K + 1), main = "Log-Transformed K")
#Q3.2
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
#frequency summary shows some areas like leaves have over 80% of the value 1 while others are somewhat evenly occurring
freq_summary <- lapply(Soybean[, -ncol(Soybean)], function(x) {
prop.table(table(x))
})
freq_summary
## $Class
## x
## 2-4-d-injury alternarialeaf-spot
## 0.02342606 0.13323572
## anthracnose bacterial-blight
## 0.06442167 0.02928258
## bacterial-pustule brown-spot
## 0.02928258 0.13469985
## brown-stem-rot charcoal-rot
## 0.06442167 0.02928258
## cyst-nematode diaporthe-pod-&-stem-blight
## 0.02049780 0.02196193
## diaporthe-stem-canker downy-mildew
## 0.02928258 0.02928258
## frog-eye-leaf-spot herbicide-injury
## 0.13323572 0.01171303
## phyllosticta-leaf-spot phytophthora-rot
## 0.02928258 0.12884334
## powdery-mildew purple-seed-stain
## 0.02928258 0.02928258
## rhizoctonia-root-rot
## 0.02928258
##
## $date
## x
## 0 1 2 3 4 5 6
## 0.03812317 0.10997067 0.13636364 0.17302053 0.19208211 0.21847507 0.13196481
##
## $plant.stand
## x
## 0 1
## 0.5471406 0.4528594
##
## $precip
## x
## 0 1 2
## 0.1147287 0.1736434 0.7116279
##
## $temp
## x
## 0 1 2
## 0.1225115 0.5727412 0.3047473
##
## $hail
## x
## 0 1
## 0.7740214 0.2259786
##
## $crop.hist
## x
## 0 1 2 3
## 0.09745127 0.24737631 0.32833583 0.32683658
##
## $area.dam
## x
## 0 1 2 3
## 0.1803519 0.3328446 0.2126100 0.2741935
##
## $sever
## x
## 0 1 2
## 0.34697509 0.57295374 0.08007117
##
## $seed.tmt
## x
## 0 1 2
## 0.54270463 0.39501779 0.06227758
##
## $germ
## x
## 0 1 2
## 0.2889667 0.3730298 0.3380035
##
## $plant.growth
## x
## 0 1
## 0.6611694 0.3388306
##
## $leaves
## x
## 0 1
## 0.1127379 0.8872621
##
## $leaf.halo
## x
## 0 1 2
## 0.36894825 0.06010017 0.57095159
##
## $leaf.marg
## x
## 0 1 2
## 0.59599332 0.03505843 0.36894825
##
## $leaf.size
## x
## 0 1 2
## 0.0851419 0.5459098 0.3689482
##
## $leaf.shread
## x
## 0 1
## 0.8353345 0.1646655
##
## $leaf.malf
## x
## 0 1
## 0.92487479 0.07512521
##
## $leaf.mild
## x
## 0 1 2
## 0.93043478 0.03478261 0.03478261
##
## $stem
## x
## 0 1
## 0.4437781 0.5562219
##
## $lodging
## x
## 0 1
## 0.9252669 0.0747331
##
## $stem.cankers
## x
## 0 1 2 3
## 0.58759690 0.06046512 0.05581395 0.29612403
##
## $canker.lesion
## x
## 0 1 2 3
## 0.4961240 0.1286822 0.2744186 0.1007752
##
## $fruiting.bodies
## x
## 0 1
## 0.8197574 0.1802426
##
## $ext.decay
## x
## 0 1 2
## 0.77054264 0.20930233 0.02015504
##
## $mycelium
## x
## 0 1
## 0.990697674 0.009302326
##
## $int.discolor
## x
## 0 1 2
## 0.90077519 0.06821705 0.03100775
##
## $sclerotia
## x
## 0 1
## 0.96899225 0.03100775
##
## $fruit.pods
## x
## 0 1 2 3
## 0.67946578 0.21702838 0.02337229 0.08013356
##
## $fruit.spots
## x
## 0 1 2 4
## 0.59792028 0.12998267 0.09878683 0.17331023
##
## $seed
## x
## 0 1
## 0.8054146 0.1945854
##
## $mold.growth
## x
## 0 1
## 0.8866328 0.1133672
##
## $seed.discolor
## x
## 0 1
## 0.8890815 0.1109185
##
## $seed.size
## x
## 0 1
## 0.9001692 0.0998308
##
## $shriveling
## x
## 0 1
## 0.93414211 0.06585789
#Just temp
freq_summary$temp
## x
## 0 1 2
## 0.1225115 0.5727412 0.3047473
#NZV
NearZero_soy <- nearZeroVar(Soybean[, -ncol(Soybean)], saveMetrics = TRUE)
NearZero_soy
## freqRatio percentUnique zeroVar nzv
## Class 1.010989 2.7818448 FALSE FALSE
## date 1.137405 1.0248902 FALSE FALSE
## plant.stand 1.208191 0.2928258 FALSE FALSE
## precip 4.098214 0.4392387 FALSE FALSE
## temp 1.879397 0.4392387 FALSE FALSE
## hail 3.425197 0.2928258 FALSE FALSE
## crop.hist 1.004587 0.5856515 FALSE FALSE
## area.dam 1.213904 0.5856515 FALSE FALSE
## sever 1.651282 0.4392387 FALSE FALSE
## seed.tmt 1.373874 0.4392387 FALSE FALSE
## germ 1.103627 0.4392387 FALSE FALSE
## plant.growth 1.951327 0.2928258 FALSE FALSE
## leaves 7.870130 0.2928258 FALSE FALSE
## leaf.halo 1.547511 0.4392387 FALSE FALSE
## leaf.marg 1.615385 0.4392387 FALSE FALSE
## leaf.size 1.479638 0.4392387 FALSE FALSE
## leaf.shread 5.072917 0.2928258 FALSE FALSE
## leaf.malf 12.311111 0.2928258 FALSE FALSE
## leaf.mild 26.750000 0.4392387 FALSE TRUE
## stem 1.253378 0.2928258 FALSE FALSE
## lodging 12.380952 0.2928258 FALSE FALSE
## stem.cankers 1.984293 0.5856515 FALSE FALSE
## canker.lesion 1.807910 0.5856515 FALSE FALSE
## fruiting.bodies 4.548077 0.2928258 FALSE FALSE
## ext.decay 3.681481 0.4392387 FALSE FALSE
## mycelium 106.500000 0.2928258 FALSE TRUE
## int.discolor 13.204545 0.4392387 FALSE FALSE
## sclerotia 31.250000 0.2928258 FALSE TRUE
## fruit.pods 3.130769 0.5856515 FALSE FALSE
## fruit.spots 3.450000 0.5856515 FALSE FALSE
## seed 4.139130 0.2928258 FALSE FALSE
## mold.growth 7.820896 0.2928258 FALSE FALSE
## seed.discolor 8.015625 0.2928258 FALSE FALSE
## seed.size 9.016949 0.2928258 FALSE FALSE
## shriveling 14.184211 0.2928258 FALSE FALSE
#A.
#As mentioned in the book, a small number of unique values is not a concern but can be indicative of degenerate data if "The ratio of the frequency of the most prevalent value to the frequency of the second most prevalent value is large" which is shown in the frequency ratio. We can test this with the nonZeroVar() function and see that we do have some predictors that may be valuable to remove as the nzv is true and the frequency ratio is exponentially higher than that of the other predictors here.
#B
#Below we can see that the missing data is not even but is more likely to be seen in areas like hail,sever,seed.tmt, and lodging while some variables like plant.growth, and stem have nearly zero missing data. Even more the classes show an interesting result as 2-4-d-injury, cyst-nematode, and herbicide-injury show a lot more missing data than any other class. It is actually apparent that most classes have no missing data which makes the idea that this is MCAR or missing at random data severely low. I think the reporting on those classes is clearly impacted by their class and the data that is missing in each category is more likely correlated to the class than the observation not seen.
mean(is.na(Soybean))
## [1] 0.09504636
missing_by_var <- colMeans(is.na(Soybean))
sort(missing_by_var, decreasing = TRUE)
## hail sever seed.tmt lodging germ
## 0.177159590 0.177159590 0.177159590 0.177159590 0.163982430
## leaf.mild fruiting.bodies fruit.spots seed.discolor shriveling
## 0.158125915 0.155197657 0.155197657 0.155197657 0.155197657
## leaf.shread seed mold.growth seed.size leaf.halo
## 0.146412884 0.134699854 0.134699854 0.134699854 0.122986823
## leaf.marg leaf.size leaf.malf fruit.pods precip
## 0.122986823 0.122986823 0.122986823 0.122986823 0.055636896
## stem.cankers canker.lesion ext.decay mycelium int.discolor
## 0.055636896 0.055636896 0.055636896 0.055636896 0.055636896
## sclerotia plant.stand roots temp crop.hist
## 0.055636896 0.052708638 0.045387994 0.043923865 0.023426061
## plant.growth stem date area.dam Class
## 0.023426061 0.023426061 0.001464129 0.001464129 0.000000000
## leaves
## 0.000000000
missing_class <- Soybean %>%
mutate(missing_count = rowSums(is.na(.))) %>%
group_by(Class) %>%
summarise(mean_missing = mean(missing_count))
missing_class
## # A tibble: 19 × 2
## Class mean_missing
## <fct> <dbl>
## 1 2-4-d-injury 28.1
## 2 alternarialeaf-spot 0
## 3 anthracnose 0
## 4 bacterial-blight 0
## 5 bacterial-pustule 0
## 6 brown-spot 0
## 7 brown-stem-rot 0
## 8 charcoal-rot 0
## 9 cyst-nematode 24
## 10 diaporthe-pod-&-stem-blight 11.8
## 11 diaporthe-stem-canker 0
## 12 downy-mildew 0
## 13 frog-eye-leaf-spot 0
## 14 herbicide-injury 20
## 15 phyllosticta-leaf-spot 0
## 16 phytophthora-rot 13.8
## 17 powdery-mildew 0
## 18 purple-seed-stain 0
## 19 rhizoctonia-root-rot 0
#C.
#Some potential strategies for fixing the missing data by eliminating predictors could involve removing those that may not be substantially useful or those with extreme missing values. Since the missing values are somewhat concentrated in certain classes I would resist it's removal but some values may not have as much predictive power. For example temp is on a scale of 0-2 which may simply be measuring the presence of a temp. If all the classes were being measured on a fixed temperature and this was only checking the presence of the reading or above/below a certain temp, it may be helpful to remove. Imputation could also be a valuable option since the values we are looking at seem to be whole integers between 0-4 in most cases, and 0-2 in many columns as well. I would say mode imputation would be very useful here as it would not run the risk of providing substantial outliers and may fit the average data very well as even the highest missing columns are just under 18%.
#Dummy variables as dicussed here may be very valuable with the NA's present. This data is a great time to use a random forrest model of imputation as it is often well regarded for data that is categorical and with this type of data being non-numeric it also has obvious non-linear relationships. For this reason we would likely want to keep the NA values and turn them into somehting useful for a tree model.The large mtry value is suggesting a broad set of predictors at each iteration improved classification performance most probably due to complex interactions among variables. Based on the accuracy and Kappa values, the model seems to show strong predictive performance.
nzv_metrics <- nearZeroVar(Soybean[, -ncol(Soybean)], saveMetrics = TRUE)
nzv_names <- rownames(nzv_metrics[nzv_metrics$nzv == TRUE, ])
Soybean_clean <- Soybean %>%
select(-all_of(nzv_names)) %>%
mutate(across(-Class, ~ addNA(.)))
ctrl <- trainControl(method = "cv", number = 5)
rf_model <- train(Class ~ .,
data = Soybean_clean,
method = "rf",
trControl = ctrl)
rf_model
## Random Forest
##
## 683 samples
## 32 predictor
## 19 classes: '2-4-d-injury', 'alternarialeaf-spot', 'anthracnose', 'bacterial-blight', 'bacterial-pustule', 'brown-spot', 'brown-stem-rot', 'charcoal-rot', 'cyst-nematode', 'diaporthe-pod-&-stem-blight', 'diaporthe-stem-canker', 'downy-mildew', 'frog-eye-leaf-spot', 'herbicide-injury', 'phyllosticta-leaf-spot', 'phytophthora-rot', 'powdery-mildew', 'purple-seed-stain', 'rhizoctonia-root-rot'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 546, 545, 549, 548, 544
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8125492 0.7918851
## 47 0.9385681 0.9326837
## 92 0.9327166 0.9262533
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 47.
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.