library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(mlbench)
library(ggplot2)
library(lattice)
Having a flexible statistical model would not give us any more information than an inflexible model. Our sample size is large and don’t need more flexibility in the model to gain additional information.
The performance of a flexible model on a large number of predictors and small sample size would be worse. This is due to overfitting of the model.
Flexible model would have better performance due to not being restricted. Although it may have more flexibility, it may be less interpretable.
We would expect the performance on a flexible model to be better. Small changes in the training data could result in large changes in the model.
Squared bias: Will decrease as you increase the flexibility of the model. There are less assumptions about the structure of the data.
Variance: Variance increases as the flexibilty of a model increases.
Training error: Training MSE declines monotonically as flexibility increases.
Test error: Decreases as flexibility increases and then it increases.
Bayes: Corresponds to the lowest achievable test MSE among all possible methods and has no relation to flexibilty due to not being able to be modified.
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
plot(Boston)
round(cor(Boston), digits = 2)
## crim zn indus chas nox rm age dis rad tax ptratio
## crim 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
## zn -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
## indus 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
## chas -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
## nox 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
## rm -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
## age 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
## dis -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
## rad 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
## tax 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
## ptratio 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## lstat 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## medv -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## lstat medv
## crim 0.46 -0.39
## zn -0.41 0.36
## indus 0.60 -0.48
## chas -0.05 0.18
## nox 0.59 -0.43
## rm -0.61 0.70
## age 0.60 -0.38
## dis -0.50 0.25
## rad 0.49 -0.38
## tax 0.54 -0.47
## ptratio 0.37 -0.51
## lstat 1.00 -0.74
## medv -0.74 1.00
plot(Boston$dis, Boston$crim)
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
summary(Boston$tax)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
summary(Boston$ptratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
qplot(Boston$crim,
binwidth=5 ,
xlab = "Crime rate",
ylab="Number of Suburbs" )
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
qplot(Boston$tax,
width=5 ,
xlab = "Full-value property-tax rate per $10,000",
ylab = "Number of Suburbs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(Boston$ptratio,
width=5 ,
xlab = "Pupil-Teacher Ratio by Town",
ylab = "Number of Suburbs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nrow(subset(Boston, chas == 1))
## [1] 35
Median for pupil-teacher ratio is 19.05. It was collected from the previous summary() code.
Census tract 399 has the lowest median value of the owneroccupied homes feature. Crime rate is particularly high compared to the mean value. Proportion of residental land zoned is equal to the median value, which is not an alarming value. Proportion of non-retail business acres per town is higher than normal, close to the 3rd quantile. The tract is not bounded by the Charles River. Nitrogen oxide concentration is also relatively high in suburb 399.Tract 399 has below average rooms per dwelling. Has the highest proportion of owner-occupied units built prior to 1940. Is in close proximity to the five Boston employment centres. Based on the evaluation of each variable for census tract 399, it can be concluded that it is not the best suburb to live in.
selection <- Boston[order(Boston$medv),]
selection[1,]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
plot(Glass)
scatter.smooth(Glass$Ca, Glass$RI, span = 2/4, degree = 1)
Glass |>
ggplot(aes(x= Na,
y= Ba,
color = "red")) +
geom_point() +
geom_smooth(method = NULL)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
a <- Glass[,1:9]
par(mfrow = c(3, 3))
for (i in 1:ncol(a)) {
hist(a[ ,i], xlab = names(a[i]), main = paste(names(a[i]), "Histogram"))
}
par(mfrow=c(2,4))
for (col in 2:ncol(Glass)) {
boxplot(Glass[,col],main = colnames(Glass)[col], xlab = colnames(Glass)[col])
}
data("Soybean")
class("Soybean")
## [1] "character"
zero_cols = nearZeroVar( Soybean )
colnames( Soybean )[ zero_cols ]
## [1] "leaf.mild" "mycelium" "sclerotia"
Soybean = Soybean[,-zero_cols]
table(Soybean$crop.hist)
##
## 0 1 2 3
## 65 165 219 218
Soybean2 <- Soybean
Soybean2$temp <- recode(Soybean2$temp,
"0 = 'low'; 1 = 'norm'; 2 = 'high'; NA = 'missing'",
levels = c("low", "norm", "high", "missing"))
table(Soybean2$temp, useNA = "always")
##
## low norm high missing <NA>
## 80 374 199 30 0
apply(Soybean, 2, function(x){ sum(is.na(x)) })
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf stem lodging
## 84 100 84 16 121
## stem.cankers canker.lesion fruiting.bodies ext.decay int.discolor
## 38 38 106 38 38
## fruit.pods fruit.spots seed mold.growth seed.discolor
## 84 106 92 92 106
## seed.size shriveling roots
## 92 106 31
counts <- table(Soybean$Class) %>%
data.frame()
counts
## Var1 Freq
## 1 2-4-d-injury 16
## 2 alternarialeaf-spot 91
## 3 anthracnose 44
## 4 bacterial-blight 20
## 5 bacterial-pustule 20
## 6 brown-spot 92
## 7 brown-stem-rot 44
## 8 charcoal-rot 20
## 9 cyst-nematode 14
## 10 diaporthe-pod-&-stem-blight 15
## 11 diaporthe-stem-canker 20
## 12 downy-mildew 20
## 13 frog-eye-leaf-spot 91
## 14 herbicide-injury 8
## 15 phyllosticta-leaf-spot 20
## 16 phytophthora-rot 88
## 17 powdery-mildew 20
## 18 purple-seed-stain 20
## 19 rhizoctonia-root-rot 20
Soybean[!complete.cases(Soybean),] %>%
group_by(Var1=Class) %>%
summarize(missing_data=n()) %>%
left_join(counts, by="Var1")
## # A tibble: 5 × 3
## Var1 missing_data Freq
## <fct> <int> <int>
## 1 2-4-d-injury 16 16
## 2 cyst-nematode 14 14
## 3 diaporthe-pod-&-stem-blight 15 15
## 4 herbicide-injury 8 8
## 5 phytophthora-rot 68 88
preProcess( Soybean[,-1], method=c("knnImpute"), na.remove=FALSE )
## Warning in pre_process_options(method, column_types): The following
## pre-processing methods were eliminated: 'knnImpute', 'center', 'scale'
## Created from 562 samples and 32 variables
##
## Pre-processing:
## - ignored (32)