# Read the wine-quality table from disk and print summary statistics for
# every column.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# wherever this script is run.
WQ <- read.csv(file = "D:/R course/Practicing/Wine Quality/WQ.csv")
summary(object = WQ)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.0000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.4000 1st Qu.:0.1100 1st Qu.: 2.000
## Median : 8.00 Median :0.5300 Median :0.2000 Median : 2.300
## Mean : 8.67 Mean :0.5418 Mean :0.2467 Mean : 3.699
## 3rd Qu.: 9.90 3rd Qu.:0.6600 3rd Qu.:0.3850 3rd Qu.: 3.460
## Max. :15.90 Max. :1.5800 Max. :1.0000 Max. :15.990
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01000 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.05600 1st Qu.: 9.00 1st Qu.: 24.00 1st Qu.:0.9953
## Median :0.07500 Median :16.00 Median : 42.00 Median :0.9966
## Mean :0.07586 Mean :20.19 Mean : 52.62 Mean :0.9965
## 3rd Qu.:0.08600 3rd Qu.:27.00 3rd Qu.: 73.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.34 Min. :0.3300 Min. : 8.40 Min. :2.000
## 1st Qu.:3.18 1st Qu.:0.5600 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.30 Median :0.6500 Median :10.40 Median :6.000
## Mean :3.29 Mean :0.9495 Mean :10.67 Mean :5.638
## 3rd Qu.:3.42 3rd Qu.:0.8400 3rd Qu.:11.40 3rd Qu.:6.000
## Max. :4.16 Max. :3.9900 Max. :15.00 Max. :9.000
# One histogram per physicochemical column, drawn in the order listed. The
# title and x-axis label are written out so they read exactly as hist() would
# label a `WQ$<column>` argument.
invisible(lapply(
  c(
    "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
    "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
    "pH", "sulphates"
  ),
  function(column) {
    label <- paste0("WQ$", column)
    hist(WQ[[column]], main = paste("Histogram of", label), xlab = label)
  }
))

# Alcohol: value-by-row-index scatter first, then its histogram.
plot(WQ[["alcohol"]], ylab = "WQ$alcohol")

hist(WQ[["alcohol"]], main = "Histogram of WQ$alcohol", xlab = "WQ$alcohol")

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Violin plot of alcohol content. The constant empty-string x aesthetic puts
# all rows into a single violin; trim = FALSE leaves the density tails
# untrimmed.
ggplot(data = WQ, mapping = aes(x = "", y = alcohol)) +
  geom_violin(trim = FALSE) +
  theme_minimal() +
  labs(title = "Violin Plot", y = "alcohol%")

# Frequency of each quality score, followed by a histogram of the scores.
table(WQ[["quality"]])
##
## 2 3 4 5 6 7 8 9
## 44 60 98 735 678 265 59 60
hist(WQ[["quality"]], main = "Histogram of WQ$quality", xlab = "WQ$quality")

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
# Correlation between fixed acidity and quality.
correlation_data <- WQ[c("fixed.acidity", "quality")]
# 2 x 2 correlation matrix of the two selected columns
cor_matrix <- cor(x = correlation_data)
# Echo the matrix to the console
print(cor_matrix)
## fixed.acidity quality
## fixed.acidity 1.00000000 0.04483084
## quality 0.04483084 1.00000000
# Draw the lower triangle as labelled circles on a red-white-blue colour scale
ggcorrplot(
  corr = cor_matrix,
  method = "circle",
  type = "lower",
  title = "Correlation Matrix of Wine Parameters",
  colors = c("red", "white", "blue"),
  lab = TRUE
)

# Simple linear regression of quality on fixed acidity.
model <- lm(formula = quality ~ fixed.acidity, data = WQ)
# Coefficient table and goodness-of-fit statistics
summary(model)
##
## Call:
## lm(formula = quality ~ fixed.acidity, data = WQ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7969 -0.6260 0.2911 0.4092 3.4544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.41995 0.11221 48.303 <2e-16 ***
## fixed.acidity 0.02513 0.01253 2.005 0.0451 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.255 on 1997 degrees of freedom
## Multiple R-squared: 0.00201, Adjusted R-squared: 0.00151
## F-statistic: 4.022 on 1 and 1997 DF, p-value: 0.04505
# Scatter of the raw observations with the fitted line drawn on top
with(WQ, plot(
  fixed.acidity, quality,
  pch = 19, col = "blue",
  xlab = "Fixed Acidity", ylab = "Quality",
  main = "Linear Regression of Quality vs Fixed Acidity"
))
abline(model, lwd = 2, col = "red")

# Correlation between volatile acidity and quality.
correlation_data <- WQ[c("volatile.acidity", "quality")]
# 2 x 2 correlation matrix of the two selected columns
cor_matrix <- cor(x = correlation_data)
# Echo the matrix to the console
print(cor_matrix)
## volatile.acidity quality
## volatile.acidity 1.0000000 -0.1688718
## quality -0.1688718 1.0000000
# Draw the lower triangle as labelled circles on a red-white-blue colour scale
ggcorrplot(
  corr = cor_matrix,
  method = "circle",
  type = "lower",
  title = "Correlation Matrix of Wine Parameters",
  colors = c("red", "white", "blue"),
  lab = TRUE
)

# Simple linear regression of quality on volatile acidity.
model <- lm(quality ~ volatile.acidity, data = WQ)
# Summary of the regression model
summary(model)
##
## Call:
## lm(formula = quality ~ volatile.acidity, data = WQ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1336 -0.6399 0.0780 0.5012 3.7715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.27465 0.08766 71.577 < 2e-16 ***
## volatile.acidity -1.17547 0.15353 -7.656 2.96e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.238 on 1997 degrees of freedom
## Multiple R-squared: 0.02852, Adjusted R-squared: 0.02803
## F-statistic: 58.62 on 1 and 1997 DF, p-value: 2.956e-14
# Plot the relationship with the regression line.
# The x variable must be the model's predictor (volatile.acidity); otherwise
# abline(model) is drawn over points from a different variable and the
# title/axis label do not describe the data shown.
plot(WQ$volatile.acidity, WQ$quality,
     main = "Linear Regression of Quality vs Volatile Acidity",
     xlab = "Volatile Acidity", ylab = "Quality", pch = 19, col = "blue")
abline(model, col = "red", lwd = 2)

# Correlation between residual sugar and quality.
correlation_data <- WQ[c("residual.sugar", "quality")]
# 2 x 2 correlation matrix of the two selected columns
cor_matrix <- cor(x = correlation_data)
# Echo the matrix to the console
print(cor_matrix)
## residual.sugar quality
## residual.sugar 1.000000000 -0.007908868
## quality -0.007908868 1.000000000
# Draw the lower triangle as labelled circles on a red-white-blue colour scale
ggcorrplot(
  corr = cor_matrix,
  method = "circle",
  type = "lower",
  title = "Correlation Matrix of Wine Parameters",
  colors = c("red", "white", "blue"),
  lab = TRUE
)

# Correlation between pH and quality.
correlation_data <- WQ[c("pH", "quality")]
cor_matrix <- cor(x = correlation_data)
# Lower-triangle heatmap with outlined square tiles and value labels
ggcorrplot(
  corr = cor_matrix,
  method = "square",
  type = "lower",
  title = "Correlation Matrix of Wine Parameters",
  colors = c("#6D9EC1", "white", "#E46726"),
  outline.color = "black",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3
)

# Linear model of quality on pH (kept in `model`; the plot below fits its own
# line through geom_smooth()).
model <- lm(formula = quality ~ pH, data = WQ)
# Scatter of quality against pH with a linear fit and its confidence band
ggplot(data = WQ, mapping = aes(x = pH, y = quality)) +
  geom_point(size = 2, alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Regression of Wine Quality vs pH",
    x = "pH",
    y = "Quality"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between pH and fixed acidity.
correlation_data <- WQ[, c("pH", "fixed.acidity")]
# Compute the correlation matrix
cor_matrix <- cor(correlation_data)
# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Fit pH on fixed acidity so that `model` describes the same line that
# geom_smooth() draws below (y = pH, x = fixed.acidity).
model <- lm(pH ~ fixed.acidity, data = WQ)
# Create a regression plot using ggplot2; the title names the plotted
# variables in the same "y vs x" order used elsewhere in this script.
ggplot(WQ, aes(x = fixed.acidity, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) + # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) + # Regression line with confidence interval
  labs(title = "Regression of pH vs fixed.acidity",
       x = "fixed.acidity",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between pH and volatile acidity.
correlation_data <- WQ[, c("pH", "volatile.acidity")]
# Compute the correlation matrix
cor_matrix <- cor(correlation_data)
# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Fit pH on volatile acidity so that `model` describes the same line that
# geom_smooth() draws below (y = pH, x = volatile.acidity).
model <- lm(pH ~ volatile.acidity, data = WQ)
# Create a regression plot using ggplot2; the title names the plotted
# variables in the same "y vs x" order used elsewhere in this script.
ggplot(WQ, aes(x = volatile.acidity, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) + # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) + # Regression line with confidence interval
  labs(title = "Regression of pH vs volatile.acidity",
       x = "volatile.acidity",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between pH and citric acid.
correlation_data <- WQ[, c("pH", "citric.acid")]
# Compute the correlation matrix
cor_matrix <- cor(correlation_data)
# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Fit pH on citric acid so that `model` describes the same line that
# geom_smooth() draws below (y = pH, x = citric.acid).
model <- lm(pH ~ citric.acid, data = WQ)
# Create a regression plot using ggplot2; the title names the plotted
# variables in the same "y vs x" order used elsewhere in this script.
ggplot(WQ, aes(x = citric.acid, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) + # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) + # Regression line with confidence interval
  labs(title = "Regression of pH vs citric.acid",
       x = "citric.acid",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between pH and total sulfur dioxide.
correlation_data <- WQ[, c("pH", "total.sulfur.dioxide")]
# Compute the correlation matrix
cor_matrix <- cor(correlation_data)
# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Fit pH on total sulfur dioxide so that `model` describes the same line that
# geom_smooth() draws below (y = pH, x = total.sulfur.dioxide).
model <- lm(pH ~ total.sulfur.dioxide, data = WQ)
# Create a regression plot using ggplot2; the title names the variables that
# are actually plotted, in the "y vs x" order used elsewhere in this script.
ggplot(WQ, aes(x = total.sulfur.dioxide, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) + # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) + # Regression line with confidence interval
  labs(title = "Regression of pH vs total.sulfur.dioxide",
       x = "total.sulfur.dioxide",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Keep the eleven physicochemical measurement columns (quality is left out).
physicochemical_data <- WQ[c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol"
)]
# Pairwise correlations between all selected columns
cor_matrix <- cor(x = physicochemical_data)
# Lower-triangle heatmap; hc.order = TRUE asks ggcorrplot to reorder the
# variables by hierarchical clustering.
ggcorrplot(
  corr = cor_matrix,
  method = "square",
  type = "lower",
  title = "Correlation Heatmap of Physicochemical Properties",
  colors = c("#6D9EC1", "white", "#E46726"),
  outline.color = "black",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3
)

# Load necessary libraries
library(caret) # For classification models and data splitting
## Loading required package: lattice
library(rpart) # For decision tree models
library(ggplot2) # For visualization
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Coerce quality to plain numeric. Going through character means that, were
# the column a factor, its labels rather than its internal codes are used.
WQ$quality <- as.numeric(as.character(WQ$quality))
# Step 1: bin the score into three ordered classes:
#   up to 4 -> "low", above 4 up to 6 -> "fair", above 6 -> "high"
WQ$quality_label <- cut(
  WQ$quality,
  breaks = c(-Inf, 4, 6, Inf),
  labels = c("low", "fair", "high")
)
# Step 2: drop the numeric score so only the class label remains as the target
WQ <- select(WQ, -quality)
# Step 3: 80/20 train/test split on the class label; the seed fixes which rows
# are drawn.
set.seed(123)
trainIndex <- createDataPartition(y = WQ$quality_label, p = 0.8, list = FALSE)
train_data <- WQ[trainIndex, ]
test_data <- WQ[-trainIndex, ]
# Step 4: Logistic Regression Model
# Multinomial logistic regression on all remaining columns, fitted through
# caret. trace = FALSE is forwarded to the underlying fitting routine to
# silence its per-iteration optimiser log; without it that log is printed for
# every resampled fit and floods the console/report with hundreds of lines.
log_model <- train(quality_label ~ ., data = train_data, method = "multinom",
                   trace = FALSE)
# Predict on test data
log_pred <- predict(log_model, newdata = test_data)
# Step 5: Decision Tree Model
# Classification tree on the same training data and predictors.
tree_model <- rpart(
  formula = quality_label ~ .,
  data = train_data,
  method = "class"
)
# Predicted class label for each test row
tree_pred <- predict(object = tree_model, newdata = test_data, type = "class")
# Step 6: Evaluate both models using confusion matrices
# Multinomial model: predicted classes against the true test labels.
cat("Logistic Regression Confusion Matrix:\n")
## Logistic Regression Confusion Matrix:
confusionMatrix(data = log_pred, reference = test_data$quality_label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low fair high
## low 3 2 10
## fair 24 264 39
## high 13 16 27
##
## Overall Statistics
##
## Accuracy : 0.7387
## 95% CI : (0.6926, 0.7812)
## No Information Rate : 0.7085
## P-Value [Acc > NIR] : 0.1014
##
## Kappa : 0.3251
##
## Mcnemar's Test P-Value : 2.685e-06
##
## Statistics by Class:
##
## Class: low Class: fair Class: high
## Sensitivity 0.075000 0.9362 0.35526
## Specificity 0.966480 0.4569 0.90994
## Pos Pred Value 0.200000 0.8073 0.48214
## Neg Pred Value 0.903394 0.7465 0.85673
## Prevalence 0.100503 0.7085 0.19095
## Detection Rate 0.007538 0.6633 0.06784
## Detection Prevalence 0.037688 0.8216 0.14070
## Balanced Accuracy 0.520740 0.6965 0.63260
# Decision tree: predicted classes against the true test labels.
cat("\nDecision Tree Confusion Matrix:\n")
##
## Decision Tree Confusion Matrix:
confusionMatrix(data = tree_pred, reference = test_data$quality_label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low fair high
## low 7 4 6
## fair 12 257 32
## high 21 21 38
##
## Overall Statistics
##
## Accuracy : 0.7588
## 95% CI : (0.7137, 0.8)
## No Information Rate : 0.7085
## P-Value [Acc > NIR] : 0.014556
##
## Kappa : 0.4277
##
## Mcnemar's Test P-Value : 0.002176
##
## Statistics by Class:
##
## Class: low Class: fair Class: high
## Sensitivity 0.17500 0.9113 0.50000
## Specificity 0.97207 0.6207 0.86957
## Pos Pred Value 0.41176 0.8538 0.47500
## Neg Pred Value 0.91339 0.7423 0.88050
## Prevalence 0.10050 0.7085 0.19095
## Detection Rate 0.01759 0.6457 0.09548
## Detection Prevalence 0.04271 0.7563 0.20101
## Balanced Accuracy 0.57353 0.7660 0.68478
# Optional: Visualize Decision Tree
# Draw the tree skeleton first, then overlay the split and leaf labels.
plot(x = tree_model)
text(x = tree_model, pretty = 0)

library(dplyr)
library(tidyr)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Step 1: Collect the eleven physicochemical measurements used for clustering
# (the quality score is deliberately not part of this set).
features <- WQ %>%
  select(all_of(c(
    "fixed.acidity", "volatile.acidity", "citric.acid",
    "residual.sugar", "chlorides", "free.sulfur.dioxide",
    "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol"
  )))
# Standardise each column with scale() defaults (centre and divide by the
# standard deviation) so that no variable dominates the Euclidean distances
# k-means relies on merely because of its units.
features_scaled <- scale(features)
# Step 2: Determine the optimal number of clusters using the elbow method.
# For every candidate k, fit k-means on the scaled features and keep the total
# within-cluster sum of squares (WSS). The "elbow" is the k after which adding
# more clusters no longer reduces WSS substantially.
# NOTE(review): kmeans() draws random starting centres, so this curve is only
# reproducible if a seed was set earlier in the script -- confirm.
k_candidates <- 1:10
# vapply() (rather than sapply()) guarantees a numeric vector with exactly one
# value per k, and fails loudly if a fit ever returns something else.
wss <- vapply(
  k_candidates,
  function(k) kmeans(features_scaled, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)
# Plot the elbow method result
ggplot(
  data.frame(Clusters = k_candidates, WSS = wss),
  aes(x = Clusters, y = WSS)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()

# Step 3: Fit the K-means clustering model with the chosen number of clusters
optimal_k <- 3 # Based on the elbow method
# nstart = 25 runs the algorithm from 25 random initialisations and keeps the
# best solution, which reduces the chance of a poor local optimum.
kmeans_model <- kmeans(features_scaled, centers = optimal_k, nstart = 25)
# Step 4: Add the cluster assignments to the original dataset.
# This relies on features_scaled having one row per row of WQ, in the same
# order, so the cluster vector lines up with the data frame.
WQ$cluster <- as.factor(kmeans_model$cluster)
# Step 5: Visualize the clusters
# paste0() builds the title without the stray spaces that paste()'s default
# separator inserted ("(k = 3 )").
fviz_cluster(
  kmeans_model,
  data = features_scaled,
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal(),
  main = paste0("K-means Clustering (k = ", optimal_k, ")")
)

# Optional: Summarize the clusters
# Average every numeric variable within each cluster. The summary is limited
# to numeric columns because calling mean() on a non-numeric column only
# yields NA and raises one warning per cluster.
cluster_summary <- WQ %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean))
##PCA##
# Load necessary libraries
library(ggplot2) # For visualization
library(dplyr) # For data manipulation
library(tidyr) # For data manipulation
library(factoextra) # For visualization of PCA results
# Step 1: Data Preparation
# Pull the physicochemical measurements out of WQ; the quality score is not
# included among the PCA inputs.
features <- WQ %>%
  select(all_of(c(
    "fixed.acidity", "volatile.acidity", "citric.acid",
    "residual.sugar", "chlorides", "free.sulfur.dioxide",
    "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol"
  )))
# Incomplete observations are dropped after telling the user.
if (anyNA(features)) {
  cat("Warning: Missing values found in features. Removing rows with NA.\n")
  # Keep only the rows in which every feature is observed.
  features <- na.omit(features)
}
# Standardise each column with scale() defaults so that all variables
# contribute on a comparable scale.
features_scaled <- scale(features)
# Step 2: Perform PCA
# NOTE(review): features_scaled has already been passed through scale(), so
# center/scale. here should be a no-op; they are kept so the call remains
# correct even if it is ever given unscaled data.
pca_result <- prcomp(features_scaled, center = TRUE, scale. = TRUE)
# Step 3: Evaluate PCA Results
# The squared standard deviations of the components are their variances;
# dividing by the total gives the share of variance each component explains.
pca_var <- pca_result$sdev^2
pca_var_explained <- pca_var / sum(pca_var)
# Create a data frame for the variance explained
var_df <- data.frame(
  PC = paste0("PC", seq_along(pca_var_explained)),
  Variance = pca_var_explained
)
# Plot the variance explained by each principal component
ggplot(var_df, aes(x = PC, y = Variance)) +
  geom_col() +
  # A discrete axis sorts character labels alphabetically ("PC1", "PC10",
  # "PC11", "PC2", ...); pin the limits so bars appear in component order.
  scale_x_discrete(limits = var_df$PC) +
  labs(
    title = "Variance Explained by Principal Components",
    x = "Principal Component",
    y = "Proportion of Variance"
  ) +
  theme_minimal()

# Step 4: Visualize PCA Results
# The data-preparation step may drop rows with missing feature values, so the
# PCA scores can have fewer rows than WQ. Restrict the colouring labels to the
# rows that were complete for the PCA variables so both stay aligned.
pca_vars <- rownames(pca_result$rotation)
pca_rows <- complete.cases(WQ[, pca_vars, drop = FALSE])
quality_groups <- WQ$quality_label[pca_rows]
# Fail early with a clear message instead of producing a mis-coloured plot.
stopifnot(
  "quality labels do not line up with the PCA scores" =
    is.null(quality_groups) || length(quality_groups) == nrow(pca_result$x)
)
# Create a biplot
fviz_pca_biplot(
  pca_result,
  geom.ind = "point",
  geom.var = "arrow",
  col.ind = quality_groups, # Color by quality label if available
  legend.title = "Quality Label",
  repel = TRUE
) +
  labs(title = "PCA Biplot")

# Optional: Get PCA loadings
# The rotation matrix has one row per input variable and one column per
# principal component; convert it to a data frame for easier inspection.
loadings <- pca_result$rotation %>%
  as.data.frame()
print(loadings)
## PC1 PC2 PC3 PC4
## fixed.acidity 0.09236898 0.52570491 0.006787771 0.387913461
## volatile.acidity 0.13399181 -0.33565128 0.446002193 0.474563924
## citric.acid -0.24133485 0.52650436 -0.156701307 -0.247102167
## residual.sugar 0.43277584 0.17686051 0.060569657 0.148545369
## chlorides -0.35768234 0.07526146 0.250820615 -0.176317348
## free.sulfur.dioxide 0.42254838 0.07772427 0.246876137 -0.364353068
## total.sulfur.dioxide 0.30909627 0.07605824 0.409546517 -0.534392659
## density -0.24037359 0.34107747 0.414045220 0.219564358
## pH -0.05776045 -0.37268279 -0.055080666 -0.092176948
## sulphates 0.42556996 0.17170812 -0.021441223 0.187249872
## alcohol 0.28935602 0.01690852 -0.553441758 0.001711672
## PC5 PC6 PC7 PC8
## fixed.acidity -0.17378035 0.024931205 0.27645400 0.42795386
## volatile.acidity 0.20879029 -0.218988634 0.42112691 0.06757919
## citric.acid -0.05881385 -0.150988427 0.17733884 0.04700887
## residual.sugar -0.11938978 -0.181780926 -0.30369375 -0.54887929
## chlorides 0.30390702 -0.733313395 -0.17819819 0.03625439
## free.sulfur.dioxide -0.01940729 0.037941341 0.04573332 -0.10606694
## total.sulfur.dioxide -0.03315885 0.022497339 0.30016030 0.19380950
## density -0.33473380 0.006553143 0.09313956 -0.44265534
## pH -0.83118807 -0.342544588 0.03779545 0.14742579
## sulphates -0.01592476 -0.276786043 -0.48247551 0.41451252
## alcohol 0.10716560 -0.402363195 0.50568560 -0.26303003
## PC9 PC10 PC11
## fixed.acidity -0.002067816 -0.425735583 -0.30484723
## volatile.acidity -0.059458836 0.004048655 0.41017017
## citric.acid -0.148939113 0.044723464 0.70344676
## residual.sugar -0.464058321 -0.308232513 0.06735746
## chlorides 0.057163780 -0.209446374 -0.24917423
## free.sulfur.dioxide 0.676974333 -0.339942192 0.18082575
## total.sulfur.dioxide -0.434884633 0.261005864 -0.24260689
## density 0.258362292 0.443203853 -0.16234416
## pH -0.004079270 -0.107163211 0.05802536
## sulphates 0.150327750 0.486983242 0.11254252
## alcohol 0.137385043 0.221599923 -0.20837942