# Load the wine-quality table from disk and print per-column summary
# statistics.
# NOTE(review): the path is absolute and machine-specific -- confirm it is
# valid wherever this script is run.
WQ <- read.csv(file = "D:/R course/Practicing/Wine Quality/WQ.csv")
summary(object = WQ)
##  fixed.acidity   volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.4000   1st Qu.:0.1100   1st Qu.: 2.000  
##  Median : 8.00   Median :0.5300   Median :0.2000   Median : 2.300  
##  Mean   : 8.67   Mean   :0.5418   Mean   :0.2467   Mean   : 3.699  
##  3rd Qu.: 9.90   3rd Qu.:0.6600   3rd Qu.:0.3850   3rd Qu.: 3.460  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.0000   Max.   :15.990  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01000   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.05600   1st Qu.: 9.00       1st Qu.: 24.00       1st Qu.:0.9953  
##  Median :0.07500   Median :16.00       Median : 42.00       Median :0.9966  
##  Mean   :0.07586   Mean   :20.19       Mean   : 52.62       Mean   :0.9965  
##  3rd Qu.:0.08600   3rd Qu.:27.00       3rd Qu.: 73.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH         sulphates         alcohol         quality     
##  Min.   :2.34   Min.   :0.3300   Min.   : 8.40   Min.   :2.000  
##  1st Qu.:3.18   1st Qu.:0.5600   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.30   Median :0.6500   Median :10.40   Median :6.000  
##  Mean   :3.29   Mean   :0.9495   Mean   :10.67   Mean   :5.638  
##  3rd Qu.:3.42   3rd Qu.:0.8400   3rd Qu.:11.40   3rd Qu.:6.000  
##  Max.   :4.16   Max.   :3.9900   Max.   :15.00   Max.   :9.000
# Distribution of each physicochemical variable, one histogram per column.
# Looping over the column names replaces ten copy-pasted calls and gives each
# plot a readable title and axis label; by default hist() would label them
# with the deparsed expression (e.g. "WQ$fixed.acidity").
hist_columns <- c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates"
)
for (column_name in hist_columns) {
  hist(WQ[[column_name]],
       main = paste("Histogram of", column_name),
       xlab = column_name)
}

# Alcohol: index plot (value against row position) followed by its histogram.
plot(WQ$alcohol, ylab = "alcohol")

hist(WQ$alcohol, main = "Histogram of alcohol", xlab = "alcohol")

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Violin plot of alcohol content. The constant empty-string x aesthetic puts
# every row into a single group, so one violin is drawn.
ggplot(data = WQ, mapping = aes(x = "", y = alcohol)) +
  theme_minimal() +
  geom_violin(trim = FALSE) +
  ggtitle("Violin Plot") +
  ylab("alcohol%")

# Frequency of each quality score.
table(WQ$quality)
## 
##   2   3   4   5   6   7   8   9 
##  44  60  98 735 678 265  59  60
# The table above shows only whole-number scores, so centre one unit-wide bin
# on each score (breaks at the half-integers) instead of relying on hist()'s
# default breaks, which are chosen for continuous data and need not line up
# with the scores.
quality_range <- range(WQ$quality, na.rm = TRUE)
hist(WQ$quality,
     breaks = seq(quality_range[1] - 0.5, quality_range[2] + 0.5, by = 1),
     main = "Histogram of quality",
     xlab = "quality")

library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
# Correlation between fixed acidity and quality.
correlation_data <- WQ[c("fixed.acidity", "quality")]
cor_matrix <- cor(x = correlation_data)

# Echo the 2 x 2 correlation matrix to the console.
print(cor_matrix)
##               fixed.acidity    quality
## fixed.acidity    1.00000000 0.04483084
## quality          0.04483084 1.00000000
# Draw the lower triangle as labelled circles on a red-white-blue scale.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  method = "circle",
  colors = c("red", "white", "blue"),
  lab = TRUE,
  title = "Correlation Matrix of Wine Parameters"
)

# Simple linear regression of quality on fixed acidity.
model <- lm(formula = quality ~ fixed.acidity, data = WQ)

# Print the coefficient table, residual quantiles and fit statistics.
summary(object = model)
## 
## Call:
## lm(formula = quality ~ fixed.acidity, data = WQ)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7969 -0.6260  0.2911  0.4092  3.4544 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.41995    0.11221  48.303   <2e-16 ***
## fixed.acidity  0.02513    0.01253   2.005   0.0451 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.255 on 1997 degrees of freedom
## Multiple R-squared:  0.00201,    Adjusted R-squared:  0.00151 
## F-statistic: 4.022 on 1 and 1997 DF,  p-value: 0.04505
# Scatter of quality against fixed acidity, overlaid with the fitted line
# taken from `model`.
plot(quality ~ fixed.acidity, data = WQ,
     pch = 19, col = "blue",
     xlab = "Fixed Acidity", ylab = "Quality",
     main = "Linear Regression of Quality vs Fixed Acidity")
abline(reg = model, lwd = 2, col = "red")

# Correlation between volatile acidity and quality.
correlation_data <- WQ[c("volatile.acidity", "quality")]
cor_matrix <- cor(x = correlation_data)

# Echo the 2 x 2 correlation matrix to the console.
print(cor_matrix)
##                  volatile.acidity    quality
## volatile.acidity        1.0000000 -0.1688718
## quality                -0.1688718  1.0000000
# Draw the lower triangle as labelled circles on a red-white-blue scale.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  method = "circle",
  colors = c("red", "white", "blue"),
  lab = TRUE,
  title = "Correlation Matrix of Wine Parameters"
)

# Simple linear regression of quality on volatile acidity.
model <- lm(formula = quality ~ volatile.acidity, data = WQ)

# Print the coefficient table, residual quantiles and fit statistics.
summary(object = model)
## 
## Call:
## lm(formula = quality ~ volatile.acidity, data = WQ)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1336 -0.6399  0.0780  0.5012  3.7715 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.27465    0.08766  71.577  < 2e-16 ***
## volatile.acidity -1.17547    0.15353  -7.656 2.96e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.238 on 1997 degrees of freedom
## Multiple R-squared:  0.02852,    Adjusted R-squared:  0.02803 
## F-statistic: 58.62 on 1 and 1997 DF,  p-value: 2.956e-14
# Plot the relationship with the regression line.
# The x-axis must be volatile.acidity -- the predictor `model` was fitted on
# and the variable named in the title and axis label. Plotting fixed.acidity
# here would put the points and the fitted line on different x scales.
plot(WQ$volatile.acidity, WQ$quality,
     main = "Linear Regression of Quality vs Volatile Acidity",
     xlab = "Volatile Acidity", ylab = "Quality", pch = 19, col = "blue")
abline(model, col = "red", lwd = 2)

# Correlation between residual sugar and quality.
correlation_data <- WQ[c("residual.sugar", "quality")]
cor_matrix <- cor(x = correlation_data)

# Echo the 2 x 2 correlation matrix to the console.
print(cor_matrix)
##                residual.sugar      quality
## residual.sugar    1.000000000 -0.007908868
## quality          -0.007908868  1.000000000
# Draw the lower triangle as labelled circles on a red-white-blue scale.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  method = "circle",
  colors = c("red", "white", "blue"),
  lab = TRUE,
  title = "Correlation Matrix of Wine Parameters"
)

# Correlation between pH and quality.
correlation_data <- WQ[c("pH", "quality")]
cor_matrix <- cor(x = correlation_data)

# Lower-triangle square tiles with outlined cells and printed coefficients.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  method = "square",
  hc.order = TRUE,
  outline.color = "black",
  colors = c("#6D9EC1", "white", "#E46726"),
  lab = TRUE,
  lab_size = 3,
  title = "Correlation Matrix of Wine Parameters"
)

# Linear model of quality on pH, kept in `model`; the plot below fits its own
# line through geom_smooth().
model <- lm(formula = quality ~ pH, data = WQ)

# Scatter of quality against pH with a linear fit and its confidence band.
ggplot(data = WQ, mapping = aes(x = pH, y = quality)) +
  geom_point(size = 2, alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  ggtitle("Regression of Wine Quality vs pH") +
  xlab("pH") +
  ylab("Quality") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between pH and fixed acidity.
correlation_data <- WQ[, c("pH", "fixed.acidity")]

# Compute the correlation matrix
cor_matrix <- cor(correlation_data)

# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Regress pH on fixed acidity so the stored model matches the line drawn by
# geom_smooth() below (y = pH, x = fixed.acidity); previously the formula was
# the reverse of the plotted relationship.
model <- lm(pH ~ fixed.acidity, data = WQ)

# Create a regression plot using ggplot2.
# The title now names the variables actually plotted; it had been copied
# unchanged from the quality-vs-pH plot.
ggplot(WQ, aes(x = fixed.acidity, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +  # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Regression line with confidence interval
  labs(title = "Regression of pH vs Fixed Acidity",
       x = "fixed.acidity",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between pH and volatile acidity.
correlation_data <- WQ[, c("pH", "volatile.acidity")]

# Compute the correlation matrix
cor_matrix <- cor(correlation_data)

# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Regress pH on volatile acidity so the stored model matches the line drawn
# by geom_smooth() below (y = pH, x = volatile.acidity); previously the
# formula was the reverse of the plotted relationship.
model <- lm(pH ~ volatile.acidity, data = WQ)

# Create a regression plot using ggplot2.
# The title now names the variables actually plotted; it had been copied
# unchanged from the quality-vs-pH plot.
ggplot(WQ, aes(x = volatile.acidity, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +  # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Regression line with confidence interval
  labs(title = "Regression of pH vs Volatile Acidity",
       x = "volatile.acidity",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between pH and citric acid.
correlation_data <- WQ[, c("pH", "citric.acid")]

# Compute the correlation matrix
cor_matrix <- cor(correlation_data)

# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Regress pH on citric acid so the stored model matches the line drawn by
# geom_smooth() below (y = pH, x = citric.acid); previously the formula was
# the reverse of the plotted relationship.
model <- lm(pH ~ citric.acid, data = WQ)

# Create a regression plot using ggplot2.
# Title follows the "response vs predictor" order used by the other plots.
ggplot(WQ, aes(x = citric.acid, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +  # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Regression line with confidence interval
  labs(title = "Regression of pH vs Citric Acid",
       x = "citric.acid",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between pH and total sulfur dioxide.
correlation_data <- WQ[, c("pH", "total.sulfur.dioxide")]

# Compute the correlation matrix
cor_matrix <- cor(correlation_data)

# Visualize the correlation matrix with improved aesthetics
ggcorrplot(cor_matrix, method = "square", type = "lower",
           lab = TRUE, lab_size = 3,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Correlation Matrix of Wine Parameters",
           hc.order = TRUE, outline.color = "black")

# Regress pH on total sulfur dioxide so the stored model matches the line
# drawn by geom_smooth() below (y = pH, x = total.sulfur.dioxide); previously
# the formula was the reverse of the plotted relationship.
model <- lm(pH ~ total.sulfur.dioxide, data = WQ)

# Create a regression plot using ggplot2.
# The title now names the variables actually plotted; it had been copied
# unchanged from the citric-acid plot.
ggplot(WQ, aes(x = total.sulfur.dioxide, y = pH)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +  # Scatter plot
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Regression line with confidence interval
  labs(title = "Regression of pH vs Total Sulfur Dioxide",
       x = "total.sulfur.dioxide",
       y = "pH") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 12)
  )
## `geom_smooth()` using formula = 'y ~ x'

# Keep the eleven physicochemical measurements (everything except quality).
physicochemical_data <- WQ[c(
  "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
  "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
  "pH", "sulphates", "alcohol"
)]

# Pairwise correlations among those measurements.
cor_matrix <- cor(x = physicochemical_data)

# Lower-triangle heatmap with printed coefficients; hc.order = TRUE asks
# ggcorrplot to reorder the variables by hierarchical clustering.
ggcorrplot(
  corr = cor_matrix,
  type = "lower",
  method = "square",
  hc.order = TRUE,
  outline.color = "black",
  colors = c("#6D9EC1", "white", "#E46726"),
  lab = TRUE,
  lab_size = 3,
  title = "Correlation Heatmap of Physicochemical Properties"
)

# Load necessary libraries
library(caret)       # For classification models and data splitting
## Loading required package: lattice
library(rpart)       # For decision tree models
library(ggplot2)     # For visualization
library(dplyr)       # For data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Coerce quality to numeric. This leaves an already-numeric column unchanged
# and converts the labels if the column was read as factor/character.
WQ$quality <- as.numeric(as.character(WQ$quality))

# Step 1: Create a categorical variable for quality.
# With cut()'s default right-closed intervals the bins are
#   low = (-Inf, 4], fair = (4, 6], high = (6, Inf).
WQ$quality_label <- cut(WQ$quality,
                        breaks = c(-Inf, 4, 6, Inf),
                        labels = c("low", "fair", "high"))

# Step 2: Remove the original quality variable so the label cannot be
# predicted directly from the score it was derived from.
# select() is namespaced because other commonly loaded packages (e.g. MASS)
# export a function of the same name.
# NOTE(review): this overwrites WQ, so the section cannot be re-run without
# reloading the data first.
WQ <- WQ %>% dplyr::select(-quality)

# Step 3: Split the data into training (80%) and test (20%) sets.
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(WQ$quality_label, p = .8, list = FALSE)
train_data <- WQ[trainIndex, ]
test_data  <- WQ[-trainIndex, ]

# Step 4: Multinomial logistic regression (three-class outcome).
# train() refits the model many times, and each fit prints nnet's optimizer
# trace; trace = FALSE is forwarded to the underlying fit to silence that
# output without changing the fitted model.
log_model <- train(quality_label ~ ., data = train_data, method = "multinom",
                   trace = FALSE)
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1179.345238
## iter  20 value 959.696023
## iter  30 value 955.044431
## iter  40 value 950.223456
## iter  50 value 945.359339
## iter  60 value 945.358271
## iter  70 value 945.356702
## iter  80 value 945.304002
## iter  90 value 945.099155
## final  value 945.087567 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1179.504634
## iter  20 value 970.161093
## final  value 968.829778 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1179.345398
## iter  20 value 959.708732
## iter  30 value 955.079834
## iter  40 value 950.883929
## iter  50 value 948.068195
## final  value 948.067451 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1155.916392
## iter  20 value 987.794709
## iter  30 value 980.574048
## iter  40 value 975.573317
## iter  50 value 975.516618
## iter  60 value 975.509137
## iter  70 value 975.499093
## iter  80 value 975.470362
## iter  90 value 975.369347
## final  value 975.360981 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1156.118723
## iter  20 value 995.436928
## final  value 991.247867 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1155.916594
## iter  20 value 987.803506
## iter  30 value 980.599802
## iter  40 value 976.906881
## iter  50 value 976.880072
## iter  60 value 976.876604
## iter  70 value 976.872038
## iter  80 value 976.858364
## final  value 976.851411 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1115.982262
## iter  20 value 940.701273
## iter  30 value 934.306632
## iter  40 value 931.635285
## iter  50 value 930.212457
## iter  60 value 930.206439
## iter  70 value 930.203460
## iter  80 value 930.157438
## iter  90 value 929.888786
## final  value 929.872456 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1116.471512
## iter  20 value 951.258708
## final  value 950.688636 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1115.982752
## iter  20 value 940.714923
## iter  30 value 934.357142
## iter  40 value 932.141486
## iter  50 value 931.404495
## iter  60 value 931.401362
## iter  70 value 931.399696
## iter  80 value 931.385617
## final  value 931.348981 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1230.929650
## iter  20 value 1029.044551
## iter  30 value 1008.395047
## iter  40 value 1007.404648
## iter  50 value 1005.425546
## iter  50 value 1005.425543
## final  value 1005.425543 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1231.050553
## iter  20 value 1039.715889
## final  value 1029.665567 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1230.929771
## iter  20 value 1029.063885
## iter  30 value 1008.441690
## iter  40 value 1007.541027
## iter  50 value 1006.345333
## iter  60 value 1006.343504
## iter  70 value 1006.339722
## iter  80 value 1006.305436
## final  value 1006.200414 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1231.872508
## iter  20 value 958.662196
## iter  30 value 947.285837
## iter  40 value 940.865496
## iter  50 value 937.592781
## final  value 937.591802 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1232.574200
## iter  20 value 969.936117
## final  value 960.716308 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1231.873210
## iter  20 value 958.675504
## iter  30 value 947.314986
## iter  40 value 941.941871
## iter  50 value 940.195315
## iter  60 value 940.194731
## iter  70 value 940.194083
## iter  80 value 940.174657
## final  value 940.159760 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1220.580757
## iter  20 value 1007.058477
## iter  30 value 1003.667283
## iter  40 value 1000.334325
## iter  50 value 999.489845
## iter  50 value 999.489838
## final  value 999.489838 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1220.687005
## iter  20 value 1014.654894
## final  value 1012.496572 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1220.580863
## iter  20 value 1007.067492
## iter  30 value 1003.682686
## iter  40 value 1000.944906
## iter  50 value 1000.510367
## iter  50 value 1000.510364
## final  value 1000.510364 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1127.950862
## iter  20 value 926.171005
## iter  30 value 918.352971
## iter  40 value 916.911024
## iter  50 value 913.474215
## final  value 913.474005 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1128.058182
## iter  20 value 939.890353
## final  value 936.060587 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1127.950969
## iter  20 value 926.187024
## iter  30 value 918.392565
## iter  40 value 917.064728
## iter  50 value 914.795216
## final  value 914.795119 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1218.437883
## iter  20 value 1006.060564
## iter  30 value 992.726011
## iter  40 value 990.175258
## iter  50 value 986.735025
## iter  60 value 986.726393
## iter  70 value 986.712525
## iter  80 value 985.699180
## iter  90 value 984.702100
## final  value 984.670405 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1218.544821
## iter  20 value 1019.448813
## final  value 1013.909425 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1218.437990
## iter  20 value 1006.077477
## iter  30 value 992.773288
## iter  40 value 990.487144
## iter  50 value 988.431705
## iter  60 value 988.427306
## final  value 988.427168 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1290.580074
## iter  20 value 960.900642
## iter  30 value 947.627921
## iter  40 value 943.674039
## final  value 937.750384 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1291.135061
## iter  20 value 974.574094
## final  value 968.398357 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1290.580630
## iter  20 value 960.917677
## iter  30 value 947.688138
## iter  40 value 944.082652
## final  value 940.226029 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1209.228628
## iter  20 value 988.211898
## iter  30 value 975.384715
## iter  40 value 974.882879
## iter  50 value 974.207906
## final  value 974.207761 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1209.342610
## iter  20 value 1001.260482
## final  value 993.753329 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1209.228742
## iter  20 value 988.227681
## iter  30 value 975.436914
## iter  40 value 974.973568
## final  value 974.546607 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1241.245190
## iter  20 value 1030.920495
## iter  30 value 1009.747596
## iter  40 value 1003.266602
## iter  50 value 1003.256622
## final  value 1003.255549 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1241.387300
## iter  20 value 1039.180382
## final  value 1024.416316 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1241.245332
## iter  20 value 1030.930024
## iter  30 value 1009.794535
## iter  40 value 1004.916608
## iter  50 value 1004.912117
## final  value 1004.911754 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1184.670478
## iter  20 value 1003.523226
## iter  30 value 992.653371
## iter  40 value 988.789981
## iter  50 value 986.518543
## iter  60 value 986.515395
## iter  70 value 986.513034
## iter  80 value 986.487078
## iter  90 value 986.261820
## final  value 986.254317 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1184.804893
## iter  20 value 1014.280796
## final  value 1008.011548 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1184.670613
## iter  20 value 1003.535635
## iter  30 value 992.681978
## iter  40 value 989.407666
## iter  50 value 988.152047
## iter  60 value 988.149840
## iter  70 value 988.148250
## iter  80 value 988.142042
## final  value 988.098120 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.174194
## iter  20 value 989.733566
## iter  30 value 963.949520
## iter  40 value 955.249284
## iter  50 value 955.121882
## final  value 955.121372 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.728594
## iter  20 value 998.808745
## final  value 990.696472 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.174749
## iter  20 value 989.744508
## iter  30 value 964.054382
## iter  40 value 957.658068
## iter  50 value 957.612776
## final  value 957.612077 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1237.703267
## iter  20 value 983.830697
## iter  30 value 969.480848
## iter  40 value 965.110332
## iter  50 value 963.635821
## final  value 963.635767 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1238.211473
## iter  20 value 991.462301
## iter  30 value 979.869553
## iter  30 value 979.869551
## iter  30 value 979.869551
## final  value 979.869551 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1237.703776
## iter  20 value 983.839859
## iter  30 value 969.498600
## iter  40 value 965.995581
## iter  50 value 965.211474
## final  value 965.211436 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1284.364942
## iter  20 value 965.422603
## iter  30 value 956.084279
## iter  40 value 954.810086
## final  value 951.689485 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1285.200315
## iter  20 value 977.529413
## iter  30 value 973.688422
## iter  30 value 973.688420
## iter  30 value 973.688420
## final  value 973.688420 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1284.365778
## iter  20 value 965.437682
## iter  30 value 956.119497
## iter  40 value 954.927574
## final  value 952.836008 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1260.723266
## iter  20 value 1052.878317
## iter  30 value 1033.204760
## iter  40 value 1023.032901
## iter  50 value 1022.912284
## iter  60 value 1022.911199
## iter  70 value 1022.907295
## iter  80 value 1022.869242
## iter  90 value 1022.838443
## final  value 1022.825412 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1261.145936
## iter  20 value 1061.763110
## final  value 1042.759317 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1260.723689
## iter  20 value 1052.888860
## iter  30 value 1033.224432
## iter  40 value 1025.580755
## iter  50 value 1025.534621
## final  value 1025.534027 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1202.747242
## iter  20 value 979.008449
## iter  30 value 964.637680
## iter  40 value 963.731819
## final  value 961.654637 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1202.862447
## iter  20 value 994.389765
## final  value 987.080106 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1202.747358
## iter  20 value 979.027886
## iter  30 value 964.687626
## iter  40 value 963.832664
## final  value 962.470618 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1190.313954
## iter  20 value 985.106317
## iter  30 value 963.184658
## iter  40 value 955.989911
## iter  50 value 955.761422
## final  value 955.760947 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1190.505205
## iter  20 value 998.692371
## iter  30 value 983.095161
## iter  30 value 983.095158
## iter  30 value 983.095158
## final  value 983.095158 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1190.314145
## iter  20 value 985.123154
## iter  30 value 963.224938
## iter  40 value 957.936279
## iter  50 value 957.814508
## final  value 957.814153 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1156.282947
## iter  20 value 969.534604
## iter  30 value 960.924879
## iter  40 value 958.200261
## final  value 956.268616 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1156.391335
## iter  20 value 981.525327
## iter  30 value 976.740384
## iter  30 value 976.740382
## iter  30 value 976.740382
## final  value 976.740382 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1156.283055
## iter  20 value 969.549626
## iter  30 value 960.952634
## iter  40 value 958.660223
## final  value 957.490717 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1181.219917
## iter  20 value 1003.237081
## iter  30 value 998.627370
## iter  40 value 993.902875
## iter  50 value 993.481695
## iter  60 value 993.464975
## iter  70 value 993.341095
## iter  80 value 992.712815
## iter  90 value 992.575664
## final  value 992.114412 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1181.307207
## iter  20 value 1012.279856
## final  value 1006.160408 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1181.220004
## iter  20 value 1003.244167
## iter  30 value 998.644056
## iter  40 value 995.144231
## iter  50 value 994.935275
## iter  60 value 994.927338
## iter  70 value 994.869916
## iter  80 value 994.782047
## iter  90 value 994.723054
## final  value 994.694912 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1285.777181
## iter  20 value 1021.299591
## iter  30 value 998.756366
## iter  40 value 991.946702
## final  value 991.885255 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1286.373792
## iter  20 value 1022.881055
## iter  30 value 1015.190277
## iter  30 value 1015.190273
## iter  30 value 1015.190273
## final  value 1015.190273 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1285.777778
## iter  20 value 1021.310640
## iter  30 value 998.811136
## iter  40 value 993.683624
## final  value 993.660333 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1257.553822
## iter  20 value 1008.619575
## iter  30 value 989.666746
## iter  40 value 985.869062
## final  value 980.928322 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1258.042092
## iter  20 value 1021.574368
## final  value 1014.166924 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1257.554311
## iter  20 value 1008.634914
## iter  30 value 989.728879
## iter  40 value 986.311151
## final  value 983.208815 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1228.287652
## iter  20 value 1007.400973
## iter  30 value 981.119251
## iter  40 value 972.995000
## iter  50 value 972.868165
## iter  60 value 972.844336
## iter  70 value 972.810463
## iter  80 value 972.793797
## iter  90 value 972.426311
## final  value 972.391822 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1228.378853
## iter  20 value 1012.810713
## final  value 1003.383205 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1228.287743
## iter  20 value 1007.414017
## iter  30 value 981.206340
## iter  40 value 975.283284
## iter  50 value 975.224803
## iter  60 value 975.214318
## iter  70 value 975.199450
## iter  80 value 975.195100
## final  value 975.142251 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1175.261496
## iter  20 value 994.828143
## iter  30 value 986.645109
## iter  40 value 985.107457
## iter  50 value 982.240569
## iter  50 value 982.240567
## final  value 982.240567 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1175.359996
## iter  20 value 1004.638761
## iter  30 value 1000.621154
## iter  30 value 1000.621151
## iter  30 value 1000.621151
## final  value 1000.621151 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1175.261595
## iter  20 value 994.840346
## iter  30 value 986.671845
## iter  40 value 985.240096
## final  value 983.311901 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.813559
## iter  20 value 982.819059
## iter  30 value 956.542247
## iter  40 value 954.303205
## iter  50 value 949.002187
## iter  60 value 948.999679
## final  value 948.999547 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.925793
## iter  20 value 998.215902
## iter  30 value 991.966150
## iter  30 value 991.966145
## iter  30 value 991.966145
## final  value 991.966145 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1233.813672
## iter  20 value 982.838144
## iter  30 value 956.713616
## iter  40 value 954.577661
## iter  50 value 950.988932
## final  value 950.987483 
## converged
## # weights:  39 (24 variable)
## initial  value 1758.878274 
## iter  10 value 1219.348559
## iter  20 value 1001.559221
## iter  30 value 992.530342
## iter  40 value 987.792883
## final  value 986.327181 
## converged
# Class predictions from the multinomial model for the held-out rows.
log_pred <- predict(object = log_model, newdata = test_data)

# Step 5: Decision Tree Model -- classification tree on all remaining columns.
tree_model <- rpart(formula = quality_label ~ ., data = train_data,
                    method = "class")

# Class predictions from the tree for the same held-out rows.
tree_pred <- predict(object = tree_model, newdata = test_data, type = "class")

# Step 6: Compare each model's predictions with the true test labels
# First the model stored in log_pred
cat("Logistic Regression Confusion Matrix:\n")
## Logistic Regression Confusion Matrix:
confusionMatrix(data = log_pred, reference = test_data$quality_label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction low fair high
##       low    3    2   10
##       fair  24  264   39
##       high  13   16   27
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7387          
##                  95% CI : (0.6926, 0.7812)
##     No Information Rate : 0.7085          
##     P-Value [Acc > NIR] : 0.1014          
##                                           
##                   Kappa : 0.3251          
##                                           
##  Mcnemar's Test P-Value : 2.685e-06       
## 
## Statistics by Class:
## 
##                      Class: low Class: fair Class: high
## Sensitivity            0.075000      0.9362     0.35526
## Specificity            0.966480      0.4569     0.90994
## Pos Pred Value         0.200000      0.8073     0.48214
## Neg Pred Value         0.903394      0.7465     0.85673
## Prevalence             0.100503      0.7085     0.19095
## Detection Rate         0.007538      0.6633     0.06784
## Detection Prevalence   0.037688      0.8216     0.14070
## Balanced Accuracy      0.520740      0.6965     0.63260
# Same evaluation for the decision-tree predictions
cat("\nDecision Tree Confusion Matrix:\n")
## 
## Decision Tree Confusion Matrix:
confusionMatrix(data = tree_pred, reference = test_data$quality_label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction low fair high
##       low    7    4    6
##       fair  12  257   32
##       high  21   21   38
## 
## Overall Statistics
##                                        
##                Accuracy : 0.7588       
##                  95% CI : (0.7137, 0.8)
##     No Information Rate : 0.7085       
##     P-Value [Acc > NIR] : 0.014556     
##                                        
##                   Kappa : 0.4277       
##                                        
##  Mcnemar's Test P-Value : 0.002176     
## 
## Statistics by Class:
## 
##                      Class: low Class: fair Class: high
## Sensitivity             0.17500      0.9113     0.50000
## Specificity             0.97207      0.6207     0.86957
## Pos Pred Value          0.41176      0.8538     0.47500
## Neg Pred Value          0.91339      0.7423     0.88050
## Prevalence              0.10050      0.7085     0.19095
## Detection Rate          0.01759      0.6457     0.09548
## Detection Prevalence    0.04271      0.7563     0.20101
## Balanced Accuracy       0.57353      0.7660     0.68478
# Optional: draw the fitted tree's branches, then add its text labels
plot(x = tree_model)
text(x = tree_model, pretty = 0)

library(dplyr)
library(tidyr)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Step 1: Keep only the eleven physicochemical measurements for clustering
features <- select(
  WQ,
  fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides,
  free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol
)

# Centre and scale each column so no single measurement dominates the
# distance computations used by k-means
features_scaled <- scale(features, center = TRUE, scale = TRUE)

# Step 2: Determine the optimal number of clusters using the elbow method
# kmeans() picks its starting centres at random, so fix the seed to make the
# elbow curve reproducible from run to run.
set.seed(123)

# Total within-cluster sum of squares for k = 1..10. vapply() guarantees
# exactly one numeric value per k (sapply() does not promise a return type).
wss <- vapply(1:10, function(k) {
  kmeans(features_scaled, centers = k, nstart = 10)$tot.withinss
}, numeric(1))

# Plot the elbow method result
# The x values follow the length of wss, and the axis breaks are forced to
# whole numbers: the default continuous breaks would label fractional
# cluster counts such as 2.5.
ggplot(data.frame(Clusters = seq_along(wss), WSS = wss),
       aes(x = Clusters, y = WSS)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq_along(wss)) +
  labs(title = "Elbow Method for Optimal k",
       x = "Number of Clusters",
       y = "Total Within-Cluster Sum of Squares") +
  theme_minimal()

# Step 3: Fit the K-means clustering model with the chosen number of clusters
optimal_k <- 3  # Based on the elbow method

# kmeans() starts from random centres; seeding keeps both the partition and
# the (arbitrary) cluster numbering stable between runs.
set.seed(123)
kmeans_model <- kmeans(features_scaled, centers = optimal_k, nstart = 25)

# Step 4: Add the cluster assignments to the original dataset
# Guard against silent misalignment: there must be one label per row of WQ.
stopifnot(length(kmeans_model$cluster) == nrow(WQ))
WQ$cluster <- as.factor(kmeans_model$cluster)

# Step 5: Visualize the clusters
# paste0() builds the title without the stray space that paste() inserted
# before the closing parenthesis ("(k = 3 )").
fviz_cluster(kmeans_model, data = features_scaled,
             geom = "point", ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main = paste0("K-means Clustering (k = ", optimal_k, ")"))

# Optional: Summarize the clusters
# Average only the numeric columns within each cluster. Calling mean() on a
# non-numeric column returns NA and raises a warning for every group, so
# those columns are left out of the summary.
cluster_summary <- WQ %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean), .groups = "drop")
##PCA##
# Load necessary libraries
library(ggplot2)         # For visualization
library(dplyr)           # For data manipulation
library(tidyr)           # For data manipulation
library(factoextra)      # For visualization of PCA results

# Step 1: Data Preparation
# PCA input: the eleven physicochemical measurements (quality is left out)
features <- select(
  WQ,
  fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides,
  free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol
)

# Report and drop incomplete rows before running PCA
if (anyNA(features)) {
  cat("Warning: Missing values found in features. Removing rows with NA.\n")
  features <- na.omit(features)
}

# Centre and scale every feature column
features_scaled <- scale(features, center = TRUE, scale = TRUE)

# Step 2: Perform PCA
# The input is passed through prcomp()'s own centring and scaling as well
pca_result <- prcomp(x = features_scaled, center = TRUE, scale. = TRUE)

# Step 3: Evaluate PCA Results
# Component variances are the squared standard deviations; dividing by their
# total gives each component's share of the overall variance
pca_var <- (pca_result$sdev)^2
pca_var_explained <- prop.table(pca_var)

# Create a data frame for the variance explained
# PC is stored as a factor whose levels follow component order. Plain
# character labels would be sorted alphabetically on the x axis
# (PC1, PC10, PC11, PC2, ...), scrambling the bars.
pc_labels <- paste0("PC", seq_along(pca_var_explained))
var_df <- data.frame(PC = factor(pc_labels, levels = pc_labels),
                     Variance = pca_var_explained)

# Plot the variance explained by each principal component
ggplot(var_df, aes(x = PC, y = Variance)) +
  geom_col() +
  labs(title = "Variance Explained by Principal Components",
       x = "Principal Component",
       y = "Proportion of Variance") +
  theme_minimal()

# Step 4: Visualize PCA Results
# Rows with missing feature values may have been dropped before the PCA was
# fitted. Keep only the quality labels of rows that are complete in the PCA
# input columns, so the colour vector lines up one-to-one with the plotted
# individuals. When nothing was dropped this selects every row.
pca_rows <- complete.cases(WQ[, rownames(pca_result$rotation), drop = FALSE])

# Create a biplot
fviz_pca_biplot(pca_result,
                geom.ind = "point",
                geom.var = "arrow",
                col.ind = WQ$quality_label[pca_rows],  # Color by quality label if available
                legend.title = "Quality Label",
                repel = TRUE) +
  labs(title = "PCA Biplot")

# Optional: show the rotation matrix (variable loadings per component) as a
# data frame
loadings <- pca_result$rotation %>% as.data.frame()
print(loadings)
##                              PC1         PC2          PC3          PC4
## fixed.acidity         0.09236898  0.52570491  0.006787771  0.387913461
## volatile.acidity      0.13399181 -0.33565128  0.446002193  0.474563924
## citric.acid          -0.24133485  0.52650436 -0.156701307 -0.247102167
## residual.sugar        0.43277584  0.17686051  0.060569657  0.148545369
## chlorides            -0.35768234  0.07526146  0.250820615 -0.176317348
## free.sulfur.dioxide   0.42254838  0.07772427  0.246876137 -0.364353068
## total.sulfur.dioxide  0.30909627  0.07605824  0.409546517 -0.534392659
## density              -0.24037359  0.34107747  0.414045220  0.219564358
## pH                   -0.05776045 -0.37268279 -0.055080666 -0.092176948
## sulphates             0.42556996  0.17170812 -0.021441223  0.187249872
## alcohol               0.28935602  0.01690852 -0.553441758  0.001711672
##                              PC5          PC6         PC7         PC8
## fixed.acidity        -0.17378035  0.024931205  0.27645400  0.42795386
## volatile.acidity      0.20879029 -0.218988634  0.42112691  0.06757919
## citric.acid          -0.05881385 -0.150988427  0.17733884  0.04700887
## residual.sugar       -0.11938978 -0.181780926 -0.30369375 -0.54887929
## chlorides             0.30390702 -0.733313395 -0.17819819  0.03625439
## free.sulfur.dioxide  -0.01940729  0.037941341  0.04573332 -0.10606694
## total.sulfur.dioxide -0.03315885  0.022497339  0.30016030  0.19380950
## density              -0.33473380  0.006553143  0.09313956 -0.44265534
## pH                   -0.83118807 -0.342544588  0.03779545  0.14742579
## sulphates            -0.01592476 -0.276786043 -0.48247551  0.41451252
## alcohol               0.10716560 -0.402363195  0.50568560 -0.26303003
##                               PC9         PC10        PC11
## fixed.acidity        -0.002067816 -0.425735583 -0.30484723
## volatile.acidity     -0.059458836  0.004048655  0.41017017
## citric.acid          -0.148939113  0.044723464  0.70344676
## residual.sugar       -0.464058321 -0.308232513  0.06735746
## chlorides             0.057163780 -0.209446374 -0.24917423
## free.sulfur.dioxide   0.676974333 -0.339942192  0.18082575
## total.sulfur.dioxide -0.434884633  0.261005864 -0.24260689
## density               0.258362292  0.443203853 -0.16234416
## pH                   -0.004079270 -0.107163211  0.05802536
## sulphates             0.150327750  0.486983242  0.11254252
## alcohol               0.137385043  0.221599923 -0.20837942