Coffee Prediction Study

library(GGally)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
# Read the raw coffee data set from an absolute local path.
dt <- read.csv(file = "/Users/jianingjin/Desktop/IEMS_304/Lab5/coffee_data.csv")

#(2) Pair plot of the four flavor features, colored by coffee origin.
# `columns` and `mapping` are written out in full: the previous `column =`
# only worked through partial argument matching, and the aesthetic mapping
# was passed positionally after a named argument.
ggpairs(
  dt,
  mapping = ggplot2::aes(color = Coffee_Origin),
  columns = c(
    "Aroma_level", "Acidity_level", "Flavor_level", "Aftertaste_level"
  )
) +
  ggtitle("Pairplot of Coffee Flavor Features")

## Warning: Removed 8 rows containing non-finite values (`stat_density()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values

## Warning: Removed 8 rows containing missing values (`geom_point()`).

## Warning: Removed 8 rows containing non-finite values (`stat_density()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values

## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Removed 8 rows containing missing values (`geom_point()`).

## Warning: Removed 8 rows containing non-finite values (`stat_density()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values

## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Removed 9 rows containing missing values (`geom_point()`).
## Removed 9 rows containing missing values (`geom_point()`).

## Warning: Removed 9 rows containing non-finite values (`stat_density()`).

#(3) Per-origin summary statistics.
# Keep the rating, origin, price, sensory-level and quality columns, selected
# by name so the result does not depend on the CSV's column positions.
# NOTE(review): this name list replaces dt[-c(1, 2, 5, 6, 12)]; the names and
# their order are taken from the correlation matrix printed further down --
# confirm against the CSV header.
dt2 <- dt[c(
  "Rating", "Coffee_Origin", "Est.Price", "Aroma_level", "Acidity_level",
  "Flavor_level", "Aftertaste_level", "Quality"
)]
# Drop rows with a missing value in any kept column. Subsetting rows of a
# data frame already returns a data frame, so no as.data.frame() is needed.
dt2 <- dt2[complete.cases(dt2), ]
# Summary of the estimated price within each coffee origin.
tapply(dt2$Est.Price, dt2$Coffee_Origin, summary)

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.030   4.630   5.290   7.092   6.170  90.830 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.400   4.120   4.850   6.312   6.700  57.280 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.99    4.41    5.29   10.71    7.33  875.00 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.810   4.015   4.690   6.663   5.860  89.510

# Summary of the aroma level within each coffee origin.
with(dt2, tapply(X = Aroma_level, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.000   9.000   9.000   8.818   9.000  10.000 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   8.000   9.000   8.654   9.000   9.000 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   8.000   9.000   8.704   9.000  10.000 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   8.000   9.000   8.572   9.000  10.000

# Summary of the rating within each coffee origin.
with(dt2, tapply(X = Rating, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85.00   92.00   93.00   93.18   94.00   97.00 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      85      91      92      92      93      96 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    75.0    91.0    93.0    92.4    94.0    98.0 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.00   91.00   92.00   91.88   93.00   97.00

# Summary of the acidity level within each coffee origin.
with(dt2, tapply(X = Acidity_level, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    8.00    8.00    8.45    9.00    9.00 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   8.000   8.000   8.307   9.000   9.000 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   8.000   8.000   8.307   9.000  10.000 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   8.000   8.000   8.219   9.000  10.000

# Summary of the flavor level within each coffee origin.
with(dt2, tapply(X = Flavor_level, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   8.000   9.000   8.583   9.000  10.000 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.000   8.000   9.000   8.516   9.000  10.000 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   8.000   9.000   8.508   9.000  10.000 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   8.000   8.000   8.333   9.000  10.000

# Summary of the aftertaste level within each coffee origin.
with(dt2, tapply(X = Aftertaste_level, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.000   9.000   9.000   8.841   9.000  10.000 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.000   8.000   9.000   8.712   9.000  10.000 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00    8.00    9.00    8.76    9.00   10.00 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   8.000   9.000   8.628   9.000  10.000

# Summary of the quality score within each coffee origin.
with(dt2, tapply(X = Quality, INDEX = Coffee_Origin, FUN = summary))

## $Africa
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   18.00   20.00   19.98   23.00   59.00 
## 
## $Asia
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   14.00   19.00   18.44   22.00   40.00 
## 
## $`Central America`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -21.00   16.00   19.00   19.01   22.00   62.00 
## 
## $Other
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   12.00   16.00   16.76   20.00   91.00

#(4) Boxplots of each numeric variable, grouped by coffee origin.

# Build one boxplot of column `var` of `data` by Coffee_Origin.
#   data  - data frame containing `var` and Coffee_Origin
#   var   - name of the column to plot on the y axis (string)
#   label - text used for the variable in the plot title
# Returns the ggplot object; the caller is responsible for printing it.
origin_boxplot <- function(data, var, label) {
  ggplot(
    data,
    aes(x = Coffee_Origin, y = .data[[var]], fill = Coffee_Origin)
  ) +
    geom_boxplot() +
    ylab(var) + # keep the plain column name as the axis label
    ggtitle(paste("Boxplot of", label, "by Coffee_Origin"))
}

# Column name -> label shown in the title, in the order the plots are drawn.
boxplot_labels <- c(
  Aroma_level = "Aroma Level",
  Rating = "Rating",
  Acidity_level = "Acidity Level",
  Flavor_level = "Flavor Level",
  Aftertaste_level = "Aftertaste Level",
  Est.Price = "Est.Price",
  Quality = "Quality"
)

# Plots are not auto-printed inside a loop, so print each one explicitly.
for (var in names(boxplot_labels)) {
  print(origin_boxplot(dt2, var, boxplot_labels[[var]]))
}

# We can see that in the boxplot of aroma level, the median is very close to the upper level, and the upper levels of the four regions are nearly the same. The same holds for the boxplots of acidity level and aftertaste level. This shows that the data for these levels are highly concentrated.

#(5)
library(reshape)
# Correlation matrix of the numeric variables, rounded to two decimals.
# The categorical origin column is excluded by name rather than by its
# position, so the step keeps working if the column order of dt2 changes.
dt3 <- dt2[, setdiff(names(dt2), "Coffee_Origin"), drop = FALSE]
cor1 <- round(cor(dt3), 2)
cor1

##                  Rating Est.Price Aroma_level Acidity_level Flavor_level
## Rating             1.00      0.12        0.25          0.25         0.29
## Est.Price          0.12      1.00        0.10          0.11         0.07
## Aroma_level        0.25      0.10        1.00          0.48         0.38
## Acidity_level      0.25      0.11        0.48          1.00         0.41
## Flavor_level       0.29      0.07        0.38          0.41         1.00
## Aftertaste_level   0.21      0.09        0.53          0.49         0.32
## Quality            0.00      0.04        0.07          0.04         0.09
##                  Aftertaste_level Quality
## Rating                       0.21    0.00
## Est.Price                    0.09    0.04
## Aroma_level                  0.53    0.07
## Acidity_level                0.49    0.04
## Flavor_level                 0.32    0.09
## Aftertaste_level             1.00    0.03
## Quality                      0.03    1.00

# Reshape the correlation matrix with melt(); the result provides the X1, X2
# and value columns that the heatmap below maps to x, y and fill.
melt1 <- melt(cor1)

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

# Print the reshaped data.
melt1

#(6) Heatmap of the pairwise correlations, with each value printed in its tile.
# Fill runs from white (low) to red (high); coord_fixed() keeps tiles square.
ggplot(melt1, aes(x = X1, y = X2, fill = value)) +
  geom_tile(color = "black") +
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradient(low = "white", high = "red") +
  coord_fixed() +
  ggtitle("Correlation Heatmap of Numerical Coffee Data")

#(7)
library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Fit a random forest that classifies coffee origin from all other columns.
# Rows with a missing value in any column are dropped first.
dt <- dt[complete.cases(dt), ]
# The response is converted to a factor so the forest is fit as a classifier.
dt$Coffee_Origin <- factor(dt$Coffee_Origin)
# Fix the RNG seed so the bootstrap samples, and therefore the OOB error
# estimate and confusion matrix, are reproducible between runs.
set.seed(304)
rf <- randomForest(
  Coffee_Origin ~ .,
  data = dt,
  importance = TRUE,
  proximity = TRUE
)
rf

## 
## Call:
##  randomForest(formula = Coffee_Origin ~ ., data = dt, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 25.76%
## Confusion matrix:
##                 Africa Asia Central America Other class.error
## Africa             589    4             102    85   0.2448718
## Asia                38   22              25    68   0.8562092
## Central America    116    6             417   178   0.4184100
## Other               90    3              88  1286   0.1233810

Coffee Prediction Study

Jianing

2023-02-20