library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
# Load the coffee data set into `dt`.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists. Consider a project-relative path.
dt <- read.csv("/Users/jianingjin/Desktop/IEMS_304/Lab5/coffee_data.csv")
#(2)
# Pair plot of the four flavor-related columns, colored by Coffee_Origin.
# `columns` and `mapping` are named in full so the call does not depend on
# partial argument matching or on positional order.
ggpairs(
  dt,
  mapping = ggplot2::aes(color = Coffee_Origin),
  columns = c(
    "Aroma_level", "Acidity_level", "Flavor_level", "Aftertaste_level"
  )
) +
  ggtitle("Pairplot of Coffee Flavor Features")
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Removed 8 rows containing missing values (`geom_point()`).
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 9 rows containing missing values
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Removed 9 rows containing missing values (`geom_point()`).
## Removed 9 rows containing missing values (`geom_point()`).
## Warning: Removed 9 rows containing non-finite values (`stat_density()`).

#(3)
# Build the analysis table: drop columns 1, 2, 5, 6 and 12 of `dt` by position,
# then keep only the rows without any missing value. The result of subsetting
# a data frame this way is already a data frame, so no further coercion is
# needed.
dt2 <- dt[, -c(1, 2, 5, 6, 12)]
dt2 <- dt2[complete.cases(dt2), ]
# Five-number summary (plus mean) of Est.Price within each Coffee_Origin group.
tapply(X = dt2[["Est.Price"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.030 4.630 5.290 7.092 6.170 90.830
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.400 4.120 4.850 6.312 6.700 57.280
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.99 4.41 5.29 10.71 7.33 875.00
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.810 4.015 4.690 6.663 5.860 89.510
# Summary statistics of Aroma_level within each Coffee_Origin group.
tapply(X = dt2[["Aroma_level"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.000 9.000 9.000 8.818 9.000 10.000
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 8.000 9.000 8.654 9.000 9.000
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 8.000 9.000 8.704 9.000 10.000
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 8.000 9.000 8.572 9.000 10.000
# Summary statistics of Rating within each Coffee_Origin group.
tapply(X = dt2[["Rating"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85.00 92.00 93.00 93.18 94.00 97.00
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85 91 92 92 93 96
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 75.0 91.0 93.0 92.4 94.0 98.0
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.00 91.00 92.00 91.88 93.00 97.00
# Summary statistics of Acidity_level within each Coffee_Origin group.
tapply(X = dt2[["Acidity_level"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 8.00 8.00 8.45 9.00 9.00
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 8.000 8.000 8.307 9.000 9.000
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 8.000 8.000 8.307 9.000 10.000
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 8.000 8.000 8.219 9.000 10.000
# Summary statistics of Flavor_level within each Coffee_Origin group.
tapply(X = dt2[["Flavor_level"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 8.000 9.000 8.583 9.000 10.000
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.000 8.000 9.000 8.516 9.000 10.000
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 8.000 9.000 8.508 9.000 10.000
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 8.000 8.000 8.333 9.000 10.000
# Summary statistics of Aftertaste_level within each Coffee_Origin group.
tapply(
  X = dt2[["Aftertaste_level"]],
  INDEX = dt2[["Coffee_Origin"]],
  FUN = summary
)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.000 9.000 9.000 8.841 9.000 10.000
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 8.000 9.000 8.712 9.000 10.000
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 8.00 9.00 8.76 9.00 10.00
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 8.000 9.000 8.628 9.000 10.000
# Summary statistics of Quality within each Coffee_Origin group.
tapply(X = dt2[["Quality"]], INDEX = dt2[["Coffee_Origin"]], FUN = summary)
## $Africa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 18.00 20.00 19.98 23.00 59.00
##
## $Asia
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 14.00 19.00 18.44 22.00 40.00
##
## $`Central America`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -21.00 16.00 19.00 19.01 22.00 62.00
##
## $Other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 12.00 16.00 16.76 20.00 91.00
#(4)
# Draw a boxplot of one column of `data`, grouped and filled by Coffee_Origin.
#   data:        data frame containing Coffee_Origin and the column `y_var`
#   y_var:       name (string) of the column shown on the y axis
#   title_label: display name of `y_var` used inside the plot title
# Returns the ggplot object; when called at top level it is auto-printed.
plot_origin_boxplot <- function(data, y_var, title_label) {
  ggplot(
    data,
    aes(x = Coffee_Origin, y = .data[[y_var]], fill = Coffee_Origin)
  ) +
    geom_boxplot() +
    # Label the y axis with the plain column name rather than the
    # `.data[[...]]` expression.
    ylab(y_var) +
    ggtitle(paste("Boxplot of", title_label, "by Coffee_Origin"))
}

plot_origin_boxplot(dt2, "Aroma_level", "Aroma Level")

plot_origin_boxplot(dt2, "Rating", "Rating")

plot_origin_boxplot(dt2, "Acidity_level", "Acidity Level")

plot_origin_boxplot(dt2, "Flavor_level", "Flavor Level")

plot_origin_boxplot(dt2, "Aftertaste_level", "Aftertaste Level")

plot_origin_boxplot(dt2, "Est.Price", "Est.Price")

plot_origin_boxplot(dt2, "Quality", "Quality")

# We can see that in the boxplot of aroma level the median is very close to the upper level, and the upper levels of the four regions are nearly the same. The same is true of the boxplots of acidity level and aftertaste level. This shows that the data for those levels are highly concentrated.
#(5)
library(reshape)
# Correlation matrix of the numeric columns of dt2, rounded to 2 decimals.
# Columns are selected by type rather than by position, so the non-numeric
# Coffee_Origin column is excluded wherever it sits in dt2.
dt3 <- dt2[, vapply(dt2, is.numeric, logical(1)), drop = FALSE]
cor1 <- round(cor(dt3), 2)
cor1
## Rating Est.Price Aroma_level Acidity_level Flavor_level
## Rating 1.00 0.12 0.25 0.25 0.29
## Est.Price 0.12 1.00 0.10 0.11 0.07
## Aroma_level 0.25 0.10 1.00 0.48 0.38
## Acidity_level 0.25 0.11 0.48 1.00 0.41
## Flavor_level 0.29 0.07 0.38 0.41 1.00
## Aftertaste_level 0.21 0.09 0.53 0.49 0.32
## Quality 0.00 0.04 0.07 0.04 0.09
## Aftertaste_level Quality
## Rating 0.21 0.00
## Est.Price 0.09 0.04
## Aroma_level 0.53 0.07
## Acidity_level 0.49 0.04
## Flavor_level 0.32 0.09
## Aftertaste_level 1.00 0.03
## Quality 0.03 1.00
# Reshape the correlation matrix into long format: one row per (X1, X2) pair
# of variables, with their correlation in `value`.
# Built with base R instead of reshape::melt(), which raises type.convert()
# 'as.is' warnings on current R. The column names (X1, X2, value) are kept, and
# the factor levels follow the row/column order of cor1.
# expand.grid() varies X1 fastest, matching the column-major order of
# as.vector(cor1).
melt1 <- data.frame(
  expand.grid(
    X1 = rownames(cor1),
    X2 = colnames(cor1),
    KEEP.OUT.ATTRS = FALSE,
    stringsAsFactors = TRUE
  ),
  value = as.vector(cor1)
)
melt1
#(6)
# Heatmap of the pairwise correlations in melt1: one tile per variable pair,
# labelled with its correlation value and shaded from white (low) to red (high).
ggplot(melt1, aes(x = X1, y = X2, fill = value)) +
  geom_tile(color = "black") +
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradient(low = "white", high = "red") +
  # Fixed 1:1 aspect ratio keeps the tiles square.
  coord_fixed() +
  ggtitle("Correlation Heatmap of Numerical Coffee Data")

#(7)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a random forest that predicts Coffee_Origin from all other columns of dt.
# Only rows without missing values are used.
dt <- dt[complete.cases(dt), ]
# A factor response makes randomForest() fit a classification forest.
dt$Coffee_Origin <- factor(dt$Coffee_Origin)
# Fix the RNG seed so the fitted forest and its OOB error are reproducible.
# NOTE(review): the console output recorded after this chunk came from an
# unseeded run, so its exact numbers will differ slightly.
set.seed(304)
rf <- randomForest(
  Coffee_Origin ~ .,
  data = dt,
  importance = TRUE,
  proximity = TRUE
)
rf
##
## Call:
## randomForest(formula = Coffee_Origin ~ ., data = dt, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 25.76%
## Confusion matrix:
## Africa Asia Central America Other class.error
## Africa 589 4 102 85 0.2448718
## Asia 38 22 25 68 0.8562092
## Central America 116 6 417 178 0.4184100
## Other 90 3 88 1286 0.1233810