I decided to analyse the arabica coffees statistics scraped from the Coffee Quality Institute dataset and published by James LeDoux on https://github.com/jldbc/coffee-quality-database.
Coffee Quality Institute runs a Q Program, which identifies quality coffees across the World. One of the target of program is possibility of comparising coffees from other continents.
Let’s look on the data:
glimpse(coffee_full)
## Rows: 1,311
## Columns: 44
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ Species <chr> "Arabica", "Arabica", "Arabica", "Arabica", "...
## $ Owner <chr> "metad plc", "metad plc", "grounds for health...
## $ Country.of.Origin <chr> "Ethiopia", "Ethiopia", "Guatemala", "Ethiopi...
## $ Farm.Name <chr> "metad plc", "metad plc", "san marcos barranc...
## $ Lot.Number <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ Mill <chr> "metad plc", "metad plc", NA, "wolensu", "met...
## $ ICO.Number <chr> "2014/2015", "2014/2015", NA, NA, "2014/2015"...
## $ Company <chr> "metad agricultural developmet plc", "metad a...
## $ Altitude <chr> "1950-2200", "1950-2200", "1600 - 1800 m", "1...
## $ Region <chr> "guji-hambela", "guji-hambela", NA, "oromia",...
## $ Producer <chr> "METAD PLC", "METAD PLC", NA, "Yidnekachew Da...
## $ Number.of.Bags <int> 300, 300, 5, 320, 300, 100, 100, 300, 300, 50...
## $ Bag.Weight <chr> "60 kg", "60 kg", "1", "60 kg", "60 kg", "30 ...
## $ In.Country.Partner <chr> "METAD Agricultural Development plc", "METAD ...
## $ Harvest.Year <chr> "2014", "2014", NA, "2014", "2014", "2013", "...
## $ Grading.Date <chr> "April 4th, 2015", "April 4th, 2015", "May 31...
## $ Owner.1 <chr> "metad plc", "metad plc", "Grounds for Health...
## $ Variety <chr> NA, "Other", "Bourbon", NA, "Other", NA, "Oth...
## $ Processing.Method <chr> "Washed / Wet", "Washed / Wet", NA, "Natural ...
## $ Aroma <dbl> 8.67, 8.75, 8.42, 8.17, 8.25, 8.58, 8.42, 8.2...
## $ Flavor <dbl> 8.83, 8.67, 8.50, 8.58, 8.50, 8.42, 8.50, 8.3...
## $ Aftertaste <dbl> 8.67, 8.50, 8.42, 8.42, 8.25, 8.42, 8.33, 8.5...
## $ Acidity <dbl> 8.75, 8.58, 8.42, 8.42, 8.50, 8.50, 8.50, 8.4...
## $ Body <dbl> 8.50, 8.42, 8.33, 8.50, 8.42, 8.25, 8.25, 8.3...
## $ Balance <dbl> 8.42, 8.42, 8.42, 8.25, 8.33, 8.33, 8.25, 8.5...
## $ Uniformity <dbl> 10.00, 10.00, 10.00, 10.00, 10.00, 10.00, 10....
## $ Clean.Cup <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...
## $ Sweetness <dbl> 10.00, 10.00, 10.00, 10.00, 10.00, 10.00, 10....
## $ Cupper.Points <dbl> 8.75, 8.58, 9.25, 8.67, 8.58, 8.33, 8.50, 9.0...
## $ Total.Cup.Points <dbl> 90.58, 89.92, 89.75, 89.00, 88.83, 88.83, 88....
## $ Moisture <dbl> 0.12, 0.12, 0.00, 0.11, 0.12, 0.11, 0.11, 0.0...
## $ Category.One.Defects <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Quakers <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Color <chr> "Green", "Green", NA, "Green", "Green", "Blui...
## $ Category.Two.Defects <int> 0, 1, 0, 2, 2, 1, 0, 0, 0, 4, 1, 0, 0, 2, 2, ...
## $ Expiration <chr> "April 3rd, 2016", "April 3rd, 2016", "May 31...
## $ Certification.Body <chr> "METAD Agricultural Development plc", "METAD ...
## $ Certification.Address <chr> "cfa68390e26ad44f91f305dd64b6644b49d9b237", "...
## $ Certification.Contact <chr> "19fef5a731de2db57d16da10287413f5f99bc2dd", "...
## $ unit_of_measurement <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", ...
## $ altitude_low_meters <dbl> 1950.0, 1950.0, 1600.0, 1800.0, 1950.0, NA, N...
## $ altitude_high_meters <dbl> 2200.0, 2200.0, 1800.0, 2200.0, 2200.0, NA, N...
## $ altitude_mean_meters <dbl> 2075.0, 2075.0, 1700.0, 2000.0, 2075.0, NA, N...
Data contain 1311 coffees with 44 attributes. Some of them are IDs, names, and small regions (unique almost), but most are numeric and categorical variables describing specific parameters quality and parameters of production.
I create coffee dataset without clumsy data like row numbers, Lot.Number, unique codes and columns with the same information.
coffee <- coffee_full
coffee[,c("X","Species","Certification.Address","Certification.Contact",
"Lot.Number","Mill","ICO.Number","Expiration","Grading.Date",
"Company","Farm.Name","Owner","Producer","Owner.1",
"In.Country.Partner","Region","Altitude","Certification.Body",
"Harvest.Year")] <- NULL
Now let’s look on the missing values:
colSums(is.na(coffee)) %>%
sort()
## Number.of.Bags Bag.Weight Aroma
## 0 0 0
## Flavor Aftertaste Acidity
## 0 0 0
## Body Balance Uniformity
## 0 0 0
## Clean.Cup Sweetness Cupper.Points
## 0 0 0
## Total.Cup.Points Moisture Category.One.Defects
## 0 0 0
## Category.Two.Defects unit_of_measurement Country.of.Origin
## 0 0 1
## Quakers Processing.Method Variety
## 1 152 201
## Color altitude_low_meters altitude_high_meters
## 216 227 227
## altitude_mean_meters
## 227
coffee %>%
md.pattern(rotate.names = TRUE)
## Number.of.Bags Bag.Weight Aroma Flavor Aftertaste Acidity Body Balance
## 904 1 1 1 1 1 1 1 1
## 93 1 1 1 1 1 1 1 1
## 74 1 1 1 1 1 1 1 1
## 14 1 1 1 1 1 1 1 1
## 24 1 1 1 1 1 1 1 1
## 41 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 12 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1 1
## 13 1 1 1 1 1 1 1 1
## 51 1 1 1 1 1 1 1 1
## 50 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## Uniformity Clean.Cup Sweetness Cupper.Points Total.Cup.Points Moisture
## 904 1 1 1 1 1 1
## 93 1 1 1 1 1 1
## 74 1 1 1 1 1 1
## 14 1 1 1 1 1 1
## 24 1 1 1 1 1 1
## 41 1 1 1 1 1 1
## 2 1 1 1 1 1 1
## 6 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 3 1 1 1 1 1 1
## 12 1 1 1 1 1 1
## 6 1 1 1 1 1 1
## 13 1 1 1 1 1 1
## 13 1 1 1 1 1 1
## 51 1 1 1 1 1 1
## 50 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## Category.One.Defects Category.Two.Defects unit_of_measurement
## 904 1 1 1
## 93 1 1 1
## 74 1 1 1
## 14 1 1 1
## 24 1 1 1
## 41 1 1 1
## 2 1 1 1
## 6 1 1 1
## 3 1 1 1
## 3 1 1 1
## 12 1 1 1
## 6 1 1 1
## 13 1 1 1
## 13 1 1 1
## 51 1 1 1
## 50 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## Country.of.Origin Quakers Processing.Method Variety Color
## 904 1 1 1 1 1
## 93 1 1 1 1 1
## 74 1 1 1 1 0
## 14 1 1 1 1 0
## 24 1 1 1 0 1
## 41 1 1 1 0 1
## 2 1 1 1 0 0
## 6 1 1 1 0 0
## 3 1 1 0 1 1
## 3 1 1 0 1 1
## 12 1 1 0 1 0
## 6 1 1 0 1 0
## 13 1 1 0 0 1
## 13 1 1 0 0 1
## 51 1 1 0 0 0
## 50 1 1 0 0 0
## 1 1 0 1 1 1
## 1 0 1 0 0 0
## 1 1 152 201 216
## altitude_low_meters altitude_high_meters altitude_mean_meters
## 904 1 1 1 0
## 93 0 0 0 3
## 74 1 1 1 1
## 14 0 0 0 4
## 24 1 1 1 1
## 41 0 0 0 4
## 2 1 1 1 2
## 6 0 0 0 5
## 3 1 1 1 1
## 3 0 0 0 4
## 12 1 1 1 2
## 6 0 0 0 5
## 13 1 1 1 2
## 13 0 0 0 5
## 51 1 1 1 3
## 50 0 0 0 6
## 1 1 1 1 1
## 1 0 0 0 7
## 227 227 227 1252
There are 1 NA in Quakers column and 227 NAs in each altitude column.
Dealing with missing value in Quakers column (I replace the missing value with the median):
table(coffee$Quakers, useNA = "ifany")
##
## 0 1 2 3 4 5 6 7 8 9 11 <NA>
## 1216 39 30 5 5 5 4 3 1 1 1 1
coffee$Quakers[is.na(coffee$Quakers)] <- median(coffee$Quakers, na.rm=TRUE)
Dealing with missing values in altitudes columns:
ggplot(coffee,
aes(x = altitude_low_meters)) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
There are 227 out of 1311 rows with NA altitudes.There are also 4 clear outliers - let’s see them:
which(coffee$altitude_low_meters>=11000)
## [1] 544 897 1041 1145
which(coffee$altitude_high_meters>=11000)
## [1] 544 897 1041 1145
which(coffee$altitude_mean_meters>=11000)
## [1] 544 897 1041 1145
They are the same rows.
I assign outliers to the altitude_outliers vector:
altitude_outliers <- which(coffee$altitude_low_meters>=11000)
coffee[altitude_outliers,]
## Country.of.Origin Number.of.Bags Bag.Weight Variety
## 544 Brazil 300 2 kg Moka Peaberry
## 897 Guatemala 25 69 kg Bourbon
## 1041 Nicaragua 275 69 kg Other
## 1145 Guatemala 25 69 kg Bourbon
## Processing.Method Aroma Flavor Aftertaste Acidity Body Balance
## 544 Semi-washed / Semi-pulped 7.08 7.50 7.50 7.83 7.75 7.67
## 897 Washed / Wet 7.42 7.42 7.08 7.50 7.42 7.33
## 1041 Washed / Wet 7.25 7.25 7.17 7.25 7.33 7.25
## 1145 Washed / Wet 7.50 7.42 7.25 7.58 7.33 7.42
## Uniformity Clean.Cup Sweetness Cupper.Points Total.Cup.Points Moisture
## 544 10.00 10.00 10.00 7.58 82.92 0.11
## 897 10.00 10.00 10.00 7.42 81.58 0.12
## 1041 10.00 10.00 10.00 7.25 80.75 0.12
## 1145 9.33 9.33 9.33 7.25 79.75 0.10
## Category.One.Defects Quakers Color Category.Two.Defects
## 544 0 0 Green 16
## 897 0 0 Green 0
## 1041 0 0 Green 5
## 1145 0 4 Green 1
## unit_of_measurement altitude_low_meters altitude_high_meters
## 544 m 11000 11000
## 897 m 190164 190164
## 1041 m 110000 110000
## 1145 m 190164 190164
## altitude_mean_meters
## 544 11000
## 897 190164
## 1041 110000
## 1145 190164
Interesting information - Geisha coffee is one of the best and most expensive coffee in the world. It is grown at high mountains. There is a farm Café Takesi in Bolivia, that claims to be the world’s highest coffee farm, at an elevation of 1,900 to 2,500 meters above sea level.
For us, it means that we should probably treat each value above 2500m as an outlier. There are values very close to 2500m, so I will cut every value above 2650m.
altitude_outliers <- which(coffee$altitude_low_meters>=2650)
altitude_outliers <- which(coffee$altitude_high_meters>=2650)
altitude_outliers <- which(coffee$altitude_mean_meters>=2650)
coffee[altitude_outliers,]
## Country.of.Origin Number.of.Bags Bag.Weight Variety
## 216 Guatemala 130 69 kg Bourbon
## 544 Brazil 300 2 kg Moka Peaberry
## 629 Colombia 1 1 kg <NA>
## 838 Guatemala 130 69 kg Bourbon
## 841 Myanmar 1 2 kg Catuai
## 897 Guatemala 25 69 kg Bourbon
## 1002 Guatemala 120 69 kg Bourbon
## 1039 Myanmar 1 2 kg Catuai
## 1041 Nicaragua 275 69 kg Other
## 1074 Myanmar 1 2 kg Other
## 1099 Myanmar 1 2 kg Catuai
## 1124 Myanmar 1 2 kg Other
## 1145 Guatemala 25 69 kg Bourbon
## 1270 Indonesia 15 2 kg Other
## Processing.Method Aroma Flavor Aftertaste Acidity Body Balance
## 216 Washed / Wet 7.58 7.83 7.58 7.83 7.83 7.67
## 544 Semi-washed / Semi-pulped 7.08 7.50 7.50 7.83 7.75 7.67
## 629 <NA> 7.33 7.58 7.42 7.42 7.67 7.67
## 838 Washed / Wet 7.58 7.50 7.33 7.42 7.58 7.25
## 841 Washed / Wet 7.33 7.58 7.50 7.42 7.33 7.33
## 897 Washed / Wet 7.42 7.42 7.08 7.50 7.42 7.33
## 1002 Washed / Wet 7.42 7.25 7.17 7.50 7.25 7.17
## 1039 Washed / Wet 6.92 7.50 7.00 7.58 7.50 7.08
## 1041 Washed / Wet 7.25 7.25 7.17 7.25 7.33 7.25
## 1074 Washed / Wet 7.17 7.33 7.17 7.42 7.25 7.08
## 1099 Natural / Dry 7.42 7.00 7.08 7.00 7.17 7.33
## 1124 Washed / Wet 7.17 7.33 7.00 7.42 7.17 7.00
## 1145 Washed / Wet 7.50 7.42 7.25 7.58 7.33 7.42
## 1270 Natural / Dry 7.33 7.00 6.50 6.08 7.58 6.33
## Uniformity Clean.Cup Sweetness Cupper.Points Total.Cup.Points Moisture
## 216 10.00 10.00 10.00 7.83 84.17 0.10
## 544 10.00 10.00 10.00 7.58 82.92 0.11
## 629 10.00 10.00 10.00 7.58 82.67 0.11
## 838 10.00 10.00 10.00 7.17 81.83 0.10
## 841 10.00 10.00 10.00 7.33 81.83 0.00
## 897 10.00 10.00 10.00 7.42 81.58 0.12
## 1002 10.00 10.00 10.00 7.25 81.00 0.09
## 1039 10.00 10.00 10.00 7.17 80.75 0.00
## 1041 10.00 10.00 10.00 7.25 80.75 0.12
## 1074 10.00 10.00 10.00 7.08 80.50 0.00
## 1099 10.00 10.00 10.00 7.25 80.25 0.00
## 1124 10.00 10.00 10.00 6.92 80.00 0.00
## 1145 9.33 9.33 9.33 7.25 79.75 0.10
## 1270 9.33 10.00 9.33 6.67 76.17 0.12
## Category.One.Defects Quakers Color Category.Two.Defects
## 216 0 0 Green 2
## 544 0 0 Green 16
## 629 0 0 Green 0
## 838 2 0 Green 6
## 841 0 0 Green 2
## 897 0 0 Green 0
## 1002 0 0 Green 8
## 1039 0 0 Green 2
## 1041 0 0 Green 5
## 1074 0 0 Green 2
## 1099 0 0 Green 1
## 1124 0 0 Green 4
## 1145 0 4 Green 1
## 1270 4 0 None 26
## unit_of_measurement altitude_low_meters altitude_high_meters
## 216 m 3280 3280
## 544 m 11000 11000
## 629 m 1800 5900
## 838 m 3280 3280
## 841 m 4001 4001
## 897 m 190164 190164
## 1002 m 3280 3280
## 1039 m 3825 3825
## 1041 m 110000 110000
## 1074 m 3800 3800
## 1099 m 4287 4287
## 1124 m 3845 3845
## 1145 m 190164 190164
## 1270 m 3500 3500
## altitude_mean_meters
## 216 3280
## 544 11000
## 629 3850
## 838 3280
## 841 4001
## 897 190164
## 1002 3280
## 1039 3825
## 1041 110000
## 1074 3800
## 1099 4287
## 1124 3845
## 1145 190164
## 1270 3500
The rest of the data containing altitude outliers looks reliably. Thus, I will group outliers together with NAs - there is no possibility of coffee growing at such high altitudes.
coffee[altitude_outliers,"altitude_low_meters"] <- NA
coffee[altitude_outliers,"altitude_high_meters"] <- NA
coffee[altitude_outliers,"altitude_mean_meters"] <- NA
Now I try to deal with NAs using multiple methods replace missings for altitudes with sample average:
coffee <- coffee %>%
mutate(altitude_low_meters.impute_mean = ifelse(is.na(altitude_low_meters),
mean(altitude_low_meters,
na.rm = TRUE),
altitude_low_meters),
altitude_high_meters.impute_mean = ifelse(is.na(altitude_high_meters),
mean(altitude_high_meters,
na.rm = TRUE),
altitude_high_meters),
altitude_mean_meters.impute_mean = ifelse(is.na(altitude_mean_meters),
mean(altitude_mean_meters,
na.rm = TRUE),
altitude_mean_meters)
)
coffee <- coffee %>%
mutate(altitude_low_meters.impute_median = ifelse(is.na(altitude_low_meters),
median(altitude_low_meters,
na.rm = TRUE),
altitude_low_meters),
altitude_high_meters.impute_median = ifelse(is.na(altitude_high_meters),
median(altitude_high_meters,
na.rm = TRUE),
altitude_high_meters),
altitude_mean_meters.impute_median = ifelse(is.na(altitude_mean_meters),
median(altitude_mean_meters,
na.rm = TRUE),
altitude_mean_meters)
)
coffee <- coffee %>%
group_by(Country.of.Origin) %>%
mutate(altitude_low_meters.impute.Gmean = ifelse(is.na(altitude_low_meters),
mean(altitude_low_meters,
na.rm = TRUE),
altitude_low_meters),
altitude_low_meters.impute.Gmed = ifelse(is.na(altitude_low_meters),
median(altitude_low_meters,
na.rm = TRUE),
altitude_low_meters),
altitude_high_meters.impute.Gmean = ifelse(is.na(altitude_high_meters),
median(altitude_high_meters,
na.rm = TRUE),
altitude_high_meters),
altitude_high_meters.impute.Gmed = ifelse(is.na(altitude_high_meters),
mean(altitude_high_meters,
na.rm = TRUE),
altitude_high_meters),
altitude_mean_meters.impute.Gmean = ifelse(is.na(altitude_mean_meters),
median(altitude_mean_meters,
na.rm = TRUE),
altitude_mean_meters),
altitude_mean_meters.impute.Gmed = ifelse(is.na(altitude_mean_meters),
mean(altitude_mean_meters,
na.rm = TRUE),
altitude_mean_meters)
) %>% ungroup()
coffee %>%
dplyr::select(starts_with("altitude")) %>%
summary()
## altitude_low_meters altitude_high_meters altitude_mean_meters
## Min. : 1 Min. : 1 Min. : 1
## 1st Qu.:1100 1st Qu.:1100 1st Qu.:1100
## Median :1311 Median :1350 Median :1311
## Mean :1281 Mean :1328 Mean :1304
## 3rd Qu.:1550 3rd Qu.:1600 3rd Qu.:1600
## Max. :2560 Max. :2560 Max. :2560
## NA's :241 NA's :241 NA's :241
## altitude_low_meters.impute_mean altitude_high_meters.impute_mean
## Min. : 1 Min. : 1
## 1st Qu.:1200 1st Qu.:1200
## Median :1281 Median :1328
## Mean :1281 Mean :1328
## 3rd Qu.:1500 3rd Qu.:1550
## Max. :2560 Max. :2560
##
## altitude_mean_meters.impute_mean altitude_low_meters.impute_median
## Min. : 1 Min. : 1
## 1st Qu.:1200 1st Qu.:1200
## Median :1304 Median :1311
## Mean :1304 Mean :1287
## 3rd Qu.:1524 3rd Qu.:1500
## Max. :2560 Max. :2560
##
## altitude_high_meters.impute_median altitude_mean_meters.impute_median
## Min. : 1 Min. : 1
## 1st Qu.:1200 1st Qu.:1200
## Median :1350 Median :1311
## Mean :1332 Mean :1306
## 3rd Qu.:1550 3rd Qu.:1524
## Max. :2560 Max. :2560
##
## altitude_low_meters.impute.Gmean altitude_low_meters.impute.Gmed
## Min. : 1 Min. : 1
## 1st Qu.:1000 1st Qu.:1000
## Median :1300 Median :1300
## Mean :1240 Mean :1243
## 3rd Qu.:1520 3rd Qu.:1550
## Max. :2560 Max. :2560
## NA's :2 NA's :2
## altitude_high_meters.impute.Gmean altitude_high_meters.impute.Gmed
## Min. : 1 Min. : 1
## 1st Qu.:1040 1st Qu.:1067
## Median :1311 Median :1311
## Mean :1291 Mean :1287
## 3rd Qu.:1600 3rd Qu.:1600
## Max. :2560 Max. :2560
## NA's :2 NA's :2
## altitude_mean_meters.impute.Gmean altitude_mean_meters.impute.Gmed
## Min. : 1 Min. : 1
## 1st Qu.:1000 1st Qu.:1050
## Median :1300 Median :1311
## Mean :1269 Mean :1264
## 3rd Qu.:1600 3rd Qu.:1570
## Max. :2560 Max. :2560
## NA's :2 NA's :2
Impute_median seems to be the closest imputations to the original dataset means and medians in subgroups do not give better results.Thus, since now we will use altitude_high_meters.impute_median, altitude_low_meters.impute_median and altitude_mean_meters.impute_median.
coffee$altitude_low_meters.impute.Gmean <- NULL
coffee$altitude_mean_meters.impute.Gmean <- NULL
coffee$altitude_high_meters.impute.Gmean <- NULL
coffee$altitude_low_meters.impute.Gmed <- NULL
coffee$altitude_high_meters.impute.Gmed <- NULL
coffee$altitude_mean_meters.impute.Gmed <- NULL
NAs in Factor columns:
colSums(is.na(coffee)) %>%
sort()
## Number.of.Bags Bag.Weight
## 0 0
## Aroma Flavor
## 0 0
## Aftertaste Acidity
## 0 0
## Body Balance
## 0 0
## Uniformity Clean.Cup
## 0 0
## Sweetness Cupper.Points
## 0 0
## Total.Cup.Points Moisture
## 0 0
## Category.One.Defects Quakers
## 0 0
## Category.Two.Defects unit_of_measurement
## 0 0
## altitude_low_meters.impute_mean altitude_high_meters.impute_mean
## 0 0
## altitude_mean_meters.impute_mean altitude_low_meters.impute_median
## 0 0
## altitude_high_meters.impute_median altitude_mean_meters.impute_median
## 0 0
## Country.of.Origin Processing.Method
## 1 152
## Variety Color
## 201 216
## altitude_low_meters altitude_high_meters
## 241 241
## altitude_mean_meters
## 241
Let’s look on the Country.of.Origin:
table(coffee$Country.of.Origin)
##
## Brazil Burundi
## 132 2
## China Colombia
## 16 183
## Costa Rica Cote d?Ivoire
## 51 1
## Ecuador El Salvador
## 1 21
## Ethiopia Guatemala
## 44 181
## Haiti Honduras
## 6 53
## India Indonesia
## 1 20
## Japan Kenya
## 1 25
## Laos Malawi
## 3 11
## Mauritius Mexico
## 1 236
## Myanmar Nicaragua
## 8 26
## Panama Papua New Guinea
## 4 1
## Peru Philippines
## 10 5
## Rwanda Taiwan
## 1 75
## Tanzania, United Republic Of Thailand
## 40 32
## Uganda United States
## 26 8
## United States (Hawaii) United States (Puerto Rico)
## 73 4
## Vietnam Zambia
## 7 1
There is one coffee without name of the Country of origin. I use mode to input the name:
coffee <- coffee %>%
dplyr::mutate(Country.of.Origin.impute.Gmode = if_else(is.na(Country.of.Origin),
Mode(Country.of.Origin, na.rm = TRUE),
Country.of.Origin))
Colombia assigned to NA. Only one observation was changed, so the distribution of Country.of.Origin.impute.Gmode is very close to the the distribution of Country.of.Origin. Thus, I will use Country.of.Origin.impute.Gmode in the future analysis.
Maybe the region will be more helpful that country, so I create Region column:
table(coffee$Country.of.Origin.impute.Gmode)
##
## Brazil Burundi
## 132 2
## China Colombia
## 16 183
## Costa Rica Cote d?Ivoire
## 51 1
## Ecuador El Salvador
## 1 21
## Ethiopia Guatemala
## 44 181
## Haiti Honduras
## 6 53
## India Indonesia
## 1 20
## Japan Kenya
## 1 25
## Laos Malawi
## 3 11
## Mauritius Mexico
## 1 237
## Myanmar Nicaragua
## 8 26
## Panama Papua New Guinea
## 4 1
## Peru Philippines
## 10 5
## Rwanda Taiwan
## 1 75
## Tanzania, United Republic Of Thailand
## 40 32
## Uganda United States
## 26 8
## United States (Hawaii) United States (Puerto Rico)
## 73 4
## Vietnam Zambia
## 7 1
coffee[,"Region"] <- NA
coffee$Region <- as.character(coffee$Region)
coffee$Country.of.Origin.impute.Gmode <-
as.character(coffee$Country.of.Origin.impute.Gmode)
#East Africa group
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Zambia"|
coffee$Country.of.Origin.impute.Gmode=="Uganda"|
coffee$Country.of.Origin.impute.Gmode=="Burundi"|
coffee$Country.of.Origin.impute.Gmode=="Rwanda"|
coffee$Country.of.Origin.impute.Gmode=="Malawi"|
coffee$Country.of.Origin.impute.Gmode=="Papua New Guinea"|
coffee$Country.of.Origin.impute.Gmode=="Mauritius"|
coffee$Country.of.Origin.impute.Gmode=="Ethiopia"|
coffee$Country.of.Origin.impute.Gmode=="Kenya"
),
"Region"] <- "East Africa"
#West Africa
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Tanzania, United Republic Of"|
coffee$Country.of.Origin.impute.Gmode=="Cote d?Ivoire"
),
"Region"] <- "West Africa"
#Asia and Oceania
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Japan"|
coffee$Country.of.Origin.impute.Gmode=="Thailand"|
coffee$Country.of.Origin.impute.Gmode=="Vietnam"|
coffee$Country.of.Origin.impute.Gmode=="Myanmar"|
coffee$Country.of.Origin.impute.Gmode=="Philippines"|
coffee$Country.of.Origin.impute.Gmode=="Laos"|
coffee$Country.of.Origin.impute.Gmode=="China"|
coffee$Country.of.Origin.impute.Gmode=="India"|
coffee$Country.of.Origin.impute.Gmode=="Taiwan"|
coffee$Country.of.Origin.impute.Gmode=="Indonesia"|
coffee$Country.of.Origin.impute.Gmode=="Papua New Guinea"),
"Region"] <- "Asia and Oceania"
#Central America
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Costa Rica"|
coffee$Country.of.Origin.impute.Gmode=="Guatemala"|
coffee$Country.of.Origin.impute.Gmode=="Haiti"|
coffee$Country.of.Origin.impute.Gmode=="Honduras"|
coffee$Country.of.Origin.impute.Gmode=="Nicaragua"|
coffee$Country.of.Origin.impute.Gmode=="El Salvador"|
coffee$Country.of.Origin.impute.Gmode=="Panama"
),
"Region"] <- "Central America"
#South America
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Peru"|
coffee$Country.of.Origin.impute.Gmode=="Ecuador"|
coffee$Country.of.Origin.impute.Gmode=="Brazil"|
coffee$Country.of.Origin.impute.Gmode=="Colombia"
),
"Region"] <- "South America"
#North America
coffee[which(coffee$Country.of.Origin.impute.Gmode=="Mexico"|
coffee$Country.of.Origin.impute.Gmode=="United States"|
coffee$Country.of.Origin.impute.Gmode=="United States (Hawaii)"|
coffee$Country.of.Origin.impute.Gmode=="United States (Puerto Rico)"
),
"Region"] <- "North America"
coffee$Region <- as.factor(coffee$Region)
coffee$Country.of.Origin.impute.Gmode <-
as.factor(coffee$Country.of.Origin.impute.Gmode)
Now let’s look on the Bag.Weight factor:
table(coffee$Bag.Weight)
##
## 0 kg 0 lbs 1 1 kg 1 kg,lbs 1 lbs 10 kg 100 kg
## 1 3 7 327 1 8 11 1
## 100 lbs 12000 kg 1218 kg 130 lbs 132 lbs 13800 kg 15 kg 150 lbs
## 59 2 1 1 1 1 2 1
## 1500 kg 18 kg 18000 kg 18975 kg 19200 kg 2 2 kg 2 kg,lbs
## 1 1 1 4 2 1 114 1
## 2 lbs 20 kg 24 kg 25 kg 29 kg 3 kg 3 lbs 30 kg
## 5 14 1 2 2 1 7 29
## 34 kg 35 kg 350 kg 4 kg 4 lbs 40 kg 46 kg 5 kg
## 1 2 1 1 4 2 3 7
## 5 lbs 50 kg 55 lbs 59 kg 6 6 kg 60 kg 66 kg
## 21 14 1 10 19 2 242 2
## 660 kg 67 kg 69 kg 70 kg 8 kg 80 kg 80 lbs 9000 kg
## 1 1 200 156 1 4 1 2
coffee$Bag.Weight <- as.character(coffee$Bag.Weight)
There are two types of measurement - we should convert lbs to the kg.
coffee[which(coffee$Bag.Weight=="1 lbs"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="100 lbs"),"Bag.Weight"] <- "45"
coffee[which(coffee$Bag.Weight=="130 lbs"),"Bag.Weight"] <- "58.5"
coffee[which(coffee$Bag.Weight=="132 lbs"),"Bag.Weight"] <- "59.4"
coffee[which(coffee$Bag.Weight=="150 lbs"),"Bag.Weight"] <- "67.5"
coffee[which(coffee$Bag.Weight=="2 lbs"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="55 lbs"),"Bag.Weight"] <- "24.75"
coffee[which(coffee$Bag.Weight=="80 lbs"),"Bag.Weight"] <- "36"
coffee[which(coffee$Bag.Weight=="0 kg"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="0 lbs"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="1"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="1 kg"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="1 kg,lbs"),"Bag.Weight"] <- "<=1"
coffee[which(coffee$Bag.Weight=="10 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="2"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="2 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="2 kg,lbs"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="3 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="3 lbs"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="4 lbs"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="5 lbs"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="4 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="5 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="6 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="6"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="8 kg"),"Bag.Weight"] <- "(1;10>"
coffee[which(coffee$Bag.Weight=="100 lbs"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="130 lbs"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="132 lbs"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="15 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="18 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="20 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="24 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="25 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="29 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="30 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="34 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="35 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="40 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="46 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="50 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="59 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="60 kg"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="45"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="67.5"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="59.4"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="58.5"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="36"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="59.4"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="24.75"),"Bag.Weight"] <- "(10;60>"
coffee[which(coffee$Bag.Weight=="100 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="12000 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="1218 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="13800 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="150 lbs"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="1500 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="18000 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="18975 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="19200 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="350 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="66 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="660 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="67 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="69 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="70 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="9000 kg"),"Bag.Weight"] <- ">60"
coffee[which(coffee$Bag.Weight=="80 kg"),"Bag.Weight"] <- ">60"
coffee$Bag.Weight <- factor(coffee$Bag.Weight)
Now let’s look on the Processing.Method factor:
table(coffee$Processing.Method)
##
## Natural / Dry Other Pulped natural / honey
## 251 26 14
## Semi-washed / Semi-pulped Washed / Wet
## 56 812
I group NA and Other together.
coffee$Processing.Method <- addNA(coffee$Processing.Method)
levels(coffee$Processing.Method)
## [1] "Natural / Dry" "Other"
## [3] "Pulped natural / honey" "Semi-washed / Semi-pulped"
## [5] "Washed / Wet" NA
coffee[which(coffee$Processing.Method=="Other"),"Processing.Method"] <- NA
coffee$Processing.Method <- droplevels(coffee$Processing.Method)
coffee$Processing.Method <- as.character(coffee$Processing.Method)
Now let’s deal with missing values.
coffee <- coffee %>%
group_by(Region) %>%
mutate(Processing.Method.impute.Gmode = if_else(is.na(Processing.Method),
Mode(Processing.Method, na.rm = TRUE),
Processing.Method)) %>%
ungroup()
coffee$Processing.Method <- as.factor(coffee$Processing.Method)
coffee$Processing.Method.impute.Gmode <-
as.factor(coffee$Processing.Method.impute.Gmode)
coffee$Processing.Method.impute.Gmode <-
addNA(coffee$Processing.Method.impute.Gmode)
coffee$Processing.Method.impute.Gmode <-
droplevels(coffee$Processing.Method.impute.Gmode)
coffee$Processing.Method %>%
table() %>%
prop.table()
## .
## Natural / Dry Pulped natural / honey Semi-washed / Semi-pulped
## 0.22153575 0.01235658 0.04942630
## Washed / Wet
## 0.71668138
coffee$Processing.Method.impute.Gmode %>%
table() %>%
prop.table()
## .
## Natural / Dry Pulped natural / honey Semi-washed / Semi-pulped
## 0.19145690 0.01067887 0.04271548
## Washed / Wet
## 0.75514874
Distributions are quite similar.
There are only NAs in Color and Variety color left.Let’s look on the next factor - Color.
coffee$Color <- addNA(coffee$Color)
table(coffee$Color)
##
## Blue-Green Bluish-Green Green None <NA>
## 82 112 850 51 216
I should group NA and None together. Blue-Green and Bluish-Green are the same colors, thus I group them.
coffee[which(coffee$Color=="None"),"Color"] <- NA
coffee[which(coffee$Color=="Blue-Green"),"Color"] <- "Bluish-Green"
coffee$Color <- droplevels(coffee$Color)
table(coffee$Color)
##
## Bluish-Green Green <NA>
## 194 850 267
coffee$Color <- as.character(coffee$Color)
#267 NA values
coffee <- coffee %>%
group_by(Region) %>%
mutate(Color.impute.Gmode = if_else(is.na(Color),
(Mode(Color, na.rm = TRUE)),
Color)) %>%
ungroup()
coffee$Color.impute.Gmode <- addNA(coffee$Color.impute.Gmode)
table(coffee$Color.impute.Gmode)
##
## Bluish-Green Green <NA>
## 194 1117 0
coffee$Color %>%
table() %>%
prop.table()
## .
## Bluish-Green Green
## 0.1858238 0.8141762
coffee$Color.impute.Gmode %>%
table() %>%
prop.table()
## .
## Bluish-Green Green <NA>
## 0.1479786 0.8520214 0.0000000
coffee$Color.impute.Gmode <- droplevels(coffee$Color.impute.Gmode)
table(coffee$Color.impute.Gmode)
##
## Bluish-Green Green
## 194 1117
Now it’s time to look on the last factor - Variety.
coffee$Variety <- as.character(coffee$Variety)
108 Other and 201 NA - I group them.
coffee[which(coffee$Variety=="Other"),"Variety"] <- NA
coffee$Variety <- addNA(coffee$Variety)
table(coffee$Variety)
##
## Arusha Blue Mountain Bourbon
## 5 2 226
## Catimor Catuai Caturra
## 20 74 256
## Ethiopian Heirlooms Ethiopian Yirgacheffe Gesha
## 1 2 12
## Hawaiian Kona Java Mandheling
## 44 2 3
## Marigojipe Moka Peaberry Mundo Novo
## 1 1 33
## Pacamara Pacas Pache Comun
## 8 13 1
## Peaberry Ruiru 11 SL14
## 5 2 17
## SL28 SL34 Sulawesi
## 15 8 1
## Sumatra Sumatra Lintong Typica
## 3 1 211
## Yellow Bourbon <NA>
## 35 309
Now I group small groups into bigger groups based on information from https://varieties.worldcoffeeresearch.org/.
coffee$Variety <- as.character(coffee$Variety)
coffee[which(coffee$Variety=="Java"),"Variety"] <- "Ethiopian Landrace"
coffee[which(coffee$Variety=="Gesha"),"Variety"] <- "Ethiopian Landrace"
coffee[which(coffee$Variety=="Ethiopian Heirlooms"),"Variety"] <- "Ethiopian Landrace"
coffee[which(coffee$Variety=="Ethiopian Yirgacheffe"),"Variety"] <- "Ethiopian Landrace"
coffee[which(coffee$Variety=="Ruiru 11"),"Variety"] <- "Introgressed"
coffee[which(coffee$Variety=="Catimor"),"Variety"] <- "Introgressed"
coffee[which(coffee$Variety=="Arusha"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Blue Mountain"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Mandheling"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Marigojipe"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Pache Comun"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Peaberry"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="SL14"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="SL34"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Sulawesi"),"Variety"] <- "Typica"
coffee[which(coffee$Variety=="Catuai"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="Mundo Novo"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="Pacamara"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="SL28"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="Sumatra"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="Sumatra Lintong"),"Variety"] <- "Bourbon-Typica cross"
coffee[which(coffee$Variety=="Moka Peaberry"),"Variety"] <- "Bourbon"
coffee[which(coffee$Variety=="Pacas"),"Variety"] <- "Bourbon"
coffee <- coffee %>%
group_by(Region, Color.impute.Gmode) %>%
mutate(Variety.impute.Gmode = if_else(is.na(Variety),
Mode(Variety, na.rm = TRUE),
Variety)) %>%
ungroup()
coffee$Variety %>%
table() %>%
prop.table()
## .
## Bourbon Bourbon-Typica cross Caturra
## 0.23952096 0.13373253 0.25548902
## Ethiopian Landrace Hawaiian Kona Introgressed
## 0.01696607 0.04391218 0.02195609
## Typica Yellow Bourbon
## 0.25349301 0.03493014
coffee$Variety.impute.Gmode %>%
table() %>%
prop.table()
## .
## Bourbon Bourbon-Typica cross Caturra
## 0.23798627 0.10450038 0.25629291
## Ethiopian Landrace Hawaiian Kona Introgressed
## 0.01296720 0.03508772 0.01678108
## Typica Yellow Bourbon
## 0.30968726 0.02669718
coffee$Variety <- as.factor(coffee$Variety)
coffee$Variety <- droplevels(coffee$Variety)
coffee$Variety <- addNA(coffee$Variety)
table(coffee$Variety)
##
## Bourbon Bourbon-Typica cross Caturra
## 240 134 256
## Ethiopian Landrace Hawaiian Kona Introgressed
## 17 44 22
## Typica Yellow Bourbon <NA>
## 254 35 309
coffee$Variety.impute.Gmode <- addNA(coffee$Variety.impute.Gmode)
coffee$Variety.impute.Gmode <- as.factor(coffee$Variety.impute.Gmode)
coffee$Variety.impute.Gmode <- droplevels(coffee$Variety.impute.Gmode)
table(coffee$Variety.impute.Gmode)
##
## Bourbon Bourbon-Typica cross Caturra
## 312 137 336
## Ethiopian Landrace Hawaiian Kona Introgressed
## 17 46 22
## Typica Yellow Bourbon
## 406 35
coffee$Variety.impute.Gmode %>%
table() %>%
prop.table()
## .
## Bourbon Bourbon-Typica cross Caturra
## 0.23798627 0.10450038 0.25629291
## Ethiopian Landrace Hawaiian Kona Introgressed
## 0.01296720 0.03508772 0.01678108
## Typica Yellow Bourbon
## 0.30968726 0.02669718
coffee$Variety <- as.character(coffee$Variety)
coffee$Variety %>%
table() %>%
prop.table()
## .
## Bourbon Bourbon-Typica cross Caturra
## 0.23952096 0.13373253 0.25548902
## Ethiopian Landrace Hawaiian Kona Introgressed
## 0.01696607 0.04391218 0.02195609
## Typica Yellow Bourbon
## 0.25349301 0.03493014
coffee$Variety <- as.factor(coffee$Variety)
Distribution has changed, but Typica and Bourbon varieties are most popular in the World, so it is very likely to be truth.
Now let’s look for outliers considering Total.Cup.Points result.
ggplot(coffee,
aes(x = Total.Cup.Points)) +
geom_histogram(fill = "blue",
bins = 100) +
theme_bw()
There is one outlier in Total.Cup.Points:
coffee[which(coffee$Total.Cup.Points==0),]
## # A tibble: 1 x 36
## Country.of.Orig~ Number.of.Bags Bag.Weight Variety Processing.Meth~ Aroma
## <chr> <int> <fct> <fct> <fct> <dbl>
## 1 Honduras 275 >60 Caturra <NA> 0
## # ... with 30 more variables: Flavor <dbl>, Aftertaste <dbl>, Acidity <dbl>,
## # Body <dbl>, Balance <dbl>, Uniformity <dbl>, Clean.Cup <dbl>,
## # Sweetness <dbl>, Cupper.Points <dbl>, Total.Cup.Points <dbl>,
## # Moisture <dbl>, Category.One.Defects <int>, Quakers <dbl>, Color <chr>,
## # Category.Two.Defects <int>, unit_of_measurement <chr>,
## # altitude_low_meters <dbl>, altitude_high_meters <dbl>,
## # altitude_mean_meters <dbl>, altitude_low_meters.impute_mean <dbl>,
## # altitude_high_meters.impute_mean <dbl>,
## # altitude_mean_meters.impute_mean <dbl>,
## # altitude_low_meters.impute_median <dbl>,
## # altitude_high_meters.impute_median <dbl>,
## # altitude_mean_meters.impute_median <dbl>,
## # Country.of.Origin.impute.Gmode <fct>, Region <fct>,
## # Processing.Method.impute.Gmode <fct>, Color.impute.Gmode <fct>,
## # Variety.impute.Gmode <fct>
The other numeric variables are equal to 0, so I delete this observation
coffee <- coffee[-which(coffee$Total.Cup.Points==0),]
I create new dataframe without variables which I will not use in the analysis. I also delete Total.Cup.Points, because it is the sum of Aroma, Flavor, Aftertaste, Acidity, Body, Balance,Uniformity, Clean.Cup, Sweetness and Cupper.Points.
coffee_final <- coffee
coffee_final[,c("Country.of.Origin","Processing.Method","Variety",
"altitude_low_meters","altitude_high_meters",
"altitude_mean_meters","Color","altitude_mean_meters.impute_mean",
"altitude_low_meters.impute_mean",
"altitude_high_meters.impute_mean","Total.Cup.Points")] <- NULL
Before split the dataset, I check if there are variables with zero or near zero variance, because such variables may need some tranformations.
nearZeroVar(coffee_final,
saveMetrics = TRUE) -> coffees_nzv_stats
coffees_nzv_stats %>%
rownames_to_column("variable") %>%
arrange(-zeroVar, -nzv, -freqRatio)
## variable freqRatio percentUnique zeroVar nzv
## 1 Quakers 31.179487 0.8396947 FALSE TRUE
## 2 Clean.Cup 20.586207 0.8396947 FALSE TRUE
## 3 Sweetness 19.967213 0.5343511 FALSE TRUE
## 4 Category.One.Defects 10.990099 1.2213740 FALSE FALSE
## 5 Uniformity 10.071429 0.6870229 FALSE FALSE
## 6 unit_of_measurement 6.197802 0.1526718 FALSE FALSE
## 7 Color.impute.Gmode 5.752577 0.1526718 FALSE FALSE
## 8 altitude_mean_meters.impute_median 4.121212 14.5038168 FALSE FALSE
## 9 altitude_high_meters.impute_median 4.046154 13.5114504 FALSE FALSE
## 10 Processing.Method.impute.Gmode 3.940239 0.3053435 FALSE FALSE
## 11 altitude_low_meters.impute_median 3.400000 13.5877863 FALSE FALSE
## 12 Category.Two.Defects 1.810000 2.9007634 FALSE FALSE
## 13 Number.of.Bags 1.377143 9.9236641 FALSE FALSE
## 14 Moisture 1.346290 1.7557252 FALSE FALSE
## 15 Body 1.328859 2.2900763 FALSE FALSE
## 16 Country.of.Origin.impute.Gmode 1.295082 2.7480916 FALSE FALSE
## 17 Variety.impute.Gmode 1.211940 0.6106870 FALSE FALSE
## 18 Balance 1.186207 2.3664122 FALSE FALSE
## 19 Cupper.Points 1.117647 3.1297710 FALSE FALSE
## 20 Aftertaste 1.080000 2.5954198 FALSE FALSE
## 21 Acidity 1.066667 2.2900763 FALSE FALSE
## 22 Aroma 1.061350 2.4427481 FALSE FALSE
## 23 Region 1.046012 0.4580153 FALSE FALSE
## 24 Bag.Weight 1.026385 0.3053435 FALSE FALSE
## 25 Flavor 1.012346 2.5954198 FALSE FALSE
We havee 3 problematic variables
table(coffee_final$Quakers)
##
## 0 1 2 3 4 5 6 7 8 9 11
## 1216 39 30 5 5 5 4 3 1 1 1
table(coffee_final$Clean.Cup)
##
## 0 1.33 2.67 5.33 6 6.67 7.33 8 8.67 9.33 10
## 1 1 2 3 6 13 3 13 16 58 1194
table(coffee_final$Sweetness)
##
## 1.33 6 6.67 8 8.67 9.33 10
## 1 3 7 8 12 61 1218
I will group them in two groups, because in each column there is one hugly dominant value.
quakers_not_zero_train <- which(coffee_final$Quakers>0)
coffee_final$Quakers <- as.character(coffee_final$Quakers)
coffee_final[quakers_not_zero_train,"Quakers"] <- ">0"
coffee_final$Quakers <- as.factor(coffee_final$Quakers)
table(coffee_final$Quakers)
##
## >0 0
## 94 1216
clean_cup_not_10_train <- which(coffee_final$Clean.Cup<10)
coffee_final$Clean.Cup <- as.character(coffee_final$Clean.Cup)
coffee_final[clean_cup_not_10_train,"Clean.Cup"] <- "<10"
coffee_final$Clean.Cup <- as.factor(coffee_final$Clean.Cup)
table(coffee_final$Clean.Cup)
##
## <10 10
## 116 1194
sweetness_not_10_train <- which(coffee_final$Sweetness<10)
coffee_final$Sweetness <- as.character(coffee_final$Sweetness)
coffee_final[sweetness_not_10_train,"Sweetness"] <- "<10"
coffee_final$Sweetness <- as.factor(coffee_final$Sweetness)
table(coffee_final$Sweetness)
##
## <10 10
## 92 1218
Let’s check if they have still near zero variance:
nearZeroVar(coffee_final,
saveMetrics = TRUE) -> coffees_nzv_stats
coffees_nzv_stats %>%
rownames_to_column("variable") %>%
arrange(-zeroVar, -nzv, -freqRatio)
## variable freqRatio percentUnique zeroVar nzv
## 1 Sweetness 13.239130 0.1526718 FALSE FALSE
## 2 Quakers 12.936170 0.1526718 FALSE FALSE
## 3 Category.One.Defects 10.990099 1.2213740 FALSE FALSE
## 4 Clean.Cup 10.293103 0.1526718 FALSE FALSE
## 5 Uniformity 10.071429 0.6870229 FALSE FALSE
## 6 unit_of_measurement 6.197802 0.1526718 FALSE FALSE
## 7 Color.impute.Gmode 5.752577 0.1526718 FALSE FALSE
## 8 altitude_mean_meters.impute_median 4.121212 14.5038168 FALSE FALSE
## 9 altitude_high_meters.impute_median 4.046154 13.5114504 FALSE FALSE
## 10 Processing.Method.impute.Gmode 3.940239 0.3053435 FALSE FALSE
## 11 altitude_low_meters.impute_median 3.400000 13.5877863 FALSE FALSE
## 12 Category.Two.Defects 1.810000 2.9007634 FALSE FALSE
## 13 Number.of.Bags 1.377143 9.9236641 FALSE FALSE
## 14 Moisture 1.346290 1.7557252 FALSE FALSE
## 15 Body 1.328859 2.2900763 FALSE FALSE
## 16 Country.of.Origin.impute.Gmode 1.295082 2.7480916 FALSE FALSE
## 17 Variety.impute.Gmode 1.211940 0.6106870 FALSE FALSE
## 18 Balance 1.186207 2.3664122 FALSE FALSE
## 19 Cupper.Points 1.117647 3.1297710 FALSE FALSE
## 20 Aftertaste 1.080000 2.5954198 FALSE FALSE
## 21 Acidity 1.066667 2.2900763 FALSE FALSE
## 22 Aroma 1.061350 2.4427481 FALSE FALSE
## 23 Region 1.046012 0.4580153 FALSE FALSE
## 24 Bag.Weight 1.026385 0.3053435 FALSE FALSE
## 25 Flavor 1.012346 2.5954198 FALSE FALSE
There are no near zero variance variables now.
Now let’s divide the set into learning and testing sample
set.seed(987654321)
coffees_which_train <- createDataPartition(coffee_final$Cupper.Points,
p = 0.7,
list = FALSE)
head(coffees_which_train)
## Resample1
## [1,] 1
## [2,] 3
## [3,] 4
## [4,] 6
## [5,] 8
## [6,] 9
coffees_train <- coffee_final[coffees_which_train,]
coffees_test <- coffee_final[-coffees_which_train,]
The distributions of the target variable in both samples are similar:
summary(coffees_train$Cupper.Points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.170 7.250 7.500 7.506 7.750 10.000
summary(coffees_test$Cupper.Points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.170 7.250 7.500 7.499 7.750 10.000
Storing the list of names of numeric variables into a vector:
coffees_numeric_vars <-
sapply(coffees_train, is.numeric) %>%
which() %>%
names()
coffees_numeric_vars
## [1] "Number.of.Bags" "Aroma"
## [3] "Flavor" "Aftertaste"
## [5] "Acidity" "Body"
## [7] "Balance" "Uniformity"
## [9] "Cupper.Points" "Moisture"
## [11] "Category.One.Defects" "Category.Two.Defects"
## [13] "altitude_low_meters.impute_median" "altitude_high_meters.impute_median"
## [15] "altitude_mean_meters.impute_median"
Mutually correlated variables:
coffee_correlations <-
cor(coffees_train[,coffees_numeric_vars],
use = "pairwise.complete.obs")
coffees_numeric_vars_order <-
coffee_correlations[,"Cupper.Points"] %>%
sort(decreasing = TRUE) %>%
names()
corrplot.mixed(coffee_correlations[coffees_numeric_vars_order,
coffees_numeric_vars_order],
upper = "circle",
lower = "pie",
tl.col="black",
tl.pos = "lt",
tl.cex = 0.6)
We can see very high correlation between altitude variables. Number of bags and Category.One.Defects seem to have very small correlation between them and the other variables - we will skip them.
corrplot.mixed(coffee_correlations[coffees_numeric_vars_order[-12:-13],
coffees_numeric_vars_order[-12:-13]],
upper = "circle",
lower = "pie",
tl.col = "black",
tl.pos = "lt",
tl.cex = 0.6)
## Variables relationships
The relation of the target variable with the most strongly correlated variables - Flavour:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Flavor)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
5 coffees clearly not matching the relationship - 2 coffees with the flavour < 8, but Cupper.Points = 10. 3 coffees with Cupper.Points < 5.5 and Flavor > 7.5. Maybe exclude these observations from the sample.
coffees_outliers <- NULL
coffees_outliers_flavor <- which(coffees_train$Flavor < 8 &
coffees_train$Cupper.Points == 10)
coffees_outliers_flavor <- append(coffees_outliers_flavor,
which(coffees_train$Flavor > 7.5 &
coffees_train$Cupper.Points < 5.5))
coffees_outliers <- append(coffees_outliers,coffees_outliers_flavor)
Aftertaste:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Aftertaste)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
5 coffees clearly notmatching the relationship - they look similar to the previous values from Flavor.
coffees_outliers_aftertaste <- which(coffees_train$Aftertaste >= 7.5 &
coffees_train$Cupper.Points < 5.5)
coffees_outliers_aftertaste <- append(coffees_outliers_aftertaste,
which(coffees_train$Aftertaste < 8 &
coffees_train$Cupper.Points == 10))
identical(sort(coffees_outliers_flavor),sort(coffees_outliers_aftertaste))
## [1] TRUE
They were exactly the same outliers.
Balance:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Balance)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
4 clear outliers - Cupper.Points = 10 and Balance = 7 ; Balance > 7.5 and Cupper.Points < 5.5.
coffees_outliers_balance <- which(coffees_train$Balance == 7 &
coffees_train$Cupper.Points == 10)
# Actually there are 2 outliers with exact value of Balance and Cupper.Points
coffees_outliers_balance <- append(coffees_outliers_balance,
which(coffees_train$Balance > 7.5 &
coffees_train$Cupper.Points < 5.5))
identical(sort(coffees_outliers_flavor),sort(coffees_outliers_balance))
## [1] TRUE
They were exactly the same outliers.
Aroma:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Aroma)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
4 clear new outliers - Cupper.Points = 10 and Aroma < 7.75 ; Cupper.Points < 5.5 and Aroma > 7.5.
coffees_outliers_aroma <- which(coffees_train$Aroma < 7.75 &
coffees_train$Cupper.Points == 10)
coffees_outliers_aroma <- append(coffees_outliers_aroma,
which(coffees_train$Aroma > 7.5 &
coffees_train$Cupper.Points < 5.5))
These outliers were marked earlier.
Acidity:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Acidity)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
At least 2 clear outliers.
which(coffees_train$Acidity < 7.5 & coffees_train$Cupper.Points == 10)
## [1] 249 520
# 2 outliers marked earlier
coffees_outliers_acidity <- which(coffees_train$Acidity < 5.5 &
coffees_train$Cupper.Points > 7.5)
# 1 new outlier
coffees_outliers <- append(coffees_outliers,coffees_outliers_acidity)
Body:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Body)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
#At least 2 clear outliers
#which(coffees_train$Body == 7 & coffees_train$Cupper.Points == 10)
# 2 outliers marked earlier
coffees_outliers_body <- which(coffees_train$Body < 5.5 &
coffees_train$Cupper.Points > 7.5)
# 1 new outlier
coffees_outliers <- append(coffees_outliers,coffees_outliers_body)
Uniformity:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Uniformity)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
Relationship looks more like relationship with categorical variable.
Moisture:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Moisture)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
Many values far from relationship.
Category.One.Defects:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = Category.One.Defects)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
3 clear outliers - Defects >= 15 and Cupper.Points above 6.5.
coffees_outliers_cat1_defects <- which(coffees_train$Category.One.Defects >= 15 &
coffees_train$Cupper.Points > 6.5)
coffees_outliers <- append(coffees_outliers,coffees_outliers_cat1_defects)
Altitude_mean_meters.impute_median:
ggplot(coffees_train,
aes(x = Cupper.Points,
y = altitude_mean_meters.impute_median)) +
geom_point(col = "blue") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
There is some positive relationship, but very unclear - many values far from relationship.
Now I try to find highly correlated variables:
findCorrelation(coffee_correlations,
cutoff = 0.80,
names = TRUE)
## [1] "Aftertaste" "altitude_high_meters.impute_median"
## [3] "altitude_low_meters.impute_median"
These variables are potential candidates to be excluded from the model because of the high correlation between them.
coffees_categorical_vars <-
sapply(coffees_train, is.factor) %>%
which() %>%
names()
coffees_categorical_vars
## [1] "Bag.Weight" "Clean.Cup"
## [3] "Sweetness" "Quakers"
## [5] "Country.of.Origin.impute.Gmode" "Region"
## [7] "Processing.Method.impute.Gmode" "Color.impute.Gmode"
## [9] "Variety.impute.Gmode"
I use the function that retrieves F statistic value for the explanatory categorical variable provided as the function argument.
coffees_F_anova <- function(categorical_var) {
anova_ <- aov(coffees_train$Cupper.Points ~
coffees_train[[categorical_var]])
return(summary(anova_)[[1]][1, 4])
}
Relationship between coffees_categorical_vars and Cupper.Points:
sapply(coffees_categorical_vars,
coffees_F_anova) %>%
sort(decreasing = TRUE) -> coffees_anova_all_categorical
coffees_anova_all_categorical
## Clean.Cup Color.impute.Gmode
## 45.5452462 18.1382503
## Region Bag.Weight
## 17.0062764 12.9751893
## Country.of.Origin.impute.Gmode Variety.impute.Gmode
## 6.2321289 3.2669334
## Processing.Method.impute.Gmode Sweetness
## 2.1798322 1.4017320
## Quakers
## 0.3713259
Relation between Color.impute.Gmode levels and Cupper.Points:
ggplot(coffees_train,
aes(x = Color.impute.Gmode,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
Green has many outliers. Bluish-Green has slightly higher values.
Relation between Country.of.Origin.impute.Gmode levels and Cupper.Points:
ggplot(coffees_train,
aes(x = Country.of.Origin.impute.Gmode,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme(axis.text.x = element_text(angle = 60))
There can be seen some relationships between these variables but there are countries with only few coffees. Thus, we probably choose the region.
ggplot(coffees_train,
aes(x = Region,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
It can be seen that there are differences between Regions.
The relationship between Cupper.Points and Variety.impute.Gmode:
ggplot(coffees_train,
aes(x = Variety.impute.Gmode,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme(axis.text.x = element_text(angle = 45))
There are some differences between varieties of coffee, but not very clear.
The relationship between Cupper.Points and Processing.Method.impute.Gmode:
ggplot(coffees_train,
aes(x = Processing.Method.impute.Gmode,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
There is no clear differences betweend processing methods. There are many values far from mean and median in Washed / Wet.
The relationship between Cupper.Points and Clean.Cup:
ggplot(coffees_train,
aes(x = Clean.Cup,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
Clean.Cup = 10 has higher values than <10, but also many outliers there.
The relationship between Cupper.Points and Sweetness:
ggplot(coffees_train,
aes(x = Sweetness,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
Sweetness = 10 has slightly higher values than <10, but also many outliers there.
The relationship between Cupper.Points and Quakers:
ggplot(coffees_train,
aes(x = Quakers,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
Very Similar mean and similar distributions, but many outliers for Quakers = 0.
The relationship between Cupper.Points and unit_of_measurement.
ggplot(coffees_train,
aes(x = unit_of_measurement,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
The relationship between Cupper.Points and Bag.Weight
ggplot(coffees_train,
aes(x = Bag.Weight,
y = Cupper.Points)) +
geom_boxplot(fill = "red") +
theme_bw()
There are diffrences between Bag.Weights, but not so big.
Bag.Weight and unit_of_measurement can be also higly correlated with Processing_method and Region. Let’s check it with the Cramer’s V coefficient.
DescTools::CramerV(coffees_train$unit_of_measurement,
coffees_train$Region)
## [1] 0.368987
Moderate association between unit_of_measurement and Region.
DescTools::CramerV(coffees_train$unit_of_measurement,
coffees_train$Processing.Method.impute.Gmode)
## [1] 0.1128418
Low association between unit_of_measurement and Processing.Method.impute.Gmode.
DescTools::CramerV(coffees_train$Bag.Weight,
coffees_train$Region)
## [1] 0.4782707
Moderate association between Bag.Weight and Region.
DescTools::CramerV(coffees_train$Bag.Weight,
coffees_train$Processing.Method.impute.Gmode)
## [1] 0.2201453
Low association between Bag.Weight and Processing.Method.impute.Gmode.
DescTools::CramerV(coffees_train$Bag.Weight,
coffees_train$unit_of_measurement)
## [1] 0.1635196
Low association between Bag.Weight and unit_of_measurement.
There is moderate association between Bag.Weight and Region and also between unit_of_measurement and Region. I do not exclude them from the analysis for now.
I check if there is some linear combination of other variables
( findLinearCombos(coffees_train[, coffees_numeric_vars] ) ->
coffees_linearCombos )
## $linearCombos
## list()
##
## $remove
## NULL
There is no such combination.
Now i prepare vector of coffee variables names.
#all coffees_train variables
coffees_variables_all <- names(coffees_train)
Altitudes are very highly correlated to each other, so I choose only altitude_mean_meters.impute_median.
coffees_variables_all <-
coffees_variables_all[!coffees_variables_all %in%
c("altitude_low_meters.impute_median",
"altitude_high_meters.impute_median",
"Category.One.Defects",
"Number.of.Bags")]
Now I delete outliers form training set.
coffees_train <- coffees_train[-coffees_outliers,]
options(contrasts = c("contr.treatment",
"contr.treatment"))
Estimation of the model with all variables:
coffees_lm1 <- lm(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)))
summary(coffees_lm1)
##
## Call:
## lm(formula = Cupper.Points ~ ., data = coffees_train %>% dplyr::select(all_of(coffees_variables_all)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56264 -0.09494 -0.00819 0.08739 0.90802
##
## Coefficients: (5 not defined because of singularities)
## Estimate
## (Intercept) -9.470e-01
## Bag.Weight(10;60> -2.188e-02
## Bag.Weight<=1 -3.852e-02
## Bag.Weight>60 1.925e-02
## Aroma 4.435e-02
## Flavor 3.331e-01
## Aftertaste 2.447e-01
## Acidity 1.554e-01
## Body 1.003e-01
## Balance 2.615e-01
## Uniformity -1.863e-02
## Clean.Cup10 6.987e-02
## Sweetness10 8.083e-03
## Moisture -1.747e-01
## Quakers0 9.811e-03
## Category.Two.Defects -2.184e-03
## unit_of_measurementm 7.870e-02
## altitude_mean_meters.impute_median -3.853e-06
## Country.of.Origin.impute.GmodeBurundi 2.065e-01
## Country.of.Origin.impute.GmodeChina 7.969e-02
## Country.of.Origin.impute.GmodeColombia -8.104e-02
## Country.of.Origin.impute.GmodeCosta Rica 6.509e-02
## Country.of.Origin.impute.GmodeCote d?Ivoire 3.801e-02
## Country.of.Origin.impute.GmodeEcuador -1.544e-01
## Country.of.Origin.impute.GmodeEl Salvador -1.248e-01
## Country.of.Origin.impute.GmodeEthiopia 1.285e-02
## Country.of.Origin.impute.GmodeGuatemala -5.601e-02
## Country.of.Origin.impute.GmodeHaiti 3.151e-03
## Country.of.Origin.impute.GmodeHonduras -9.697e-03
## Country.of.Origin.impute.GmodeIndia 1.894e-01
## Country.of.Origin.impute.GmodeIndonesia 2.935e-02
## Country.of.Origin.impute.GmodeJapan 3.501e-01
## Country.of.Origin.impute.GmodeKenya -2.083e-02
## Country.of.Origin.impute.GmodeLaos 1.024e-01
## Country.of.Origin.impute.GmodeMalawi -2.010e-02
## Country.of.Origin.impute.GmodeMexico 3.405e-03
## Country.of.Origin.impute.GmodeMyanmar -1.320e-02
## Country.of.Origin.impute.GmodeNicaragua -4.643e-02
## Country.of.Origin.impute.GmodePanama -6.419e-02
## Country.of.Origin.impute.GmodePeru -1.085e-01
## Country.of.Origin.impute.GmodePhilippines -9.962e-02
## Country.of.Origin.impute.GmodeRwanda 2.052e-01
## Country.of.Origin.impute.GmodeTaiwan -1.120e-01
## Country.of.Origin.impute.GmodeTanzania, United Republic Of 5.168e-02
## Country.of.Origin.impute.GmodeThailand 3.171e-02
## Country.of.Origin.impute.GmodeUganda 8.400e-03
## Country.of.Origin.impute.GmodeUnited States -1.049e-01
## Country.of.Origin.impute.GmodeUnited States (Hawaii) 7.883e-02
## Country.of.Origin.impute.GmodeUnited States (Puerto Rico) 1.525e-01
## Country.of.Origin.impute.GmodeVietnam 1.696e-01
## Country.of.Origin.impute.GmodeZambia -1.075e-01
## RegionCentral America NA
## RegionEast Africa NA
## RegionNorth America NA
## RegionSouth America NA
## RegionWest Africa NA
## Processing.Method.impute.GmodePulped natural / honey -6.739e-02
## Processing.Method.impute.GmodeSemi-washed / Semi-pulped -3.929e-02
## Processing.Method.impute.GmodeWashed / Wet -2.678e-02
## Color.impute.GmodeGreen 5.652e-03
## Variety.impute.GmodeBourbon-Typica cross -1.379e-02
## Variety.impute.GmodeCaturra 3.077e-02
## Variety.impute.GmodeEthiopian Landrace 6.688e-02
## Variety.impute.GmodeHawaiian Kona -3.014e-02
## Variety.impute.GmodeIntrogressed -4.435e-02
## Variety.impute.GmodeTypica -7.882e-03
## Variety.impute.GmodeYellow Bourbon 1.364e-01
## Std. Error t value
## (Intercept) 2.542e-01 -3.725
## Bag.Weight(10;60> 2.612e-02 -0.838
## Bag.Weight<=1 2.735e-02 -1.408
## Bag.Weight>60 2.642e-02 0.729
## Aroma 3.294e-02 1.346
## Flavor 4.365e-02 7.632
## Aftertaste 4.009e-02 6.103
## Acidity 3.479e-02 4.467
## Body 3.325e-02 3.015
## Balance 2.957e-02 8.844
## Uniformity 1.558e-02 -1.195
## Clean.Cup10 2.911e-02 2.401
## Sweetness10 2.811e-02 0.288
## Moisture 1.503e-01 -1.162
## Quakers0 2.587e-02 0.379
## Category.Two.Defects 1.381e-03 -1.581
## unit_of_measurementm 3.054e-02 2.577
## altitude_mean_meters.impute_median 2.170e-05 -0.178
## Country.of.Origin.impute.GmodeBurundi 1.320e-01 1.565
## Country.of.Origin.impute.GmodeChina 7.728e-02 1.031
## Country.of.Origin.impute.GmodeColombia 4.301e-02 -1.884
## Country.of.Origin.impute.GmodeCosta Rica 4.442e-02 1.465
## Country.of.Origin.impute.GmodeCote d?Ivoire 1.874e-01 0.203
## Country.of.Origin.impute.GmodeEcuador 1.848e-01 -0.835
## Country.of.Origin.impute.GmodeEl Salvador 6.143e-02 -2.031
## Country.of.Origin.impute.GmodeEthiopia 4.825e-02 0.266
## Country.of.Origin.impute.GmodeGuatemala 4.269e-02 -1.312
## Country.of.Origin.impute.GmodeHaiti 8.311e-02 0.038
## Country.of.Origin.impute.GmodeHonduras 4.571e-02 -0.212
## Country.of.Origin.impute.GmodeIndia 1.892e-01 1.001
## Country.of.Origin.impute.GmodeIndonesia 5.750e-02 0.511
## Country.of.Origin.impute.GmodeJapan 1.990e-01 1.760
## Country.of.Origin.impute.GmodeKenya 5.229e-02 -0.398
## Country.of.Origin.impute.GmodeLaos 1.185e-01 0.864
## Country.of.Origin.impute.GmodeMalawi 8.394e-02 -0.240
## Country.of.Origin.impute.GmodeMexico 4.099e-02 0.083
## Country.of.Origin.impute.GmodeMyanmar 7.825e-02 -0.169
## Country.of.Origin.impute.GmodeNicaragua 5.276e-02 -0.880
## Country.of.Origin.impute.GmodePanama 1.104e-01 -0.582
## Country.of.Origin.impute.GmodePeru 8.220e-02 -1.320
## Country.of.Origin.impute.GmodePhilippines 1.851e-01 -0.538
## Country.of.Origin.impute.GmodeRwanda 1.868e-01 1.099
## Country.of.Origin.impute.GmodeTaiwan 4.355e-02 -2.573
## Country.of.Origin.impute.GmodeTanzania, United Republic Of 4.499e-02 1.149
## Country.of.Origin.impute.GmodeThailand 5.214e-02 0.608
## Country.of.Origin.impute.GmodeUganda 5.362e-02 0.157
## Country.of.Origin.impute.GmodeUnited States 8.670e-02 -1.210
## Country.of.Origin.impute.GmodeUnited States (Hawaii) 6.335e-02 1.244
## Country.of.Origin.impute.GmodeUnited States (Puerto Rico) 1.192e-01 1.279
## Country.of.Origin.impute.GmodeVietnam 9.300e-02 1.824
## Country.of.Origin.impute.GmodeZambia 1.863e-01 -0.577
## RegionCentral America NA NA
## RegionEast Africa NA NA
## RegionNorth America NA NA
## RegionSouth America NA NA
## RegionWest Africa NA NA
## Processing.Method.impute.GmodePulped natural / honey 6.925e-02 -0.973
## Processing.Method.impute.GmodeSemi-washed / Semi-pulped 3.400e-02 -1.156
## Processing.Method.impute.GmodeWashed / Wet 1.945e-02 -1.377
## Color.impute.GmodeGreen 1.922e-02 0.294
## Variety.impute.GmodeBourbon-Typica cross 2.688e-02 -0.513
## Variety.impute.GmodeCaturra 2.493e-02 1.234
## Variety.impute.GmodeEthiopian Landrace 7.020e-02 0.953
## Variety.impute.GmodeHawaiian Kona 6.098e-02 -0.494
## Variety.impute.GmodeIntrogressed 6.764e-02 -0.656
## Variety.impute.GmodeTypica 2.643e-02 -0.298
## Variety.impute.GmodeYellow Bourbon 4.864e-02 2.803
## Pr(>|t|)
## (Intercept) 0.000208 ***
## Bag.Weight(10;60> 0.402413
## Bag.Weight<=1 0.159379
## Bag.Weight>60 0.466351
## Aroma 0.178613
## Flavor 6.23e-14 ***
## Aftertaste 1.58e-09 ***
## Acidity 9.00e-06 ***
## Body 0.002649 **
## Balance < 2e-16 ***
## Uniformity 0.232260
## Clean.Cup10 0.016585 *
## Sweetness10 0.773797
## Moisture 0.245434
## Quakers0 0.704619
## Category.Two.Defects 0.114174
## unit_of_measurementm 0.010147 *
## altitude_mean_meters.impute_median 0.859154
## Country.of.Origin.impute.GmodeBurundi 0.117984
## Country.of.Origin.impute.GmodeChina 0.302781
## Country.of.Origin.impute.GmodeColombia 0.059920 .
## Country.of.Origin.impute.GmodeCosta Rica 0.143241
## Country.of.Origin.impute.GmodeCote d?Ivoire 0.839285
## Country.of.Origin.impute.GmodeEcuador 0.403724
## Country.of.Origin.impute.GmodeEl Salvador 0.042526 *
## Country.of.Origin.impute.GmodeEthiopia 0.790112
## Country.of.Origin.impute.GmodeGuatemala 0.189894
## Country.of.Origin.impute.GmodeHaiti 0.969763
## Country.of.Origin.impute.GmodeHonduras 0.832036
## Country.of.Origin.impute.GmodeIndia 0.316972
## Country.of.Origin.impute.GmodeIndonesia 0.609834
## Country.of.Origin.impute.GmodeJapan 0.078820 .
## Country.of.Origin.impute.GmodeKenya 0.690410
## Country.of.Origin.impute.GmodeLaos 0.387870
## Country.of.Origin.impute.GmodeMalawi 0.810760
## Country.of.Origin.impute.GmodeMexico 0.933814
## Country.of.Origin.impute.GmodeMyanmar 0.866038
## Country.of.Origin.impute.GmodeNicaragua 0.379137
## Country.of.Origin.impute.GmodePanama 0.561014
## Country.of.Origin.impute.GmodePeru 0.187245
## Country.of.Origin.impute.GmodePhilippines 0.590633
## Country.of.Origin.impute.GmodeRwanda 0.272115
## Country.of.Origin.impute.GmodeTaiwan 0.010263 *
## Country.of.Origin.impute.GmodeTanzania, United Republic Of 0.250940
## Country.of.Origin.impute.GmodeThailand 0.543310
## Country.of.Origin.impute.GmodeUganda 0.875542
## Country.of.Origin.impute.GmodeUnited States 0.226524
## Country.of.Origin.impute.GmodeUnited States (Hawaii) 0.213693
## Country.of.Origin.impute.GmodeUnited States (Puerto Rico) 0.201128
## Country.of.Origin.impute.GmodeVietnam 0.068502 .
## Country.of.Origin.impute.GmodeZambia 0.564087
## RegionCentral America NA
## RegionEast Africa NA
## RegionNorth America NA
## RegionSouth America NA
## RegionWest Africa NA
## Processing.Method.impute.GmodePulped natural / honey 0.330723
## Processing.Method.impute.GmodeSemi-washed / Semi-pulped 0.248086
## Processing.Method.impute.GmodeWashed / Wet 0.168967
## Color.impute.GmodeGreen 0.768804
## Variety.impute.GmodeBourbon-Typica cross 0.608025
## Variety.impute.GmodeCaturra 0.217417
## Variety.impute.GmodeEthiopian Landrace 0.340975
## Variety.impute.GmodeHawaiian Kona 0.621281
## Variety.impute.GmodeIntrogressed 0.512272
## Variety.impute.GmodeTypica 0.765570
## Variety.impute.GmodeYellow Bourbon 0.005176 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1802 on 846 degrees of freedom
## Multiple R-squared: 0.8047, Adjusted R-squared: 0.7906
## F-statistic: 57.13 on 61 and 846 DF, p-value: < 2.2e-16
There are some missing coefficients.
coef(coffees_lm1)[is.na(coef(coffees_lm1))]
## RegionCentral America RegionEast Africa RegionNorth America
## NA NA NA
## RegionSouth America RegionWest Africa
## NA NA
Unfortunately all of Region dummies seems to be multicollinear. Thus, I create new vector without these variables.
coffees_variables_all2 <-
coffees_variables_all[-which(coffees_variables_all %in%
c("Region","Country.of.Origin.impute.Gmode"))]
coffees_lm2 <- lm(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all2)))
summary(coffees_lm2)
##
## Call:
## lm(formula = Cupper.Points ~ ., data = coffees_train %>% dplyr::select(all_of(coffees_variables_all2)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62803 -0.09817 -0.01219 0.08784 0.92866
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -9.160e-01 2.373e-01
## Bag.Weight(10;60> -2.097e-02 2.185e-02
## Bag.Weight<=1 -3.130e-02 2.223e-02
## Bag.Weight>60 -2.330e-02 2.349e-02
## Aroma 4.407e-02 3.255e-02
## Flavor 3.309e-01 4.257e-02
## Aftertaste 2.550e-01 3.928e-02
## Acidity 1.459e-01 3.357e-02
## Body 1.044e-01 3.293e-02
## Balance 2.592e-01 2.921e-02
## Uniformity -2.359e-02 1.556e-02
## Clean.Cup10 6.265e-02 2.901e-02
## Sweetness10 1.863e-02 2.740e-02
## Moisture -9.120e-02 1.393e-01
## Quakers0 1.419e-03 2.524e-02
## Category.Two.Defects -1.724e-03 1.347e-03
## unit_of_measurementm 6.416e-02 2.238e-02
## altitude_mean_meters.impute_median 1.718e-05 1.872e-05
## Processing.Method.impute.GmodePulped natural / honey -3.261e-02 6.510e-02
## Processing.Method.impute.GmodeSemi-washed / Semi-pulped -5.453e-02 3.397e-02
## Processing.Method.impute.GmodeWashed / Wet -2.452e-02 1.808e-02
## Color.impute.GmodeGreen -6.655e-03 1.847e-02
## Variety.impute.GmodeBourbon-Typica cross 1.961e-04 2.367e-02
## Variety.impute.GmodeCaturra 1.768e-02 1.954e-02
## Variety.impute.GmodeEthiopian Landrace 5.872e-02 5.085e-02
## Variety.impute.GmodeHawaiian Kona 4.381e-02 4.230e-02
## Variety.impute.GmodeIntrogressed 4.286e-02 4.662e-02
## Variety.impute.GmodeTypica -4.214e-03 1.842e-02
## Variety.impute.GmodeYellow Bourbon 1.642e-01 4.510e-02
## t value Pr(>|t|)
## (Intercept) -3.860 0.000122 ***
## Bag.Weight(10;60> -0.960 0.337424
## Bag.Weight<=1 -1.408 0.159489
## Bag.Weight>60 -0.992 0.321472
## Aroma 1.354 0.176199
## Flavor 7.774 2.13e-14 ***
## Aftertaste 6.492 1.42e-10 ***
## Acidity 4.346 1.55e-05 ***
## Body 3.171 0.001572 **
## Balance 8.873 < 2e-16 ***
## Uniformity -1.516 0.129946
## Clean.Cup10 2.160 0.031076 *
## Sweetness10 0.680 0.496783
## Moisture -0.655 0.512859
## Quakers0 0.056 0.955163
## Category.Two.Defects -1.280 0.201010
## unit_of_measurementm 2.867 0.004239 **
## altitude_mean_meters.impute_median 0.918 0.358926
## Processing.Method.impute.GmodePulped natural / honey -0.501 0.616549
## Processing.Method.impute.GmodeSemi-washed / Semi-pulped -1.605 0.108808
## Processing.Method.impute.GmodeWashed / Wet -1.356 0.175300
## Color.impute.GmodeGreen -0.360 0.718678
## Variety.impute.GmodeBourbon-Typica cross 0.008 0.993393
## Variety.impute.GmodeCaturra 0.905 0.365867
## Variety.impute.GmodeEthiopian Landrace 1.155 0.248529
## Variety.impute.GmodeHawaiian Kona 1.036 0.300568
## Variety.impute.GmodeIntrogressed 0.919 0.358181
## Variety.impute.GmodeTypica -0.229 0.819134
## Variety.impute.GmodeYellow Bourbon 3.640 0.000288 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1836 on 879 degrees of freedom
## Multiple R-squared: 0.7894, Adjusted R-squared: 0.7827
## F-statistic: 117.7 on 28 and 879 DF, p-value: < 2.2e-16
After delete the Region and Country variable R-squared has not changed.
Now I run Backward Elimination based on p-value 0.1 on both models:
coffees_lm_backward_1 <-
ols_step_backward_p(coffees_lm1,
prem = 0.1,
progress = FALSE)
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
## Note: model has aliased coefficients
## sums of squares computed by model comparison
coffees_lm_backward_2 <-
ols_step_backward_p(coffees_lm2,
prem = 0.1,
progress = FALSE)
Final model details:
summary(coffees_lm_backward_1$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54991 -0.09442 -0.01003 0.08757 0.96529
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error
## (Intercept) -1.122019 0.199545
## Bag.Weight(10;60> -0.026594 0.025067
## Bag.Weight<=1 -0.042519 0.026362
## Bag.Weight>60 0.016014 0.025416
## Flavor 0.358996 0.040837
## Aftertaste 0.254188 0.039593
## Acidity 0.156435 0.034185
## Body 0.107490 0.032847
## Balance 0.261734 0.029117
## Clean.Cup10 0.053708 0.023257
## unit_of_measurementm 0.076239 0.030172
## Country.of.Origin.impute.GmodeBurundi 0.196003 0.130321
## Country.of.Origin.impute.GmodeChina 0.055536 0.075688
## Country.of.Origin.impute.GmodeColombia -0.085835 0.040232
## Country.of.Origin.impute.GmodeCosta Rica 0.052797 0.041509
## Country.of.Origin.impute.GmodeCote d?Ivoire 0.039735 0.184631
## Country.of.Origin.impute.GmodeEcuador -0.152482 0.183998
## Country.of.Origin.impute.GmodeEl Salvador -0.137930 0.058312
## Country.of.Origin.impute.GmodeEthiopia -0.004584 0.045770
## Country.of.Origin.impute.GmodeGuatemala -0.071272 0.040095
## Country.of.Origin.impute.GmodeHaiti -0.019230 0.081816
## Country.of.Origin.impute.GmodeHonduras -0.025137 0.043748
## Country.of.Origin.impute.GmodeIndia 0.233783 0.184553
## Country.of.Origin.impute.GmodeIndonesia 0.011733 0.056062
## Country.of.Origin.impute.GmodeJapan 0.301269 0.185199
## Country.of.Origin.impute.GmodeKenya -0.034390 0.049571
## Country.of.Origin.impute.GmodeLaos 0.084555 0.116435
## Country.of.Origin.impute.GmodeMalawi -0.035505 0.081914
## Country.of.Origin.impute.GmodeMexico -0.017980 0.038741
## Country.of.Origin.impute.GmodeMyanmar -0.016616 0.076806
## Country.of.Origin.impute.GmodeNicaragua -0.060325 0.051282
## Country.of.Origin.impute.GmodePanama -0.082421 0.109229
## Country.of.Origin.impute.GmodePeru -0.121088 0.080655
## Country.of.Origin.impute.GmodePhilippines -0.083575 0.184483
## Country.of.Origin.impute.GmodeRwanda 0.172790 0.183746
## Country.of.Origin.impute.GmodeTaiwan -0.115798 0.041270
## Country.of.Origin.impute.GmodeTanzania, United Republic Of 0.032537 0.041107
## Country.of.Origin.impute.GmodeThailand 0.020068 0.049864
## Country.of.Origin.impute.GmodeUganda 0.002179 0.051439
## Country.of.Origin.impute.GmodeUnited States -0.113134 0.083185
## Country.of.Origin.impute.GmodeUnited States (Hawaii) 0.075856 0.061373
## Country.of.Origin.impute.GmodeUnited States (Puerto Rico) 0.159123 0.116969
## Country.of.Origin.impute.GmodeVietnam 0.162220 0.092160
## Country.of.Origin.impute.GmodeZambia -0.089978 0.184722
## RegionCentral America NA NA
## RegionEast Africa NA NA
## RegionNorth America NA NA
## RegionSouth America NA NA
## RegionWest Africa NA NA
## Variety.impute.GmodeBourbon-Typica cross -0.017506 0.026729
## Variety.impute.GmodeCaturra 0.030907 0.024675
## Variety.impute.GmodeEthiopian Landrace 0.072871 0.069628
## Variety.impute.GmodeHawaiian Kona -0.026851 0.058224
## Variety.impute.GmodeIntrogressed -0.032483 0.067094
## Variety.impute.GmodeTypica -0.005283 0.026282
## Variety.impute.GmodeYellow Bourbon 0.134834 0.046954
## t value Pr(>|t|)
## (Intercept) -5.623 2.54e-08 ***
## Bag.Weight(10;60> -1.061 0.28903
## Bag.Weight<=1 -1.613 0.10715
## Bag.Weight>60 0.630 0.52880
## Flavor 8.791 < 2e-16 ***
## Aftertaste 6.420 2.25e-10 ***
## Acidity 4.576 5.44e-06 ***
## Body 3.272 0.00111 **
## Balance 8.989 < 2e-16 ***
## Clean.Cup10 2.309 0.02116 *
## unit_of_measurementm 2.527 0.01169 *
## Country.of.Origin.impute.GmodeBurundi 1.504 0.13295
## Country.of.Origin.impute.GmodeChina 0.734 0.46331
## Country.of.Origin.impute.GmodeColombia -2.133 0.03317 *
## Country.of.Origin.impute.GmodeCosta Rica 1.272 0.20374
## Country.of.Origin.impute.GmodeCote d?Ivoire 0.215 0.82965
## Country.of.Origin.impute.GmodeEcuador -0.829 0.40749
## Country.of.Origin.impute.GmodeEl Salvador -2.365 0.01823 *
## Country.of.Origin.impute.GmodeEthiopia -0.100 0.92024
## Country.of.Origin.impute.GmodeGuatemala -1.778 0.07583 .
## Country.of.Origin.impute.GmodeHaiti -0.235 0.81424
## Country.of.Origin.impute.GmodeHonduras -0.575 0.56572
## Country.of.Origin.impute.GmodeIndia 1.267 0.20559
## Country.of.Origin.impute.GmodeIndonesia 0.209 0.83427
## Country.of.Origin.impute.GmodeJapan 1.627 0.10416
## Country.of.Origin.impute.GmodeKenya -0.694 0.48803
## Country.of.Origin.impute.GmodeLaos 0.726 0.46791
## Country.of.Origin.impute.GmodeMalawi -0.433 0.66481
## Country.of.Origin.impute.GmodeMexico -0.464 0.64268
## Country.of.Origin.impute.GmodeMyanmar -0.216 0.82878
## Country.of.Origin.impute.GmodeNicaragua -1.176 0.23979
## Country.of.Origin.impute.GmodePanama -0.755 0.45072
## Country.of.Origin.impute.GmodePeru -1.501 0.13364
## Country.of.Origin.impute.GmodePhilippines -0.453 0.65065
## Country.of.Origin.impute.GmodeRwanda 0.940 0.34729
## Country.of.Origin.impute.GmodeTaiwan -2.806 0.00513 **
## Country.of.Origin.impute.GmodeTanzania, United Republic Of 0.792 0.42886
## Country.of.Origin.impute.GmodeThailand 0.402 0.68745
## Country.of.Origin.impute.GmodeUganda 0.042 0.96623
## Country.of.Origin.impute.GmodeUnited States -1.360 0.17418
## Country.of.Origin.impute.GmodeUnited States (Hawaii) 1.236 0.21681
## Country.of.Origin.impute.GmodeUnited States (Puerto Rico) 1.360 0.17407
## Country.of.Origin.impute.GmodeVietnam 1.760 0.07873 .
## Country.of.Origin.impute.GmodeZambia -0.487 0.62631
## RegionCentral America NA NA
## RegionEast Africa NA NA
## RegionNorth America NA NA
## RegionSouth America NA NA
## RegionWest Africa NA NA
## Variety.impute.GmodeBourbon-Typica cross -0.655 0.51266
## Variety.impute.GmodeCaturra 1.253 0.21070
## Variety.impute.GmodeEthiopian Landrace 1.047 0.29559
## Variety.impute.GmodeHawaiian Kona -0.461 0.64480
## Variety.impute.GmodeIntrogressed -0.484 0.62841
## Variety.impute.GmodeTypica -0.201 0.84074
## Variety.impute.GmodeYellow Bourbon 2.872 0.00418 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1802 on 857 degrees of freedom
## Multiple R-squared: 0.8021, Adjusted R-squared: 0.7906
## F-statistic: 69.47 on 50 and 857 DF, p-value: < 2.2e-16
summary(coffees_lm_backward_2$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62540 -0.09756 -0.01474 0.08669 0.98000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.2023633 0.1719583 -6.992 5.30e-12
## Flavor 0.3555215 0.0397195 8.951 < 2e-16
## Aftertaste 0.2697120 0.0376905 7.156 1.73e-12
## Acidity 0.1435599 0.0324724 4.421 1.10e-05
## Body 0.1160713 0.0322681 3.597 0.000339
## Balance 0.2612798 0.0286254 9.128 < 2e-16
## Clean.Cup10 0.0458423 0.0228808 2.004 0.045423
## unit_of_measurementm 0.0620937 0.0213500 2.908 0.003723
## Variety.impute.GmodeBourbon-Typica cross 0.0002437 0.0229956 0.011 0.991546
## Variety.impute.GmodeCaturra 0.0250487 0.0184274 1.359 0.174389
## Variety.impute.GmodeEthiopian Landrace 0.0705280 0.0494544 1.426 0.154183
## Variety.impute.GmodeHawaiian Kona 0.0589422 0.0374719 1.573 0.116080
## Variety.impute.GmodeIntrogressed 0.0562236 0.0455026 1.236 0.216928
## Variety.impute.GmodeTypica -0.0044689 0.0171297 -0.261 0.794241
## Variety.impute.GmodeYellow Bourbon 0.1660706 0.0416059 3.992 7.10e-05
##
## (Intercept) ***
## Flavor ***
## Aftertaste ***
## Acidity ***
## Body ***
## Balance ***
## Clean.Cup10 *
## unit_of_measurementm **
## Variety.impute.GmodeBourbon-Typica cross
## Variety.impute.GmodeCaturra
## Variety.impute.GmodeEthiopian Landrace
## Variety.impute.GmodeHawaiian Kona
## Variety.impute.GmodeIntrogressed
## Variety.impute.GmodeTypica
## Variety.impute.GmodeYellow Bourbon ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1835 on 893 degrees of freedom
## Multiple R-squared: 0.7861, Adjusted R-squared: 0.7827
## F-statistic: 234.4 on 14 and 893 DF, p-value: < 2.2e-16
coffees_variables_backward_1 <-
coffees_variables_all[-which(coffees_variables_all %in%
coffees_lm_backward_1$removed)]
coffees_variables_backward_2 <-
coffees_variables_all[-which(coffees_variables_all %in%
coffees_lm_backward_2$removed)]
Residuals:
par(mfrow=c(2,2))
hist(coffees_lm1$residuals, breaks = 30, main = "Coffees_lm1")
hist(coffees_lm2$residuals, breaks = 30, main = "Coffees_lm2")
hist(coffees_lm_backward_1$model$residuals, breaks = 30, main = "Coffees_backward_lm1")
hist(coffees_lm_backward_2$model$residuals, breaks = 30, main = "Coffees_backward_lm2")
Model stats:
lm_forcasts <-
data.frame(
regr_metrics_lm1 = predict(coffees_lm1),
regr_metrics_lm2 = predict(coffees_lm2),
regr_metrics_backward1 = predict(coffees_lm_backward_1$model),
regr_metrics_backward2 = predict(coffees_lm_backward_2$model)
)
sapply(lm_forcasts,
function(x) regressionMetrics(real = coffees_train$Cupper.Points,
predicted = x))
## regr_metrics_lm1 regr_metrics_lm2 regr_metrics_backward1
## MSE 0.0302545 0.03261977 0.03064914
## RMSE 0.1739382 0.1806095 0.175069
## MAE 0.1225276 0.1281175 0.1232967
## MedAE 0.09199434 0.09354114 0.09097715
## MSLE 0.0004112481 0.000443927 0.0004163918
## R2 0.804655 0.789383 0.8021069
## regr_metrics_backward2
## MSE 0.03313119
## RMSE 0.1820198
## MAE 0.128165
## MedAE 0.09358055
## MSLE 0.0004510991
## R2 0.786081
Now I create controls for cross validation:
ctrl_cv2 <- trainControl(method = "cv",
number = 2)
ctrl_cv10 <- trainControl(method = "cv",
number = 10)
I estimate the General Linear Model with all variables and without Region and Country variables:
set.seed(987654321)
coffees_glm_1_cv2 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)),
method = "glm",
metric = "RMSE",
trControl = ctrl_cv2)
coffees_glm_1_cv10 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)),
method = "glm",
metric = "RMSE",
trControl = ctrl_cv10)
coffees_glm_2_cv2 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all2)),
method = "glm",
metric = "RMSE",
trControl = ctrl_cv2)
coffees_glm_2_cv10 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all2)),
method = "glm",
metric = "RMSE",
trControl = ctrl_cv10)
set.seed(987654321)
coffees_knn_1_cv2 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)),
method = "knn",
metric = "RMSE",
trControl = ctrl_cv2)
coffees_knn_1_cv10 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)),
method = "knn",
metric = "RMSE",
trControl = ctrl_cv10)
coffees_knn_2_cv2 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all2)),
method = "knn",
metric = "RMSE",
trControl = ctrl_cv2)
coffees_knn_2_cv10 <-
train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all2)),
method = "knn",
metric = "RMSE",
trControl = ctrl_cv10)
parameters_lasso <- expand.grid(alpha = 1,
lambda = seq(10, 1e5, 100))
set.seed(123456789)
coffees_lasso <- train(Cupper.Points ~ .,
data = coffees_train %>%
dplyr::select(all_of(coffees_variables_all)),
method = "glmnet",
tuneGrid = parameters_lasso,
trControl = ctrl_cv10)
Models comparison:
models_forcasts <-
data.frame(
GLM1 = predict(coffees_glm_1_cv10,coffees_test),
GLM2 = predict(coffees_glm_2_cv10,coffees_test),
KNN1 = predict(coffees_knn_1_cv10,coffees_test),
KNN2 = predict(coffees_knn_2_cv10,coffees_test),
LASSO = predict(coffees_lasso,coffees_test)
)
sapply(models_forcasts,
function(x) regressionMetrics(real = coffees_test$Cupper.Points,
predicted = x))
## GLM1 GLM2 KNN1 KNN2 LASSO
## MSE 0.07732528 0.07501572 0.1397743 0.1385196 0.1778125
## RMSE 0.2780742 0.27389 0.373864 0.3721822 0.4216782
## MAE 0.1432694 0.1442826 0.2659763 0.2628268 0.2997219
## MedAE 0.1014087 0.1025869 0.2071429 0.2 0.2418943
## MSLE 0.0009594597 0.0009285225 0.00186801 0.001846849 0.00240981
## R2 0.5649241 0.577919 0.2135505 0.22061 -0.0004741536
resample_results <- resamples(list(GLM1=coffees_glm_1_cv10,
GLM2=coffees_glm_2_cv10,
KNN1=coffees_knn_1_cv10,
KNN2=coffees_knn_2_cv10,
LASSO=coffees_lasso
))
bwplot(resample_results , metric = c("RMSE","MAE"))
It looks, that the best models are linear models. They have lower values of MAE, RMSE and MSE.